In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm

%matplotlib inline

In [2]:
# Load the raw data. The file is read with header=None, so the column names
# are assigned explicitly on the next line.
data = pd.read_csv('data/effect_tb.csv',header = None)
data.columns = ["dt","user_id","label","dmp_id"]

# dmp_id: marketing solution id
#   1 = control group, 2 = marketing solution one, 3 = marketing solution two
# label: rows with label == 1 are counted as clicks later in this notebook

# Drop dt: it is not used anywhere in this analysis.
data = data.drop(columns = "dt")
data.head(3)

Unnamed: 0,user_id,label,dmp_id
0,1,0,1
1,1000004,0,1
2,1000004,0,2


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of each column
data.describe()

Unnamed: 0,user_id,label,dmp_id
count,2645958.0,2645958.0,2645958.0
mean,3112995.0,0.01456297,1.395761
std,1828262.0,0.1197952,0.692048
min,1.0,0.0,1.0
25%,1526772.0,0.0,1.0
50%,3062184.0,0.0,1.0
75%,4721132.0,0.0,2.0
max,6265402.0,1.0,3.0


In [4]:
# Shape of the data: (number of rows, number of columns)
data.shape

(2645958, 3)

In [5]:
# Number of distinct values in each column
data.nunique()

user_id    2410683
label            2
dmp_id           3
dtype: int64

In [6]:
# Show every row that has an exact duplicate (all copies kept via keep=False),
# ordered by user_id so the copies are easy to compare.
dup_mask = data.duplicated(keep=False)
data.loc[dup_mask].sort_values(by=["user_id"])

Unnamed: 0,user_id,label,dmp_id
8529,1027,0,1
1485546,1027,0,1
1579415,1471,0,1
127827,1471,0,1
404862,2468,0,1
...,...,...,...
1382121,6264633,0,1
1382245,6264940,0,1
2575140,6264940,0,1
1382306,6265082,0,3


In [7]:
# Remove fully duplicated rows; data is rebound to the de-duplicated frame.
data = data.drop_duplicates()
# Verify: this selection should now be empty (no duplicated rows remain).
data[data.duplicated(keep = False)]

Unnamed: 0,user_id,label,dmp_id


In [8]:
# Check for null values via the per-column non-null counts.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0. It is passed explicitly so that the
# counts are shown regardless of the frame's size.
data.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2632975 entries, 0 to 2645957
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   user_id  2632975 non-null  int64
 1   label    2632975 non-null  int64
 2   dmp_id   2632975 non-null  int64
dtypes: int64(3)
memory usage: 80.4 MB


In [9]:
# Count the user_id rows for every (dmp_id, label) combination; margins = True
# adds the "All" row/column totals. The table is inspected for NaN cells.
data.pivot_table(index = "dmp_id", columns = "label", values = "user_id",
                aggfunc = "count", margins = True)

label,0,1,All
dmp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1881745,23918,1905663
2,404811,6296,411107
3,307923,8282,316205
All,2594479,38496,2632975


In [10]:
# Data type of each column
data.dtypes

user_id    int64
label      int64
dmp_id     int64
dtype: object

In [11]:
# Click rate of the control group (dmp_id == 1): mean of the 0/1 label column
data.loc[data["dmp_id"] == 1, "label"].mean()

0.012551012429794775

In [12]:
# Sample size (number of rows) for each dmp_id group
data["dmp_id"].value_counts()

1    1905663
2     411107
3     316205
Name: dmp_id, dtype: int64

In [13]:
# Save the cleaned data to a CSV file; index = False keeps the row index out of the file.
data.to_csv("data/output.csv", index = False)

In [14]:
# Reload the cleaned data from the CSV file written in the previous step
data = pd.read_csv("data/output.csv")

In [15]:
# Click rate (mean of label) for each experiment group, printed one per line
for group_label, group_id in (
    ("control group:", 1),
    ("marketing strategy one:", 2),
    ("marketing strategy two:", 3),
):
    print(group_label, data.loc[data["dmp_id"] == group_id, "label"].mean())

control group: 0.012551012429794775
marketing strategy one: 0.015314747742072015
marketing strategy two: 0.026191869198779274


In [None]:
# Let p1 be the click rate of the control group and p2 the click rate of marketing strategy two.
# H0: p1 >= p2
# H1: p1 < p2
# Because (1) both sample sizes are far greater than 30 and (2) the observations are assumed
# to be independent, a one-sided (left-tailed) two-proportion z-test is used.

In [16]:
# Number of users (rows) in each of the two groups being compared
user_control = int((data["dmp_id"] == 1).sum())  # control group
user_two = int((data["dmp_id"] == 3).sum())  # marketing solution two

In [17]:
# Number of clicks (rows with label == 1) in each group.
# Both conditions are combined into a single boolean mask with `&`. Chaining
# two selections (data[mask_a][mask_b]) applies a mask built on the full frame
# to an already-filtered frame, which makes pandas emit the
# "Boolean Series key will be reindexed to match DataFrame index" UserWarning.
click_control = int(((data.dmp_id == 1) & (data.label == 1)).sum())  # control group
click_two = int(((data.dmp_id == 3) & (data.label == 1)).sum())  # marketing solution two

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# Click rate of each group = number of clicks / number of users
rate_control = click_control/user_control  # control group
rate_two = click_two/user_two  # marketing strategy two

In [19]:
# Pooled click rate of the two groups combined: total clicks / total users.
# It feeds the standard-error term of the z-score computed further down.
rate_sum = (click_control+click_two)/(user_control+user_two)

In [23]:
# z-score of the two-proportion test:
#   z = (p1 - p2) / sqrt(p * (1 - p) * (1/n1 + 1/n2)),  with p the pooled click rate
pooled_variance = rate_sum * (1 - rate_sum)
standard_error = np.sqrt(pooled_variance * (1/user_control + 1/user_two))
z_score = (rate_control - rate_two) / standard_error
z_score

-59.44168632985996

In [22]:
# Critical value of the left-tailed z-test: the ALPHA quantile of the standard
# normal distribution. H0 is rejected when z_score falls below this value.
# (`norm` is imported with the other imports at the top of the notebook.)
ALPHA = 0.05  # significance level
z_alpha = norm.ppf(ALPHA)
z_alpha

-1.6448536269514729

In [None]:
# Decision rule (left-tailed test, significance level 0.05): reject H0 if z_score < z_alpha.
# Here z_score = -59.4417 < z_alpha = -1.6449, so the null hypothesis is rejected.
# Conclusion: the click rate of marketing strategy two is significantly greater than that of the control group.