In [2]:
import math

import numpy as np
import pandas as pd
from scipy import stats

In [3]:
np.random.seed(1)

In [4]:
df = pd.read_csv("Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv")
df.shape

(64000, 12)

In [5]:
df.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0


In [6]:
obs_rate_c = 0.5
obs_rate_t = 0.5

In [7]:
mail_df = df[df.segment != "Womens E-Mail"].reset_index(drop=True)
mail_df.shape

(42613, 12)

In [8]:
mail_df["treatment"] = (mail_df.segment == "Mens E-Mail").astype(int)
mail_df.treatment.head(3)

0    0
1    1
2    1
Name: treatment, dtype: int64

In [9]:
mail_df.groupby("treatment").conversion.describe().T

treatment,0,1
count,21306.0,21307.0
mean,0.005726,0.012531
std,0.075456,0.111241
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


In [18]:
mail_df["obs_rate_c"] = mail_df.apply(lambda x: obs_rate_c if x.history > 300 and x.recency < 6 and x.channel == "Multichannel" else 1, axis=1)
mail_df["obs_rate_t"] = mail_df.apply(lambda x: 1 if x.history > 300 and x.recency < 6 and x.channel == "Multichannel" else obs_rate_t, axis=1)
mail_df["random_number"] = np.random.rand(mail_df.shape[0])

In [24]:
bias_data = mail_df[
    ((mail_df.treatment==0)&(mail_df.random_number < obs_rate_c))|((mail_df.treatment==1)&(mail_df.random_number < obs_rate_t))
].drop("random_number", axis=1).reset_index(drop=True)

In [26]:
bias_data.groupby("treatment").conversion.describe().T

treatment,0,1
count,10590.0,10681.0
mean,0.005571,0.012078
std,0.074436,0.109237
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


In [27]:
mens_mails = bias_data[bias_data.treatment == 1]
no_mails = bias_data[bias_data.treatment == 0]

In [28]:
y0_ = no_mails.conversion.mean()
y1_ = mens_mails.conversion.mean()
y0 = (no_mails.conversion - y0_).apply(lambda x: x*x).sum()
y1 = (mens_mails.conversion - y1_).apply(lambda x: x*x).sum()

v = (y0 + y1) / (len(no_mails)+len(mens_mails)-2)
se = math.sqrt(v/len(no_mails) + v/len(mens_mails))
t = (y1_ - y0_) / se
print(f"y0: {y0}, y0_{y0_}")
print(f"y1: {y1}, y1_{y1_}")
print(f"v: {v}, se: {se}, t: {t}")

y0: 58.67129367327668, y0_0.005571293673276676
y1: 127.44199981275162, y1_0.012077520831382828
v: 0.008750448704030668, se: 0.0012827888543138074, t: 5.071939264382274


In [30]:
stats.ttest_ind(no_mails.conversion, mens_mails.conversion, equal_var=True)

Ttest_indResult(statistic=-5.071939264382274, pvalue=3.9709089672482085e-07)