## セレクションバイアスとRCT

In [26]:
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [4]:
csv_url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'
data = pd.read_csv(csv_url)

In [8]:
data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [9]:
male_df = data.query('segment != "Womens E-Mail"')

In [13]:
male_df['treatment'] = male_df.segment.apply(lambda x: 1 if x=='Mens E-Mail' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
male_df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0


In [50]:
mg = male_df.groupby(['treatment']).agg({'conversion': 'mean', 'spend': 'mean', 'treatment': 'count'})
mg

Unnamed: 0_level_0,conversion,spend,treatment
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.005726,0.652789,21306
1,0.012531,1.422617,21307


In [51]:
mg.diff()

Unnamed: 0_level_0,conversion,spend,treatment
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,
1,0.006805,0.769827,1.0


In [24]:
# 男性向けメールが配信されたグループと配信されていないグループの購買データを取得
mens_mail = male_df[male_df.treatment == 1][['spend']]
no_mail = male_df[male_df.treatment == 0][['spend']]

In [25]:
no_mail

Unnamed: 0,spend
1,0.0
14,0.0
15,0.0
20,0.0
23,0.0
...,...
63980,0.0
63981,0.0
63983,0.0
63987,0.0


In [34]:
stats.ttest_ind(mens_mail, no_mail)

Ttest_indResult(statistic=array([5.30009029]), pvalue=array([1.16320087e-07]))

P値が非常に低いため統計的に有意なものとされる

## バイアスのあるデータによる効果の検証

In [37]:
from sklearn.model_selection import train_test_split

In [35]:
biased_data = male_df.query('history > 300 or recency < 6 or channel == "Multichannel"')

In [53]:
biased_mg = biased_data.groupby(['treatment']).agg({'conversion': 'mean', 'spend': 'mean', 'treatment': 'count'})
biased_mg

Unnamed: 0_level_0,conversion,spend,treatment
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.007023,0.709173,13099
1,0.014505,1.683761,13030


In [55]:
# conversionとspendの差が大きく開いている
biased_mg.diff()

Unnamed: 0_level_0,conversion,spend,treatment
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,
1,0.007482,0.974587,-69.0


In [44]:
mens_mail_biased = biased_data[biased_data.treatment == 1][['spend']]
no_mail_biased = biased_data[biased_data.treatment == 0][['spend']]


In [45]:
stats.ttest_ind(mens_mail_biased, no_mail_biased)

Ttest_indResult(statistic=array([4.91754115]), pvalue=array([8.8169638e-07]))