In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
%matplotlib inline
# Seed the standard-library RNG so any later use of `random` is reproducible.
# NOTE(review): this does not seed NumPy's generator; seed NumPy separately
# if NumPy-based randomness is used later in the notebook.
random.seed(42)

In [2]:
# Load the A/B test data from CSV and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="../data/input/ab_data.csv")
df.head(n=10)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
7,719014,2017-01-17 01:48:29.539573,control,old_page,0
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1


In [3]:
# Dimensions of the raw data as (rows, columns).
df.shape

(294478, 5)

In [4]:
# Count the distinct user IDs in the raw data.
df["user_id"].nunique()

290584

In [5]:
# Mean of the `converted` column over the raw, uncleaned data.
df["converted"].mean()

0.11965919355605512

In [6]:
# Count rows whose group and landing page disagree, in either direction:
# treatment rows that did not get new_page, and non-treatment rows that did.
# .count() reports the non-null count per column for those rows.
df[(df["group"] == "treatment") != (df["landing_page"] == "new_page")].count()

user_id         3893
timestamp       3893
group           3893
landing_page    3893
converted       3893
dtype: int64

In [7]:
# Check for missing values: info() prints each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [8]:
# Index labels of the inconsistent rows: treatment paired with anything other
# than new_page, or a non-treatment group paired with new_page.
i = df[(df["group"] == "treatment") != (df["landing_page"] == "new_page")].index
i

Int64Index([    22,    240,    308,    327,    357,    490,    685,    713,
               776,    846,
            ...
            293817, 293888, 293894, 293917, 293996, 294014, 294200, 294252,
            294253, 294331],
           dtype='int64', length=3893)

In [9]:
# Drop the inconsistent rows (labels in `i`), then confirm that no
# group/landing-page mismatch remains in the cleaned frame (expected: 0).
df2 = df.drop(i)
df2[(df2["group"] == "treatment") != (df2["landing_page"] == "new_page")].shape[0]

0

In [10]:
# Count the distinct user IDs remaining after the inconsistent rows were dropped.
df2["user_id"].nunique()

290584

In [11]:
# Show every row whose user_id occurs more than once
# (keep=False flags all occurrences, not just the repeats).
df2[df2["user_id"].duplicated(keep=False)]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [12]:
# Remove repeated user_ids, keeping each user's first row.
# Reassign rather than use inplace=True: the result is the same, but the
# step stays an ordinary expression and avoids the discouraged in-place API.
df2 = df2.drop_duplicates(subset="user_id", keep="first")

In [13]:
# Fraction of rows in the cleaned data with converted == 1.
(df2["converted"] == 1).mean()

0.11959708724499628

In [14]:
# Conversion rate of the control group.
control_df = df2[df2["group"] == "control"]
Pold = control_df["converted"].mean()
Pold

0.1203863045004612

In [15]:
# Conversion rate of the treatment group.
treatment_df = df2[df2["group"] == "treatment"]
Pnew = treatment_df["converted"].mean()
Pnew

0.11880806551510564

In [16]:
# Fraction of rows in the cleaned data whose landing page is new_page.
(df2["landing_page"] == "new_page").mean()

0.5000619442226688

In [17]:
# Contingency table: rows are the experiment group, columns are the
# `converted` value, and cells hold row counts.
crossed = pd.crosstab(index=df2["group"], columns=df2["converted"])
crossed

converted,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,127785,17489
treatment,128046,17264


In [18]:
# Chi-squared ($\chi^2$) test of independence between group and conversion.
# chi2_contingency is imported in the notebook's import cell. With its default
# correction=True, SciPy applies Yates' continuity correction when the table
# has one degree of freedom, as a 2x2 table does.
x2, p, dof, expected = chi2_contingency(crossed)

In [19]:
# Report the test statistic, p-value, degrees of freedom and the expected
# frequencies, one per line.
print(
    "$Chi^2$-値: {}".format(x2),
    "確率: {}".format(p),
    "自由度: {}".format(dof),
    "expected: {}".format(expected),
    sep="\n",
)

$Chi^2$-値: 1.7035660051885058
確率: 0.19182228096235662
自由度: 1
expected: [[127899.65274757  17374.34725243]
 [127931.34725243  17378.65274757]]


In [20]:
# Compare the p-value with the conventional 5% significance level and print
# whether the difference between the groups is statistically significant.
print("有意差があります" if p < 0.05 else "有意差がありません")

有意差がありません
