In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import plotly.graph_objects as go

import scipy.stats as scs
import statsmodels.stats.api as sms
from math import ceil

In [3]:
#get data
df = pd.read_csv('ab.csv', parse_dates=True)
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0


In [4]:
#to know data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'], infer_datetime_format = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   user_id       294478 non-null  int64         
 1   timestamp     294478 non-null  datetime64[ns]
 2   group         294478 non-null  object        
 3   landing_page  294478 non-null  object        
 4   converted     294478 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 11.2+ MB


In [7]:
pd.crosstab(df['group'], df['landing_page'])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


In [None]:
# Formulasi Hipotesis
H0 = new page tidak bisa menaikan jumlah cvr
H1 = new page bisa menaikan jumlah cvr

In [10]:
#statikal deskriptif
df_stats = df.groupby('group')['converted']

#std sample
std_p = lambda x: np.std(x, ddof=0)
#std erorr
std_er = lambda x: scs.sem(x, ddof=0)

df_stats = df_stats.agg([np.mean, std_p, std_er])
df_stats.columns = ['conversion_rate', 'std_dev', 'std_eror']

df_stats

Unnamed: 0_level_0,conversion_rate,std_dev,std_eror
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,0.120399,0.325428,0.000848
treatment,0.11892,0.323694,0.000843


In [17]:
#mencari cvr

cvr_control = (df_stats['conversion_rate'][0])*100
cvr_treatment = (df_stats['conversion_rate'][1])*100
cvr_diff = cvr_control-cvr_treatment

In [19]:
print(f' Conversion rate control grup {cvr_control:.2f}%, cvr treatment grup {cvr_treatment:.2f}% perbedaannya {cvr_diff:.2f}')

 Conversion rate control grup 12.04%, cvr treatment grup 11.89% perbedaannya 0.15


In [20]:
#Testing Hipotesis

from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [21]:
control_results = df[df['group'] == 'control']['converted']
treatment_results = df[df['group'] == 'treatment']['converted']
n_con = control_results.count()
n_treat = treatment_results.count()

successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'confidence interval 95% for control group: [{lower_con*100:.2f}%, {upper_con*100:.2f}%]')
print(f'confidence interval 95% for treatment group: [{lower_treat*100:.2f}%, {upper_treat*100:.2f}%]')


z statistic: 1.24
p-value: 0.216
confidence interval 95% for control group: [11.87%, 12.21%]
confidence interval 95% for treatment group: [11.73%, 12.06%]


In [22]:
# Kesimpulan

def print_conclusion(pval, alpha=0.05):
    if pval>alpha:
        print(f' p-value={pval:.3f} lebih besar dari alpha ={alpha:.3f}, hipotesis null tidak ditolak.')
    else:
        print(f'p-value={pval:.3f} lebih kecil dari alpha ={alpha:.3f}, hipotesis null ditolak.')


In [23]:
print_conclusion(pval, alpha=0.05)

 p-value=0.216 lebih besar dari alpha =0.050, hipotesis null tidak ditolak.
