In [1]:
import numpy as np
import pandas as pd

#widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display

#plots
import matplotlib.pyplot as plt

#stats
import scipy as sp
import statsmodels as sm

import statsmodels.stats.api as sms
import scipy.stats as st


#silence warnings on kaggle
import warnings
warnings.filterwarnings("ignore")

In [2]:
path =r'C:\Users\vsekar\Desktop\Kaiser AI Garage AAI_S&A_A&C\Tasks\Dataset\AB_Test\ab_data.csv'

In [3]:
data =pd.read_csv(path)
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['NameofDay'] = ((pd.DatetimeIndex(data.timestamp).day_name()))

In [4]:
data['WEEKDAY'] = ((pd.DatetimeIndex(data.timestamp).dayofweek) // 5 == 1).astype(int)

In [5]:
data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,NameofDay,WEEKDAY
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,Saturday,1
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,Thursday,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,Wednesday,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,Sunday,1
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,Saturday,1


In [6]:
data.shape

(294478, 7)

In [7]:
List =['Wednesday','Saturday']
df_filtered = data[data['NameofDay'].isin(List)]

In [8]:
df =df_filtered.copy()

In [9]:
# Where treatment does not match with new_page or control does not match with old_page
i = df[((df['group']=='treatment') ==(df['WEEKDAY']==1)) == False].index

In [10]:
# Drop those rows
df2 = df.drop(i)

In [11]:
#The number of unique users in the dataset.
df2['user_id'].nunique()

39986

In [12]:
# Number of duplicate rows
df2[df2.duplicated(['user_id'], keep=False)]
# Drop the row and keep the first row
df2.drop_duplicates(subset ='user_id',keep ='first',inplace = True)

In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39986 entries, 11 to 294470
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   user_id       39986 non-null  int64         
 1   timestamp     39986 non-null  datetime64[ns]
 2   group         39986 non-null  object        
 3   landing_page  39986 non-null  object        
 4   converted     39986 non-null  int64         
 5   NameofDay     39986 non-null  object        
 6   WEEKDAY       39986 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(2), object(3)
memory usage: 2.3+ MB


In [14]:
# proportion of users converted
converted = round(df2.query('converted == 1').user_id.nunique() / df2.user_id.nunique()*100, 2 )
print('The number of users converted across two groups {} %'.format(converted))

The number of users converted across two groups 11.94 %


In [15]:
# proportion of users not converted
Not_converted = round(df2.query('converted == 0').user_id.nunique() / df2.user_id.nunique()*100,2)
print('The number of users Not converted across two groups {} %'.format(Not_converted))

The number of users Not converted across two groups 88.06 %


In [16]:
df3 = df2[['user_id', 'group', 'WEEKDAY','converted']]

In [17]:
df_sample =df3.sample(n=30,axis=0, random_state= 42,replace = True)

In [18]:
df_sample.shape

(30, 4)

In [19]:
df_sample.groupby('group')[['user_id','converted']].agg({'user_id' : 'count', 'converted':['sum']}).reset_index().rename(columns={'sum':'Total_Conversions','count':'Total_Users'})

Unnamed: 0_level_0,group,user_id,converted
Unnamed: 0_level_1,Unnamed: 1_level_1,Total_Users,Total_Conversions
0,control,15,1
1,treatment,15,2


In [20]:
#Show the % split between users who saw new vs old page
#Calculate pooled probability
mask = (df_sample["group"] == 'control')
conversions_control = df_sample["converted"][mask].sum()
total_users_control = df_sample["converted"][mask].count()

mask100 = (df_sample["group"] == 'treatment')
conversions_treatment = df_sample["converted"][mask100].sum()
total_users_treatment = df_sample["converted"][mask100].count()

In [21]:
#control
conversions_control, total_users_control

(1, 15)

In [22]:
#treatment
conversions_treatment,total_users_treatment

(2, 15)

In [23]:
print("Split of USERS in control  who got call in weedend vs Experiment group who got call in weekdays: ", 
          round(total_users_control / df2["converted"].count() * 100, 2), "% ",
          round((total_users_treatment / df2["converted"].count()) * 100, 2), "%")

Split of USERS in control  who got call in weedend vs Experiment group who got call in weekdays:  0.04 %  0.04 %


In [24]:
#count number of users who converted in each group
print("Number of control users who converted on old page: ", conversions_control)
print("Percentage of control users who converted: ", round((conversions_control / total_users_control) * 100, 2), "%")


print("Number of treatment users who converted on new page: ", conversions_treatment)
print("Percentage of treatment users who converted: ", round((conversions_treatment/ total_users_treatment) * 100, 2), "%")

print('The total number of users in control group',total_users_control)
print('The total number of users in tratement group',total_users_treatment)
print("Difference between the users in both group is",(-total_users_control+total_users_treatment))

Number of control users who converted on old page:  1
Percentage of control users who converted:  6.67 %
Number of treatment users who converted on new page:  2
Percentage of treatment users who converted:  13.33 %
The total number of users in control group 15
The total number of users in tratement group 15
Difference between the users in both group is 0


In [25]:
control_results = df_sample[df_sample['WEEKDAY'] == 1]['converted']
treatment_results = df_sample[df_sample['WEEKDAY'] == 0]['converted']
n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

In [26]:
successes

[2, 1]

In [27]:
nobs

[15, 15]

In [28]:
import scipy.stats as ss 

In [29]:
t_stat, p_val= ss.ttest_ind(successes,nobs)
t_stat , p_val

(-27.0, 0.001368926035332657)

In [30]:
if p_val<0.05:
    print('The experiment has made an imp.Experiment performed better than control. It is statistically and practically significant')
else:
    print('It is not significant.Control is better than the Experiment. Hence, No need to make changes')  

The experiment has made an imp.Experiment performed better than control. It is statistically and practically significant
