In [2]:
import pandas as pd
import numpy as np
import sweetviz as sv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df=pd.read_csv("ab_data.csv")

In [4]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [5]:
# Generate the report
report = sv.analyze(df)

# Display the report
report.show_html('sweetviz_report.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


User_id-> There are no missing values but there are ~1% duplicates. 
    Remove duplicates
Converted is imbalanced. We need to look at Proportions.
Groups are equally distributed with a 50-50 split. Treatment and control groups size is same

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [7]:
df[df["user_id"].duplicated()]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
2656,698120,2017-01-15 17:13:42.602796,control,old_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0
7500,899953,2017-01-07 03:06:54.068237,control,new_page,0
8036,790934,2017-01-19 08:32:20.329057,treatment,new_page,0
10218,633793,2017-01-17 00:16:00.746561,treatment,old_page,0
...,...,...,...,...,...
294308,905197,2017-01-03 06:56:47.488231,treatment,new_page,0
294309,787083,2017-01-17 00:15:20.950723,control,old_page,0
294328,641570,2017-01-09 21:59:27.695711,control,old_page,0
294331,689637,2017-01-13 11:34:28.339532,control,new_page,0


There are 3894 duplicates, which we ned to remove

In [8]:
df[df["user_id"]==698120]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
988,698120,2017-01-22 07:09:37.540970,control,new_page,0
2656,698120,2017-01-15 17:13:42.602796,control,old_page,0


In [9]:
# Check if there is mismatch between group and landing_page
df_mismatch = df[(df["group"]=="treatment")&(df["landing_page"]=="old_page")
                |(df["group"]=="control")&(df["landing_page"]=="new_page")]
n_mismatch = df_mismatch.shape[0]
print(f"mismatched rows:{n_mismatch} " )
print("Percent of mismatched rows:%.2f%%" % (n_mismatch/df.shape[0]*100))

mismatched rows:3893 
Percent of mismatched rows:1.32%


We need to drop this incorrect data.

In [28]:
df2 = df[(df["group"]=="treatment")&(df["landing_page"]=="new_page") |(df["group"]=="control")&(df["landing_page"]=="old_page")] #taking only correct data

In [29]:
df2.drop_duplicates(subset="user_id",inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(subset="user_id",inplace=True)


In [30]:
df2.shape

(290584, 5)

All the duplicated were from incorrectly labeled groups, when we removed incorrect groups we ended up removing duplicates too.

We can also look at absolute numbers to see if there is a difference between two groups. Usually when the split is not equal, we look at proportions like conversion rate, entitlement rates etc. 

T-Test: For comparing means between two groups (small samples or unknown variance).

Z-Test: For comparing means between two groups (large samples or known variance).

Fisher's Exact Test: For small samples in a 2x2 contingency table. (categorical variables)

Chi-Square Test: For larger samples with categorical data in contingency tables.

Deciding which test to use.
This dataset is large and has categoritcal variable - converted(1) or not converted(0), so Chi-square test makes sense. 

Null Hypothesis: There is no difference between means of control and treament groups of 'converted' class

Alternative hypothesis: There is difference between  mean of control and treament groups of 'converted' class

In [31]:
#Check if the data is normally distributed. We can use Shapiro- Wilk test to find it out. 
df2.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [32]:
def Fun_contingence(df,group_col,metric,is_rate=False):
    """
    Parameters
    ----------
        df: pd.DataFrame
            DataFrame to perform AB testing on
        group_col : String
            The column name of the group
        metric: String
            The column name of the metric
        is_rate: bool
            Conditional to do rates over counts
    """
    groups = df[group_col].unique()
    group_count_columns = [f'{group}_count' for group in groups]
    if is_rate:
        group_count_columns = [f'{group}_rate' for group in groups]
    
    chi_df = pd.DataFrame(columns=group_count_columns)
    for idx, group in enumerate(groups):
        print(df[df[group_col] == group][metric].sum())
        chi_df[group_count_columns[idx]] = df[df[group_col] == group][metric].value_counts()
        print(chi_df[group_count_columns[idx]])
    return chi_df

chi_df = Fun_contingence(df2, 'group', 'converted')

17489
converted
0    127785
1     17489
Name: control_count, dtype: int64
17264
converted
0    128046
1     17264
Name: treatment_count, dtype: int64


In [33]:
# Perform Chi-Square Test of Independence
from scipy.stats import chi2_contingency
chi2_stat, p_val, dof, ex = chi2_contingency(chi_df) 

In [34]:
chi2_stat, p_val, dof, ex 

(1.7035660051885055,
 0.1918222809623566,
 1,
 array([[127899.65274757, 127931.34725243],
        [ 17374.34725243,  17378.65274757]]))

chi2_stat is 1.7

p_value is 0.19 which is >0.05, so we fail to reject the null hypothesis. Hence there is no difference between conversion rates of control and treatment groups. 

Expected Frequency (ex) : These are the frequencies we would expect if there were no association between the groups and the conversion rates.