In [4]:
# DATASET TAKEN FROM https://www.kaggle.com/shweta112/a-b-testing-analysis

# Imported in this cell because it sits above the notebook's import cells;
# without it a top-to-bottom "Restart & Run All" stops here with a NameError
# on `pd`.
import pandas as pd

# Read the A/B test CSV into `df` and preview the first rows.
df = pd.read_csv('/kaggle/input/ab-testing/ab_data.csv')
df.head()

In [3]:
import pandas as pd
import numpy as np

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# Preview the first rows of the loaded frame.
df.head()

In [6]:
# Summarise the frame: columns, dtypes and non-null counts.
df.info()
# 294477 entries are present (next: check for duplicates)

In [7]:
# (rows, columns) of the raw frame.
df.shape

In [8]:
# Total number of cells in the frame (rows x columns).
df.size

In [9]:
# Cross-tabulate `group` against `landing_page` to see how many rows fall in
# each combination of the two columns.
pd.crosstab(df['group'],df['landing_page'])


Check for duplicate values

# Remove duplicate users from the dataset before moving on to sampling

In [10]:
# Show the distinct user_id values as an array.
df['user_id'].unique()
# unique values are displayed here

In [11]:
# Number of rows per user_id; value_counts sorts most frequent first by
# default, so the top of the series shows the users that appear most often.
session_count = df['user_id'].value_counts()
print(session_count.iloc[:20])

In [12]:

# Count the user_ids whose row count is greater than one.
appears_more_than_once = session_count > 1
multi_users = session_count[appears_more_than_once].count()

print(f'There are {multi_users} users that appear multiple times in the dataset')

In [13]:
# 3894 users appear more than once; drop every row belonging to those users
# before sampling, keeping the result in `df1` so `df` stays untouched.
repeated = session_count > 1
users_to_drop = session_count[repeated].index

keep_rows = ~df['user_id'].isin(users_to_drop)
df1 = df.loc[keep_rows]

print(f'The updated dataset now has {df1.shape[0]} entries')



Now that the repeated users have been removed from the dataset, let's move on to sampling.
We will use simple random sampling.

In [16]:
# Draw `required_n` rows at random from each group, with a fixed seed so the
# draw is repeatable. `required_n` is produced by the power-analysis cell
# (In [15]) that appears further down in this file, so that cell has to be run
# before this one.
control_pool = df1.loc[df1['group'] == 'control']
treatment_pool = df1.loc[df1['group'] == 'treatment']

control_sample = control_pool.sample(n=required_n, random_state=22)
treatment_sample = treatment_pool.sample(n=required_n, random_state=22)


In [17]:
# Peek at the first rows of the sampled control group.
control_sample.head()

In [18]:
# Stack the two samples (control rows first, then treatment) and renumber the
# rows from zero so the index no longer carries the original row labels.
stacked_samples = pd.concat([control_sample, treatment_sample], axis=0)
ab_test = stacked_samples.reset_index(drop=True)

In [19]:
# Display the combined sample (control rows followed by treatment rows).
ab_test

Step 1- Design hypothesis

In [14]:
# Packages imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

%matplotlib inline

# Some plot styling preferences
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever spelling this
# installation provides (falling back to the default style if neither exists)
# instead of crashing on a hard-coded name.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)

# Global font settings applied to every figure created after this cell.
font = {'family' : 'Helvetica',
        'weight' : 'bold',
        'size'   : 14}

mpl.rc('font', **font)

In [15]:
# Effect size for moving from a 13% to a 15% conversion rate.
effect_size = sms.proportion_effectsize(0.13, 0.15)

# Solve for the sample size needed per group at 80% power and a 5%
# significance level with equally sized groups (ratio=1), then round up to
# the next whole number.
power_analysis = sms.NormalIndPower()
required_n = ceil(
    power_analysis.solve_power(
        effect_size=effect_size,
        alpha=0.05,
        power=0.8,
        ratio=1,
    )
)

print(required_n)

3. Visualising the results


Calculate the conversion rates and visualize them for both the control and treatment variants

In [20]:
# Group the `converted` column by experiment group; at this point
# `conversion_rates` is a groupby object, not yet a table of statistics.
conversion_rates=ab_test.groupby('group')['converted']

# Defined with `def` rather than as lambdas bound to names: named functions
# carry their own __name__ ('std_p' / 'se_p') instead of both being
# '<lambda>', which keeps them distinguishable when passed together to .agg().
def std_p(x):
    """Std. deviation of the proportion: ``np.std`` of ``x`` with ddof=0."""
    return np.std(x, ddof=0)


def se_p(x):
    """Std. error of the proportion: ``stats.sem`` of ``x`` with ddof=0."""
    return stats.sem(x, ddof=0)



In [21]:
# Aggregate straight from `ab_test` rather than re-binding the groupby object
# stored in `conversion_rates`: the cell then gives the same table every time
# it is run, whereas before a second run would have operated on the
# already-aggregated DataFrame instead of the grouped data.
# The mean is requested by name ('mean') because passing `np.mean` to a
# groupby .agg() is deprecated in recent pandas releases.
conversion_rates = ab_test.groupby('group')['converted'].agg(['mean', std_p, se_p])
conversion_rates.columns = ['conversion_rate', 'std_deviation', 'std_error']

# Show the three statistics per group rounded to three decimals.
conversion_rates.style.format('{:.3f}')



Judging by the stats above, it does look like our two designs performed very similarly, with our new design performing slightly better, approx. 12.3% vs. 12.6% conversion rate.

In [27]:
# Bar chart of the mean of `converted` (i.e. the conversion rate) per group.
plt.figure(figsize=(8,6))
# ci=None is the documented way to turn the error bars off. The previous
# ci=False is not a documented value and appears to be handled as a numeric
# confidence level rather than as "off" - NOTE(review): on seaborn >= 0.12 the
# non-deprecated spelling is errorbar=None.
sns.barplot(x=ab_test['group'], y=ab_test['converted'], ci=None)

plt.ylim(0, 0.17)
plt.title('Conversion rate by group')
plt.xlabel('Group')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.ylabel('Converted(proportion)');

The conversion rates for our groups are indeed very close. Also note that the conversion rate of the control group is lower than what we would have expected given what we knew about our avg. conversion rate (12.3% vs. 13%). This goes to show that there is some variation in results when sampling from a population.

So... the treatment group's value is higher. Is this difference statistically significant?

4- Test the hypothesis
Since we have a very large sample, we can use the normal approximation for calculating our p-value (i.e. z-test).

In [28]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint


In [29]:
# Pull out the `converted` column separately for each experiment group.
is_control = ab_test['group'] == 'control'
is_treatment = ab_test['group'] == 'treatment'

control_results = ab_test.loc[is_control, 'converted']
treatment_results = ab_test.loc[is_treatment, 'converted']

In [32]:
# Observation counts and conversion counts, control first and treatment second.
n_con, n_treat = control_results.count(), treatment_results.count()
nobs = [n_con, n_treat]
successes = [results.sum() for results in (control_results, treatment_results)]

# Two-sample z-test on the two proportions.
z_stat, pval = proportions_ztest(successes, nobs=nobs)

# proportion_confint returns the lower bounds and the upper bounds, each with
# one entry per group in the same control/treatment order.
ci_lower, ci_upper = proportion_confint(successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = ci_lower
upper_con, upper_treat = ci_upper

print('\n'.join([
    f'z statistic: {z_stat:.2f}',
    f'p-value: {pval:.3f}',
    f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]',
    f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]',
]))


CONCLUSION

# Here the p-value is 0.732, a lot greater than our alpha of 0.05 (5%). Hence we cannot reject the null hypothesis.

Additionally, if we look at the confidence interval for the treatment group ([0.116, 0.135], i.e. 11.6-13.5%) we notice that:

- It includes our baseline value of 13% conversion rate
- It does not include our target value of 15% (the 2-percentage-point uplift we were aiming for)

What this means is that it is more likely that the true conversion rate of the new design is similar to our baseline, rather than the 15% target we had hoped for. This is further evidence that our new design is not likely to be an improvement on our old design, and that unfortunately we are back to the drawing board!