In [3]:
import math as mt

In [2]:
import numpy as np

In [4]:
import pandas as pd
from scipy.stats import norm

In [5]:
baseline = {"Cookies":40000,"Clicks":3200,"Enrollments":660,"CTP":0.08,"GConversion":0.20625,
           "Retention":0.53,"NConversion":0.109313}

In [9]:
# we need to scale our collected counts estimates of metrics with the sample size 
baseline["Cookies"] = 5000

In [8]:
baseline["Clicks"] = baseline["Clicks"] * (5000 / 40000)

In [11]:
baseline["Enrollments"] = baseline["Enrollments"] * (5000 / 40000)

In [12]:
baseline

{'Cookies': 5000,
 'Clicks': 400.0,
 'Enrollments': 82.5,
 'CTP': 0.08,
 'GConversion': 0.20625,
 'Retention': 0.53,
 'NConversion': 0.109313,
 'cookies': 5000}

In [16]:
# To get the p and n needed for Gross Conversion (GC)
# and compute the Stansard Deviation(sd) rounded to 5 decimal digits.
GC = {}
GC["d_min"] = 0.01
GC["p"] = baseline["GConversion"]

GC["n"] = baseline["Clicks"]
GC["sd"] = round(mt.sqrt((GC["p"]*(1-GC["p"]))/GC["n"]),5)
GC["sd"]

0.02023

In [17]:
# To get the p and n we needed for Retention(R)
# and compute the Stansard Deviation(sd) rounded to 5 decimal digits.
R = {}
R["d_min"] = 0.01
R["p"] = baseline["Retention"]
R["n"] = baseline["Enrollments"]
R["sd"] = round(mt.sqrt((R["p"]*(1-R["p"]))/R["n"]),5)
R["sd"]

0.05495

In [18]:
# To get the p and n we needed for Net Conversion (NC)
# and compute the Standard Deviation (sd) rounded to 5 decimal digits.
NC = {}
NC["d_min"] = 0.0075
NC["p"] = baseline["NConversion"]
NC["n"] = baseline["Clicks"]
NC["sd"] = round(mt.sqrt((NC["p"]*(1-NC["p"]))/NC["n"]),5)
NC["sd"]

0.0156

In [19]:
def get_sds(p,d):
    sd1 = mt.sqrt(2*p*(1-p))
    sd2 = mt.sqrt(p*(1-p)+(p+d)*(1-(p+d)))
    x = [sd1,sd2]
    return x

In [22]:
def get_z_score(alpha):
    return norm.ppf(alpha)

def get_sds(p,d):
    sd1 = mt.sqrt(2*p*(1-p))
    sd2 = mt.sqrt(p*(1-p)+(p+d)*(1-(p+d)))
    sds = [sd1,sd2]
    return sds

def get_sampleSize(sds,alpha,beta,d):
    n = pow((get_z_score(1-alpha/2)*sds[0]+get_z_score(1-beta)*sds[1]),2)/pow(d,2)
    return n

In [23]:
GC["d"] = 0.01
R["d"] = 0.01
NC["d"] = 0.0075

In [26]:
#Gross Conversion
GC["SampleSize"] = round(get_sampleSize(get_sds(GC["p"],GC["d"]),0.05,0.2,GC["d"]))
GC["SampleSize"]

25835.0

In [27]:
#This means we need at least 25,835 cookies who click the Free Trial button - per group! That means that if we got 400 clicks out of 5000 pageviews (400/5000 = 0.08)
GC["SampleSize"] = round(GC["SampleSize"]/0.08*2)
GC["SampleSize"]

645875.0

In [28]:
#Retention
R["SampleSize"] = round(get_sampleSize(get_sds(R["p"],R["d"]),0.05,0.2,R["d"]))
R["SampleSize"]

39087.0

In [31]:
#This means that we need 39,087 users who enrolled per group! We have to first convert this to cookies who clicked, and then to cookies who viewed the page, then finally to multipky by two for both groups.
R["SampleSize"]=round(R["SampleSize"]/0.08/0.20625*2)
R["SampleSize"]

4737818.0

In [32]:
#Net Conversion
NC["SampleSize"] = round(get_sampSize(get_sds(NC["p"],NC["d"]),0.05,0.2,NC["d"]))
NC["SampleSize"]

27413.0

In [34]:
NC["SampleSize"] = NC["SampleSize"]/0.08*2
NC["SampleSize"]

685325.0

In [49]:
control = pd.read_excel(r"C:\Users\User\Downloads\UdacityABtesting.xlsx")
experiment = pd.read_excel(r"C:\Users\User\Downloads\UdacityABtestingE.xlsx")
control.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7723,687,134.0,70.0
1,"Sun, Oct 12",9102,779,147.0,70.0
2,"Mon, Oct 13",10511,909,167.0,95.0
3,"Tue, Oct 14",9871,836,156.0,105.0
4,"Wed, Oct 15",10014,837,163.0,64.0


In [50]:
#Sanity Checks for differences between counts 

pageviews_contrl = pd.DataFrame(control, columns= ['Pageviews']).sum()
pageviews_expr = pd.DataFrame(experiment, columns= ['Pageviews']).sum()

pageviews_total = pageviews_contrl + pageviews_expr
print ("Number of Pageviews in control:", pageviews_contrl)
print ("Number of Pageviews in experiment:", pageviews_expr)


Number of Pageviews in control: Pageviews    345543
dtype: int64
Number of Pageviews in experiment: Pageviews    344660
dtype: int64


In [53]:
p = 0.5
alpha = 0.05
p_hat = round(pageviews_contrl / (pageviews_total),4)
sd = mt.sqrt(p*(1-p) / (pageviews_total))
ME = round(get_z_score(1-(alpha/2))*sd,4)

p_ME = p + ME
p_Me = p - ME

print ("The confidence interval is between",p_Me,"and",p_ME,"; Is",p_hat,"inside this range?")

The confidence interval is between 0.4988 and 0.5012 ; Is Pageviews    0.5006
dtype: float64 inside this range?


In [54]:
# Number of cookies who clicked the Free Trial Button
clicks_contrl = control['Clicks'].sum()
clicks_expr = experiment['Clicks'].sum()
clicks_total = clicks_contrl + clicks_expr

p_hat = round(clicks_contrl / clicks_total,4)
sd = mt.sqrt(p*(1-p) / clicks_total)
ME = round(get_z_score(1-(alpha/2))*sd,4)

print ("The confidence interval is between",p-ME,"and",p+ME,"; Is",p_hat,"inside this range?")

The confidence interval is between 0.4959 and 0.5041 ; Is 0.5005 inside this range?


In [58]:
# Sanity Checks for differences between probabilities
# CTP of the Free Trial Button

ctp_cont = clicks_contrl / pageviews_contrl
ctp_exp = clicks_expr / pageviews_expr
d_hat = round(ctp_exp-ctp_cont,4)
p_pooled = clicks_total / pageviews_total
sd_pooled = mt.sqrt(p_pooled*(1-p_pooled) * (1/pageviews_contrl + 1/pageviews_expr))
ME = round(get_z_score(1-(alpha/2)) * sd_pooled,4)
print ("The confidence interval is between",0-ME,"and",0+ME,"; Is",d_hat,"within this range?")

The confidence interval is between -0.0013 and 0.0013 ; Is Pageviews    0.0001
dtype: float64 within this range?


In [60]:
clicks_contrl = control["Clicks"].loc[control["Enrollments"].notnull()].sum()
clicks_expr = experiment["Clicks"].loc[experiment["Enrollments"].notnull()].sum()

print(clicks_contrl)
print(clicks_expr)

17293
17260


In [62]:
#Gross Conversion - number of enrollments divided by number of clicks
enrollments_contrl = control["Enrollments"].sum()
enrollments_expr = experiment["Enrollments"].sum()

GC_contrl = enrollments_contrl / clicks_contrl
GC_expr = enrollments_expr / clicks_expr
GC_pooled = (enrollments_contrl + enrollments_expr) / (clicks_contrl + clicks_expr)
GC_sd_pooled = mt.sqrt(GC_pooled*(1-GC_pooled) * (1/clicks_contrl + 1/clicks_expr))
GC_ME = round(get_z_score(1-alpha/2) * GC_sd_pooled,4)
GC_diff = round(GC_expr - GC_contrl,4)

exp_Change = GC_diff * 100

print("The change due to the experiment is",exp_Change,"%")
print("Confidence Interval: [",GC_diff-GC_ME,",",GC_diff+GC_ME,"]")
print ("The change is statistically significant if the CI doesn't include 0. In that case, it is practically significant if",-GC["d_min"],"is not in the CI as well.")

The change due to the experiment is -2.06 %
Confidence Interval: [ -0.0292 , -0.012 ]
The change is statistically significant if the CI doesn't include 0. In that case, it is practically significant if -0.01 is not in the CI as well.


In [63]:
#Net Conversion - number of payments divided by number of clicks
payments_contrl = control["Payments"].sum()
payments_expr = experiment["Payments"].sum()

NC_contrl = payments_contrl / clicks_contrl
NC_expr = payments_expr / clicks_expr
NC_pooled=(payments_contrl + payments_expr) / (clicks_contrl + clicks_expr)
NC_sd_pooled=mt.sqrt(NC_pooled*(1-NC_pooled) * (1/clicks_contrl + 1/clicks_expr))
NC_ME=round(get_z_score(1-alpha/2)*NC_sd_pooled,4)
NC_diff=round(NC_expr - NC_contrl,4)

print("The change due to the experiment is",NC_diff*100,"%")
print("Confidence Interval: [",NC_diff-NC_ME,",",NC_diff+NC_ME,"]")
print ("The change is statistically significant if the CI doesn't include 0. In that case, it is practically significant if",NC["d_min"],"is not in the CI as well.")

The change due to the experiment is -0.49 %
Confidence Interval: [ -0.0116 , 0.0018000000000000004 ]
The change is statistically significant if the CI doesn't include 0. In that case, it is practically significant if 0.0075 is not in the CI as well.


In [64]:
full=control.join(other=experiment,how="inner",lsuffix="_cont",rsuffix="_exp")

full.count()

Date_cont           37
Pageviews_cont      37
Clicks_cont         37
Enrollments_cont    23
Payments_cont       23
Date_exp            37
Pageviews_exp       37
Clicks_exp          37
Enrollments_exp     23
Payments_exp        23
dtype: int64

In [65]:
full=full.loc[full["Enrollments_cont"].notnull()]
full.count()

Date_cont           23
Pageviews_cont      23
Clicks_cont         23
Enrollments_cont    23
Payments_cont       23
Date_exp            23
Pageviews_exp       23
Clicks_exp          23
Enrollments_exp     23
Payments_exp        23
dtype: int64

In [66]:
a = full['Enrollments_cont'] / full['Clicks_cont']
b = full['Enrollments_exp'] / full['Clicks_exp']
full['GC'] = np.where(a<b,1,0)

# net conversion
x = full['Payments_cont'] / full['Clicks_cont']
y = full['Payments_exp'] / full['Clicks_exp']
full['NC'] = np.where(x<y,1,0)
full.head()

Unnamed: 0,Date_cont,Pageviews_cont,Clicks_cont,Enrollments_cont,Payments_cont,Date_exp,Pageviews_exp,Clicks_exp,Enrollments_exp,Payments_exp,GC,NC
0,"Sat, Oct 11",7723,687,134.0,70.0,"Sat, Oct 11",7716,686,105.0,34.0,0,0
1,"Sun, Oct 12",9102,779,147.0,70.0,"Sun, Oct 12",9288,785,116.0,91.0,0,1
2,"Mon, Oct 13",10511,909,167.0,95.0,"Mon, Oct 13",10480,884,145.0,79.0,0,0
3,"Tue, Oct 14",9871,836,156.0,105.0,"Tue, Oct 14",9867,827,138.0,92.0,0,0
4,"Wed, Oct 15",10014,837,163.0,64.0,"Wed, Oct 15",9793,832,140.0,94.0,0,1


In [68]:
GC_a = full.GC[full["GC"]==1].count()
NC_a = full.NC[full["NC"]==1].count()
n = full.NC.count()
print("No. of cases for GC:",GC_a,'\n',
      "No. of cases for NC:",NC_a,'\n',
      "No. of total cases",n)

No. of cases for GC: 4 
 No. of cases for NC: 10 
 No. of total cases 23
