In [93]:
import pandas as pd
import numpy as np
import math

In [2]:
data = pd.read_csv('data/Final Project Baseline Values - Sheet1.csv', header = None)
data.head()

Unnamed: 0,0,1
0,Unique cookies to view course overview page pe...,40000.0
1,"Unique cookies to click ""Start free trial"" per...",3200.0
2,Enrollments per day:,660.0
3,"Click-through-probability on ""Start free trial"":",0.08
4,"Probability of enrolling, given click:",0.20625


In [3]:
data.columns = ['metrics', 'values']

In [4]:
data['metrics'] = ['unique cookies to view page/day', 'unique cookies to click "Start trial"/per day', 'enrollments/day',
                  'ctp on star trial', 'gross conversion', 'retention', 'net conversion']
data['dmin'] = [3000, 50, 240, 0.01, 0.01, 0.01, 0.0075]
data

Unnamed: 0,metrics,values,dmin
0,unique cookies to view page/day,40000.0,3000.0
1,"unique cookies to click ""Start trial""/per day",3200.0,50.0
2,enrollments/day,660.0,240.0
3,ctp on star trial,0.08,0.01
4,gross conversion,0.20625,0.01
5,retention,0.53,0.01
6,net conversion,0.109313,0.0075


In [5]:
data['sample'] = np.nan
f = 5000 / 40000
for i in range(3):
    data.loc[i,'sample'] = data.loc[i, 'values'] * f
data

Unnamed: 0,metrics,values,dmin,sample
0,unique cookies to view page/day,40000.0,3000.0,5000.0
1,"unique cookies to click ""Start trial""/per day",3200.0,50.0,400.0
2,enrollments/day,660.0,240.0,82.5
3,ctp on star trial,0.08,0.01,
4,gross conversion,0.20625,0.01,
5,retention,0.53,0.01,
6,net conversion,0.109313,0.0075,


## Testing Normality

In [6]:
data.loc[1, 'values']

3200.0

In [7]:
def normality(p, n, metric):
    if p * n > 10 and n * (1-p) > 10:
        print(metric + ': normal distribution assumption is valid')
    else:
        print(metric + ': normal distribution assumption is not valid')
for i, j in zip([1,2,1], [4,5,6]):
    normality(data.loc[j, 'values'],data.loc[i, 'sample'], data.loc[j, 'metrics'] )

gross conversion: normal distribution assumption is valid
retention: normal distribution assumption is valid
net conversion: normal distribution assumption is valid


## Measuring Variability

In [8]:
def std(p,n, metric):
    sd = round((p * (1-p)/n) ** 0.5 , 4)
    print(metric + ' standard deviation:',sd )
    return sd
    
data['sd'] = np.nan
for i,j in zip([1,2,1], [4,5,6]):
    data.loc[j, 'sd'] = std(data.loc[j, 'values'], data.loc[i, 'sample'], data.loc[j, 'metrics'])
data

gross conversion standard deviation: 0.0202
retention standard deviation: 0.0549
net conversion standard deviation: 0.0156


Unnamed: 0,metrics,values,dmin,sample,sd
0,unique cookies to view page/day,40000.0,3000.0,5000.0,
1,"unique cookies to click ""Start trial""/per day",3200.0,50.0,400.0,
2,enrollments/day,660.0,240.0,82.5,
3,ctp on star trial,0.08,0.01,,
4,gross conversion,0.20625,0.01,,0.0202
5,retention,0.53,0.01,,0.0549
6,net conversion,0.109313,0.0075,,0.0156


In [17]:
control = pd.read_excel('Final Project Results.xlsx', 'Control')
experiment = pd.read_excel('Final Project Results.xlsx','Experiment')

In [47]:
# create a new dataframe
sanity_check = pd.DataFrame(index = ['Pageviews','Clicks','CTP'], 
                            columns=['CI_lower','CI_upper','obs','pass',])
sanity_check.loc['Pageviews', 'obs'] =  control['Pageviews'].sum()/(control['Pageviews'].sum()+experiment['Pageviews'].sum())
sanity_check.loc['Clicks', 'obs'] =  control['Clicks'].sum()/(control['Clicks'].sum()+experiment['Clicks'].sum())
sanity_check

Unnamed: 0,CI_lower,CI_upper,obs,pass
Pageviews,,,0.50064,
Clicks,,,0.500467,
CTP,,,,


## Cookies


In [48]:
std = ((0.5 * 0.5) / (control['Pageviews'].sum() + experiment['Pageviews'].sum())) ** 0.5
margin_of_error = std * 1.96 # 95 % confidence interval
sanity_check.loc['Pageviews', 'CI_lower'] = sanity_check.loc['Pageviews', 'obs'] - margin_of_error
sanity_check.loc['Pageviews', 'CI_upper'] = sanity_check.loc['Pageviews', 'obs'] + margin_of_error
if sanity_check.loc['Pageviews', 'CI_lower'] <= sanity_check.loc['Pageviews', 'obs'] <= sanity_check.loc['Pageviews', 'CI_upper']:
    sanity_check.loc['Pageviews', 'pass'] = 1
else:
    sanity_check.loc['Pageviews', 'pass'] = 0

sanity_check 

Unnamed: 0,CI_lower,CI_upper,obs,pass
Pageviews,0.49946,0.501819,0.50064,1.0
Clicks,,,0.500467,
CTP,,,,


In [49]:
std = ((0.5 * 0.5) / (control['Clicks'].sum() + experiment['Clicks'].sum())) ** 0.5
margin_of_error = std * 1.96 # 95 % confidence interval
sanity_check.loc['Clicks', 'CI_lower'] = sanity_check.loc['Clicks', 'obs'] - margin_of_error
sanity_check.loc['Clicks', 'CI_upper'] = sanity_check.loc['Clicks', 'obs'] + margin_of_error
if sanity_check.loc['Clicks', 'CI_lower'] <= sanity_check.loc['Clicks', 'obs'] <= sanity_check.loc['Clicks', 'CI_upper']:
    sanity_check.loc['Clicks', 'pass'] = 1
else:
    sanity_check.loc['Clicks', 'pass'] = 0

sanity_check 

Unnamed: 0,CI_lower,CI_upper,obs,pass
Pageviews,0.49946,0.501819,0.50064,1.0
Clicks,0.496352,0.504583,0.500467,1.0
CTP,,,,


In [51]:
sanity_check.loc['CTP', 'obs'] = (control['Clicks'].sum() / control['Pageviews'].sum()) - (experiment['Clicks'].sum() / experiment['Pageviews'].sum())

In [52]:
sanity_check

Unnamed: 0,CI_lower,CI_upper,obs,pass
Pageviews,0.49946,0.501819,0.50064,1.0
Clicks,0.496352,0.504583,0.500467,1.0
CTP,,,-5.7e-05,


In [56]:
pooled_prob = (control['Clicks'].sum() + experiment['Clicks'].sum()) / (experiment['Pageviews'].sum() + control['Pageviews'].sum())
a = 1 / experiment['Pageviews'].sum()
b = 1 / control['Pageviews'].sum()
std = (pooled_prob * (1 - pooled_prob) * (a + b)) ** 0.5
margin_of_error = 1.96 * std
sanity_check.loc['CTP', 'CI_lower'] = sanity_check.loc['CTP', 'obs'] - margin_of_error
sanity_check.loc['CTP', 'CI_upper'] = sanity_check.loc['CTP', 'obs'] + margin_of_error
if sanity_check.loc['CTP', 'CI_lower'] <= sanity_check.loc['CTP', 'obs'] <= sanity_check.loc['CTP', 'CI_upper']:
    sanity_check.loc['CTP', 'pass'] = 1
else:
    sanity_check.loc['CTP', 'pass'] = 0
    
sanity_check

Unnamed: 0,CI_lower,CI_upper,obs,pass
Pageviews,0.49946,0.501819,0.50064,1
Clicks,0.496352,0.504583,0.500467,1
CTP,-0.001352,0.001239,-5.7e-05,1


## Statistical Significance

In [64]:
statistical_significance = pd.DataFrame(columns = ['dmin', 'observed', 'lower_bound', 'upper_bound', 'statistical_significance'],
                                       index = ['Gross Conversion', 'Net Conversion'])
statistical_significance.dmin = (0.01, 0.0075)

Unnamed: 0,dmin,observed,lower_bound,upper_bound,statistical_significance
Gross Conversion,0.01,,,,
Net Conversion,0.0075,,,,


In [71]:
control_valid = control[:23]
experiment_valid = experiment[:23]
control_gross_conversion = control_valid['Enrollments'].sum() / control_valid['Clicks'].sum()
experiment_gross_conversion = experiment_valid['Enrollments'].sum() / experiment_valid['Clicks'].sum()
diff_observed = experiment_gross_conversion - control_gross_conversion
pooled_prob = control_gross_conversion + experiment_gross_conversion
a = 1 / control_valid['Clicks'].sum()
b = 1 / experiment_valid['Clicks'].sum()
std = ((a + b) * (pooled_prob) * (1 - pooled_prob)) ** 0.5
margin_of_error = 1.96 * std
lower_bound = -margin_of_error
upper_bound = margin_of_error
statistical_significance.loc['Gross Conversion', 'observed'] = diff_observed
statistical_significance.loc['Gross Conversion', 'lower_bound'] = lower_bound
statistical_significance.loc['Gross Conversion', 'upper_bound'] = upper_bound
if abs(diff_observed) > statistical_significance.loc['Gross Conversion', 'dmin']:
    if lower_bound <= diff_observed <= upper_bound:
        statistical_significance.loc['Gross Conversion', 'statistical_significance'] = 0
    else:
        statistical_significance.loc['Gross Conversion', 'statistical_significance'] = 1
else:
        statistical_significance.loc['Gross Conversion', 'statistical_significance'] = 0
statistical_significance

Unnamed: 0,dmin,observed,lower_bound,upper_bound,statistical_significance
Gross Conversion,0.01,-0.020555,-0.010399,0.010399,1.0
Net Conversion,0.0075,,,,


In [72]:
control_net_conversion = control_valid['Payments'].sum() / control_valid['Clicks'].sum()
experiment_net_conversion = experiment_valid['Payments'].sum() / experiment_valid['Clicks'].sum()
diff_observed = experiment_net_conversion - control_net_conversion
pooled_prob = control_net_conversion + experiment_net_conversion
a = 1 / control_valid['Clicks'].sum()
b = 1 / experiment_valid['Clicks'].sum()
std = ((a + b) * (pooled_prob) * (1 - pooled_prob)) ** 0.5
margin_of_error = 1.96 * std
lower_bound = -margin_of_error
upper_bound = margin_of_error
statistical_significance.loc['Net Conversion', 'observed'] = diff_observed
statistical_significance.loc['Net Conversion', 'lower_bound'] = lower_bound
statistical_significance.loc['Net Conversion', 'upper_bound'] = upper_bound
if abs(diff_observed) > statistical_significance.loc['Net Conversion', 'dmin']:
    if lower_bound <= diff_observed <= upper_bound:
        statistical_significance.loc['Net Conversion', 'statistical_significance'] = 0
    else:
        statistical_significance.loc['Net Conversion', 'statistical_significance'] = 1
else:
        statistical_significance.loc['Net Conversion', 'statistical_significance'] = 0
statistical_significance

Unnamed: 0,dmin,observed,lower_bound,upper_bound,statistical_significance
Gross Conversion,0.01,-0.020555,-0.010399,0.010399,1
Net Conversion,0.0075,-0.004874,-0.008878,0.008878,0


## Sign Test

In [105]:
sign_test = pd.DataFrame(columns = ['pvalue', 'pvalue < 0.05'],
                        index = ['Gross Conversion', 'Net Conversion'])
sign_test

Unnamed: 0,pvalue,pvalue < 0.05
Gross Conversion,,
Net Conversion,,


In [106]:
def prob(x,n):
    p= round(math.factorial(n)/(math.factorial(x)* math.factorial(n-x))*0.5**x*0.5**(n-x),8)
    return p
def pvalue(x,n):
    p = 0
    for i in range(0, x+1):
        p = p + prob(i, n)
    return 2 * p

In [107]:
control_valid['GC'] =  control_valid['Enrollments'] / control_valid['Clicks']
experiment_valid['GC'] = experiment_valid['Enrollments'] / experiment_valid['Clicks']
combined = control_valid.merge(experiment_valid, on = 'Date', suffixes = ('_control', '_experiment'))[['GC_control', 'GC_experiment']]
combined['sign'] = np.where(combined['GC_experiment'] > combined['GC_control'], 1, 0)
p = combined.sign[combined['sign'] == 1].count()
n = combined.sign.count()
sign_test.loc['Gross Conversion', 'pvalue'] = pvalue(p,n)
sign_test.loc['Gross Conversion', 'pvalue < 0.05'] = int(pvalue(p,n) < 0.05)
sign_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_valid['GC'] =  control_valid['Enrollments'] / control_valid['Clicks']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  experiment_valid['GC'] = experiment_valid['Enrollments'] / experiment_valid['Clicks']


Unnamed: 0,pvalue,pvalue < 0.05
Gross Conversion,0.002599,1.0
Net Conversion,,


In [108]:
control_valid['NC'] =  control_valid['Payments'] / control_valid['Clicks']
experiment_valid['NC'] = experiment_valid['Payments'] / experiment_valid['Clicks']
combined = control_valid.merge(experiment_valid, on = 'Date', suffixes = ('_control', '_experiment'))[['NC_control', 'NC_experiment']]
combined['sign'] = np.where(combined['NC_experiment'] > combined['NC_control'], 1, 0)
p = combined.sign[combined['sign'] == 1].count()
n = combined.sign.count()
sign_test.loc['Net Conversion', 'pvalue'] = pvalue(p,n)
sign_test.loc['Net Conversion', 'pvalue < 0.05'] = int(pvalue(p,n) < 0.05)
sign_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_valid['NC'] =  control_valid['Payments'] / control_valid['Clicks']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  experiment_valid['NC'] = experiment_valid['Payments'] / experiment_valid['Clicks']


Unnamed: 0,pvalue,pvalue < 0.05
Gross Conversion,0.002599,1
Net Conversion,0.677639,0
