In [1]:
# Define functions:
from __future__ import division
from math import sqrt

# Confidence interval function:
def ci(crit_value, se, statistic):
    ci_val = crit_value * se
    return ci_val, statistic - ci_val, statistic + ci_val

# SE for binomial distribution:
def se_binom(p, N):
    # Return SE for a binomial distribution
    return sqrt((p * (1 - p)) * pooled_N(N))

# Define N for se_binom, needs to account pooling:
def pooled_N(N):
    if isinstance(N, list):
        return sum([1/int(val) for val in N])
    else:
        return 1/N

# Function to define days needed to run an experiment:
def days_needed_to_run(needed_size, daily_visits, traffic_ratio):
    return needed_size / daily_visits / traffic_ratio

# a factory function to handle '' when creating ints
def make_int(num):
    return int(num) if num else 0

First define the experiment metrics:

Evaluation:
1. Gross conversion: enroll/start_free_trial —> Should reduce, reject NULL, oneway negative
2. Retention: payment/enroll —> Should increase, reject NULL, oneway positive
3. Net conv: payment/start_free_trial —> Should not change. keep NULL, twoway

Invariant (these should not change between experiment and control):
1. Pageviews
2. Clicks (start a free trial)
3. Clickthrough: clicks/pageviews

## Baseline values

The baseline values for these are:

In [2]:

# probabilities:
p_gross = 0.20625
p_ret = 0.53
p_net = 0.1093125

# Define units of analysis for different metrics
N = 5000
pageview = 40000
click = 3200
prob_click = 3200 / 40000
N_click = N * prob_click
prob_enroll = 660 / 40000
N_enroll = N * prob_enroll

print prob_click
print prob_enroll
print N_click
print N_enroll

0.08
0.0165
400.0
82.5


The defined standard deviations for the baseline values using sample N:

In [3]:
se_gross = se_binom(p_gross, N_click, False)
se_retention = se_binom(p_ret, N_enroll, False)
se_net = se_binom(p_net, N_click, False)

# Print the expected standard deviations for each metric, round to 4th decimal:
print '%.4f' % se_gross, '%.4f' % se_retention, '%.4f' % se_net

TypeError: se_binom() takes exactly 2 arguments (3 given)

## Sizing
Sample sizes needed for each evaluation metric using:
alpha = 0.05
beta = 0.2

1. (dmin = 0.01): 25835,
2. (dmin = 0.01): 39115,
3. (dmin = 0.0075): 27413



In [4]:
pv_gross = 25835
pv_retention = 39115
pv_net = 27413

# We need to double these to accomodate for both Experiment and Control groups:
# For 2, we would need overall pageviews:
pv_retention_size = pv_retention/prob_enroll * 2
# For the rest:
pv_gross_size = pv_gross/prob_click * 2
pv_net_size = pv_net/prob_click * 2

# Looking at the params, retention will likely be the largest:
print 'Size for Retention: ' + str(pv_retention_size)
# For the rest:
print 'Size for Gross conv: ' + str(pv_gross_size)
print 'Size for Net conv: ' + str(pv_net_size)

Size for Retention: 4741212.12121
Size for Gross conv: 645875.0
Size for Net conv: 685325.0


## Duration and exposure

Adding a notification is a low-risk procedure, we can direct all or almost all traffic to it.

In [5]:
traffic_ratio = 1.0
daily_visits = 40000
needed_size = pv_retention_size

days_needed = days_needed_to_run(needed_size, daily_visits, traffic_ratio)

print 'Days needed to run: ' + str(days_needed)

Days needed to run: 118.53030303


That experiment is way too long. This is because gathering enough pageviews to reliably estimate Retention is huge. Therefore this metric should be omitted:

In [6]:
needed_size = max(pv_gross_size, pv_net_size)

days_needed = days_needed_to_run(needed_size, daily_visits, traffic_ratio)
print 'Days needed to run: ' + str(days_needed)

Days needed to run: 17.133125


## Sanity checks for expected values

Confidence intervals are computed for the values that are expected to be observed.

We expect the observed values to fall within the confidence interval, else they are expected to not come from same population (eg. have different population parameter(s)).

In [7]:
## Load data:

import csv

# Control:
with open('data/control.csv', 'rb') as f:
    reader = csv.reader(f, delimiter = ';')
    # skip headers
    reader.next()

    # init dict of arrays:
    obs = {}
    obs['pageviews'] = []
    obs['clicks'] = []
    obs['enrollments'] = []
    obs['payments'] = []
    # Add real clicks and pageviews
    obs['real_pageviews'] = []
    obs['real_clicks'] = []
    for row in reader:
        obs['pageviews'].append(make_int(row[0]))
        obs['clicks'].append(make_int(row[1]))
        obs['enrollments'].append(make_int(row[2]))
        obs['payments'].append(make_int(row[3]))
        # Add real clicks and pageviews since it seems that the
        # enrollments and payments stop at some point in the dataset:
        if (make_int(row[3]) != 0 and make_int(row[2]) != 0):
            obs['real_pageviews'].append(make_int(row[0]))
            obs['real_clicks'].append(make_int(row[1]))
    # done

# Experiment:
with open('data/experiment.csv', 'rb') as f:
    reader = csv.reader(f, delimiter = ';')
    # skip headers
    reader.next()

    # init dict of arrays:
    exp = {}
    exp['pageviews'] = []
    exp['clicks'] = []
    exp['enrollments'] = []
    exp['payments'] = []
    # Add real clicks and pageviews
    exp['real_pageviews'] = []
    exp['real_clicks'] = []
    for row in reader:
        exp['pageviews'].append(make_int(row[0]))
        exp['clicks'].append(make_int(row[1]))
        exp['enrollments'].append(make_int(row[2]))
        exp['payments'].append(make_int(row[3]))
        # Add real clicks and pageviews since it seems that the
        # enrollments and payments stop at some point in the dataset:
        if (make_int(row[3]) != 0 and make_int(row[2]) != 0):
            exp['real_pageviews'].append(make_int(row[0]))
            exp['real_clicks'].append(make_int(row[1]))
    # done

In [8]:

# Confidence intervals for observed values:

crit_value = 1.96 # When alpha = 95%

# Define expected statistics for sanity checks:

# first two define the expected proportion of values falling to control-group
p_pageview = 0.5
p_click = 0.5
# the last is the clickthrough in control
p_clickthrough = sum(obs['clicks']) / sum(obs['pageviews'])
print p_clickthrough
# Define N's for them:
N_pageview = sum(obs['pageviews']) + sum(exp['pageviews'])
N_click = sum(obs['clicks']) + sum(exp['clicks'])
N_clickthrough = sum(obs['pageviews'])

# Define SE's for them:
se_pageview = se_binom(p_pageview, N_pageview)
se_click = se_binom(p_click, N_click)
se_clickthrough = se_binom(p_clickthrough, N_clickthrough)

# Expected confidence intervals:
print 'CI for pageviews'
ci_pageviews = ci(crit_value, se_pageview, p_pageview)
print '%.4f' % ci_pageviews[0], '%.4f' % ci_pageviews[1], '%.4f' % ci_pageviews[2]
print 'CI for clicks'
ci_clicks = ci(crit_value, se_click, p_click)
print '%.4f' % ci_clicks[0], '%.4f' % ci_clicks[1], '%.4f' % ci_clicks[2]
print 'CI for clickthrough'
ci_clickthrough = ci(crit_value, se_clickthrough, p_clickthrough)
print '%.4f' % ci_clickthrough[0], '%.4f' % ci_clickthrough[1], '%.4f' % ci_clickthrough[2]

0.0821258135746
CI for pageviews
0.0012 0.4988 0.5012
CI for clicks
0.0041 0.4959 0.5041
CI for clickthrough
0.0009 0.0812 0.0830


In [9]:
# Now let's define the observed values for the sanity checks:

pageview_observed = sum(obs['pageviews']) / (sum(obs['pageviews']) + sum(exp['pageviews']))
click_observed = sum(obs['clicks']) / (sum(obs['clicks']) + sum(exp['clicks']))
# Here we use that the clickthrough in experiment and check that it falls within the CI's of control values.
clickthrough_observed = sum(exp['clicks']) / sum(exp['pageviews'])

print '%.4f' % pageview_observed
print '%.4f' % click_observed
print '%.4f' % clickthrough_observed
# All are within the confidence intervals, therefore all pass!

0.5006
0.5005
0.0822


## Effective size tests

Next, let's compute the confidence intervals around the evaluation metrics.

These metrics were:
Gross conversion: enroll/start_free_trial
Net conv: payment/start_free_trial

In [26]:
print sum(obs['real_clicks']), sum(exp['real_clicks'])

17293 17260


In [13]:
# Define d hat's, they are the measured difference between control and experiment:
d_eval_gross = sum(exp['enrollments']) / sum(exp['real_clicks']) -\
    sum(obs['enrollments']) / sum(obs['real_clicks'])
d_eval_net = sum(exp['payments']) / sum(exp['real_clicks']) -\
    sum(obs['payments']) / sum(obs['real_clicks'])

print d_eval_gross
print d_eval_net
# Define p's, they are pooled probabilities of control and experiment:
p_eval_gross = (sum(exp['enrollments']) + sum(obs['enrollments'])) /\
    (sum(obs['real_clicks']) + sum(exp['real_clicks']))
p_eval_net = (sum(exp['payments']) + sum(obs['payments'])) /\
    (sum(obs['real_clicks']) + sum(exp['real_clicks']))
print p_eval_gross
print p_eval_net
# Define N's, define as array since we calculate a pooled SE:
n_eval_gross = [sum(obs['real_clicks']), sum(exp['real_clicks'])]
n_eval_net = n_eval_gross

print n_eval_gross
print n_eval_net
# Define SE's
se_eval_gross = se_binom(p_eval_gross, n_eval_gross)
se_eval_net = se_binom(p_eval_net, n_eval_net)

print se_eval_gross
print se_eval_net

# Define CI's
crit_value = 1.96
m_eval_gross, ci_eval_gross_lower, ci_eval_gross_upper =\
    ci(crit_value, se_eval_gross, d_eval_gross)
m_eval_net, ci_eval_net_lower, ci_eval_net_upper =\
    ci(crit_value, se_eval_net, d_eval_net)

print m_eval_gross
print m_eval_net

print 'Confidence intervals for the change in gross:'
print '%.4f' % ci_eval_gross_lower, '%.4f' % ci_eval_gross_upper, '%.4f' % d_eval_gross

print 'Confidence intervals for the change in net:'
print '%.4f' % ci_eval_net_lower, '%.4f' % ci_eval_net_upper, '%.4f' % d_eval_net

-0.0205548745804
-0.00487372267454
0.208607067404
0.115127485312
[17293, 17260]
[17293, 17260]
0.00437167538523
0.00343413351293
0.00856848375504
0.00673090168535
Confidence intervals for the change in gross:
-0.0291 -0.0120 -0.0206
Confidence intervals for the change in net:
-0.0116 0.0019 -0.0049


In [12]:
# Statistical significance:

print 'Gross is statistically significant? Note that gross hypothesis was negative one-way'
print d_eval_gross < -m_eval_gross

print 'Net is statistically significant? Note that net hypothesis was two-way'
print d_eval_net > m_eval_net or d_eval_net < -m_eval_net

# Practical significance:
dmin_gross = 0.01
dmin_net = 0.0075

print 'Gross is practically significant?'
print d_eval_gross < -dmin_gross

print 'Net is practically significant?'
print d_eval_net > dmin_net or d_eval_net < -dmin_net

Gross is statistically significant? Note that gross hypothesis was negative one-way
True
Net is statistically significant? Note that net hypothesis was two-way
False
Gross is practically significant?
True
Net is practically significant?
False


## Sign tests

In [87]:
# Let's define the success rate of the experiment day-by-day:
successes_gross = []
successes_net = []
for i in range(len(exp['real_clicks'])):
    if (exp['enrollments'][i] / exp['clicks'][i]) >\
        (obs['enrollments'][i] / obs['clicks'][i]):
        successes_gross.append(1)
    else:
        successes_gross.append(0)

    if (exp['payments'][i] / exp['clicks'][i]) >\
        (obs['payments'][i] / obs['clicks'][i]):
        successes_net.append(1)
    else:
        successes_net.append(0)
# Number of days:
num_of_days = len(exp['real_clicks'])

print 'Number of successes for gross: ' + str(sum(successes_gross))
print 'Number of successes for net: ' + str(sum(successes_net))
print 'Number of days: ' + str(num_of_days)

Number of successes for gross: 4
Number of successes for net: 10
Number of days: 23


Using an online calculator (source: http://graphpad.com/quickcalcs/binomial1.cfm), the values for the sign test are

p_gross = 0.0026 , given two-tailed test. Therefore indicating statistical significance, given alpha of 0.05.

p_net = 0.6776  , given two-tailed test. Therefore not indicating statistical significance given alpha of 0.05.