In [27]:
#from scipy.stats import ttest_ind
from statsmodels.stats.weightstats import ttest_ind
from statsmodels.stats.power import TTestIndPower
import numpy as np
from numpy.random import normal
import numpy.random
import scipy.stats as st

def winsorize(l, minv, maxv):
    return [min(maxv, max(minv, round(r,2))) for r in l]

### Quantifying historical revenue and state expectations

In [28]:
numpy.random.seed(42)
revenue_min = 5
revenue_mean = 15
revenue_std = 5
revenue_max = 50
samples = 10000
historical_revenue = winsorize(normal(loc=revenue_mean, scale=revenue_std, size=samples), 
                               revenue_min, revenue_max)
historical_stdev = np.std(historical_revenue)
estimated_mean = np.mean(historical_revenue)
print(f'Our historical records show that shopping sessions in our online \
website are worth ~{int(estimated_mean)}$ on average, with a standard deviation of ~{int(historical_stdev)}$')

Our historical records show that shopping sessions in our online website are worth ~15$ on average, with a standard deviation of ~4$


By implementing a new recommender system, we expect a lift of 2$ per session

In [29]:
expected_lift = 2
effect_size = 0.05

Let's calculate how large our A/B testing sould be to get a confident estimate of whether a 1$ lift can be reached. We would like a probability of true positives of at least 0.8 and a probability of false positives of at most 0.01. We can afford to test the recommender system on 5 percent of the user base, maximum

In [30]:
# run a power analyisis to find the size of groups represented by the varibles below, 
# while taking into account the requirements of the above cell

sample_size_control = 95
sample_size_treatment = 5
sample_size_total = sample_size_control + sample_size_treatment
print(f'We must run an A/B test with at least {sample_size_total} users, \
{sample_size_control} in control and {sample_size_treatment} in treatment')

We must run an A/B test with at least 100 users, 95 in control and 5 in treatment


### Run the A/B test

In [31]:
# here we just simulate the obtained results
numpy.random.seed(420)
control_revenue = winsorize(normal(loc=revenue_mean, scale=revenue_std, size=sample_size_control), 
                               revenue_min, revenue_max)

numpy.random.seed(421)
revenue_mean_treatment = 15.3 # let's assume that this is the true mean of revenue after intervention
treatment_revenue = winsorize(normal(loc=revenue_mean_treatment, scale=revenue_std, size=sample_size_treatment), 
                               revenue_min, revenue_max)

### Check statistical significance

In [36]:
# check here if the difference between treatment and control is statistically significant

stat, p_value = st.ttest_ind(control_revenue, treatment_revenue)
print(f'statistic: {stat}')
print(f'p value: {p_value}')
print(f'Since the p value {p_value} > the effect {effect_size} the result is insignificant')


statistic: -1.4863689573752403
p value: 0.14039171952228868
Since the p value 0.14039171952228868 > the effect 0.05 the result is insignificant


### Check practical significance

In [45]:
# check here if the difference between treatment and control is practially significant

#    1. calculate the confidence interval around the mean
lower, upper = st.t.interval(alpha=0.95, # 95% confidence
                            df=len(treatment_revenue)-1, # degrees of freedom
                            loc=np.mean(treatment_revenue), # average in treatment
                            scale=st.sem(treatment_revenue)) # standard error in treatment
print(f'lower: {lower}')
print(f'upper: {upper}')
#    2. check whether the lift is as high as expected
# To estimate if the magnitude of the gap is high enough, check whether 
# the lower confidence interval of treatment is higher than control + lift
control_avg = np.average(control_revenue)
expected = control_avg + expected_lift
print(f'Since the the lower confidence interval {lower} is less than the control + lift {expected} \n there is no practical significance bewteen control and treatment')



lower: 13.606380878906565
upper: 21.80161912109343
Since the the lower confidence interval 13.606380878906565 is less than the control + lift 16.386105263157894 
 there is no practical significance bewteen control and treatment


  lower, upper = st.t.interval(alpha=0.95, # 95% confidence


### Repeat for a use-case of CTR lift

A small e-commerce website is offering a discount for users to register to a premium service. A banner is shown on the landing page, inviting users to register. Low CTR (0.5%) has been recorded historically on 1000 interactions. The website manager believes that by changing the color of the "call to action" button from red to blue, one could double the number of people clicking. The manager is skeptical and allows for an A/B that includes at most 1% of the user base. The website sevres roughly 100 users per day.

Simulate an A/B test scenario to learn for how long the A/B test should run and to assess whether at the end of it is worth switching color. You can decide by yourself how much the new color is effective in increasing CTR.

In [34]:
from statsmodels.stats.weightstats import ztest as ztest
from statsmodels.stats.power import zt_ind_solve_power
numpy.random.seed(42)
historical_clicks = numpy.random.binomial(1, 0.005, size=1000)