# PYTHON. Calculations for AB test analysis

In [1]:
import scipy.stats as st
import math
import numpy as np

## Basic theoretical confidence interval calculation
Assumption: the sample proportion is approximately normally distributed (critical values are taken from the **scipy.stats** library).



In [2]:
def conf_interval(clicks, N, conf=0.95, n_tails=2, rnd=4):
    """Print a normal-approximation confidence interval for a proportion.

    The point estimate is ``clicks / N`` and the standard error is the
    binomial one, ``sqrt(p_hat * (1 - p_hat) / N)``.

    Args:
        clicks: Number of successes (e.g. clicks).
        N: Number of trials.
        conf: Confidence level, e.g. 0.95.
        n_tails: Number of tails used to split ``1 - conf`` when looking up
            the critical z-value (1 or 2).
        rnd: Number of decimals used for every printed value.

    Returns:
        None. The estimate, standard error, margin of error and interval
        are printed.
    """
    # Critical value: inverse of the standard normal CDF (percent point function).
    z = st.norm.ppf(1 - (1 - conf) / n_tails)

    p_hat = clicks / N
    # Standard error of a binomial proportion.
    standard_error = math.sqrt(p_hat * (1 - p_hat) / N)

    margin_of_error = standard_error * z

    lower = p_hat - margin_of_error
    upper = p_hat + margin_of_error

    # Every value is rounded with the same `rnd` so the report is consistent.
    print(f'P_hat:           {round(p_hat,rnd)}')
    print(f'Standard error:  {round(standard_error,rnd)}')
    print(f'Margin of error: {round(margin_of_error,rnd)}')
    print("-"*20)
    print(f"{round(conf*100,1)}% {n_tails}-tail confidence interval is [{round(lower,rnd)} ; {round(upper,rnd)}]")



In [3]:
# Example inputs: 300 clicks out of N = 2000 trials, 99% two-tailed interval.
N, clicks = 2000, 300
conf, n_tails = 0.99, 2

conf_interval(clicks, N, conf=conf, n_tails=n_tails)

P_hat:           0.15
Standard error:  0.008
Margin of error: 0.0206
--------------------
99.0% 2-tail confidence interval is [0.1294 ; 0.1706]


## Sample size
What is the minimum number of observations that must be collected to obtain statistically significant results?

* [Evan Miller calculator](https://www.evanmiller.org/ab-testing/sample-size.html)
* calculation based on pooled standard error

In [4]:
#https://towardsdatascience.com/mathematical-intuition-behind-a-b-testing-with-python-9d024e5e7f37

def calc_sample_size(alpha, beta, p, delta, method, n_tails=2):
    """Compute the minimum sample size for comparing two proportions.

    Based on https://www.evanmiller.org/ab-testing/sample-size.html
    Ref: https://stats.stackexchange.com/questions/357336/create-an-a-b-sample-size-calculator-using-evan-millers-post

    Args:
        alpha (float): How often are you willing to accept a Type I error
            (false positive)?
        beta (float): Accepted Type II error rate; the power is ``1 - beta``.
        p (float): Base conversion rate.
        delta (float): Minimum detectable effect as an absolute difference;
            the second group is modelled with rate ``p + delta``.
        method (str): ``'evanmiller'`` or ``'pooled_se'``.
        n_tails (int): Number of tails used to split ``alpha`` when looking
            up the critical z-value.

    Returns:
        The sample size given by the selected formula, rounded to the
        nearest integer.

    Raises:
        ValueError: If ``method`` is not a supported name.
    """
    if method not in ('evanmiller', 'pooled_se'):
        # Previously an unknown method fell through to `return n` and failed
        # with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown method {method!r}; expected 'evanmiller' or 'pooled_se'"
        )

    # Critical values of the standard normal distribution.
    z_alpha = st.norm.ppf(1.0 - alpha / n_tails)
    z_beta = st.norm.ppf(1 - beta)

    if method == 'evanmiller':
        # sd1: both groups at the base rate; sd2: one group shifted by delta.
        sd1 = np.sqrt(2 * p * (1.0 - p))
        sd2 = np.sqrt(p * (1.0 - p) + (p + delta) * (1.0 - p - delta))

        root = z_alpha * sd1 + z_beta * sd2
        return round(root * root / (delta**2))

    # method == 'pooled_se'
    # References:
    #     Code taken from Nguyen Ngo: https://towardsdatascience.com/the-math-behind-a-b-testing-with-example-code-part-1-of-2-7be752e1d06f
    #     Stanford lecture on sample sizes
    # Average of the probabilities of both groups.
    pooled_prob = (p + p + delta) / 2

    return round(2 * (pooled_prob * (1 - pooled_prob) * (z_beta + z_alpha)**2
                      / delta**2))



In [5]:
# Design parameters for the sample-size calculation.
alpha=0.05  # accepted Type I error rate
beta=0.2  # accepted Type II error rate
power=1-beta
d_min=0.02 # delta
p_hat=0.1  # base conversion rate
# Minimum detectable effect relative to the base rate.
# (The earlier `p_hat-d_min/p_hat` evaluated to -0.1 because of operator precedence.)
pct_mde=d_min/p_hat
delta=d_min

n_evanmiller=calc_sample_size(alpha, beta, p_hat, d_min, method='evanmiller')
n_stanford=calc_sample_size(alpha, beta, p_hat, d_min, method='pooled_se')

print (f"Sample size \n1) Evan Miller: {n_evanmiller}  \n2) Pooled_se:  {n_stanford} \nAverage: {round(np.mean([n_stanford,n_evanmiller]))}")


Sample size 
1) Evan Miller: 3623  
2) Pooled_se:  3842 
Average: 3732


## AB test experiment analysis

Compute a theoretical confidence interval for the difference between groups using the pooled standard error, and compare it with the minimum practically significant uplift.
The function then prints a verdict on further actions based on the results.

In [6]:
#theoretical
def ab_pooled_theor_sign(N_cont, action_cont, N_exp, action_exp, d_min, conf=0.95, n_tails=2):
    """Print a pooled-standard-error analysis of an A/B test and a verdict.

    The confidence interval for the difference ``p_exp - p_cont`` is built
    from the pooled proportion and the pooled standard error, then compared
    with zero (statistical significance) and with ``d_min`` (practical
    significance).

    Args:
        N_cont: Number of observations in the control group.
        action_cont: Number of actions (clicks, views, etc.) in the control group.
        N_exp: Number of observations in the experiment group.
        action_exp: Number of actions in the experiment group.
        d_min: Minimum practically significant difference (absolute).
        conf: Confidence level, e.g. 0.95.
        n_tails: Number of tails used to split ``1 - conf`` when looking up
            the critical z-value.

    Returns:
        None. All results and the verdict are printed.
    """
    print(f"Control \n N      {N_cont}\n action {action_cont}")
    print(f"Experiment \n N      {N_exp}\n action {action_exp}")
    print("-"*20)

    p_hat_cont = action_cont / N_cont
    print(f"P control:    {round(p_hat_cont,4)}")

    p_hat_exp = action_exp / N_exp
    print(f"P experiment: {round(p_hat_exp,4)}")

    p_hat_pool = (action_cont + action_exp) / (N_cont + N_exp)
    print(f"P pooled:     {round(p_hat_pool,4)}")

    standard_error_pool = math.sqrt(p_hat_pool * (1 - p_hat_pool) * ((1 / N_cont) + (1 / N_exp)))

    z = st.norm.ppf(1 - (1 - conf) / n_tails)
    margin_of_error_pool = standard_error_pool * z

    dif_hat = p_hat_exp - p_hat_cont

    p_min = dif_hat - margin_of_error_pool
    p_max = dif_hat + margin_of_error_pool
    print("-"*20)
    print(f'Minimal practical significance {round(d_min,4)}')
    print(f"Confidence interval [{round(p_min,4)},{round(p_max,4)}]")

    print("-"*20)
    # Statistical significance: the interval must lie strictly on one side of
    # zero. An interval that merely touches zero (p_min == 0 or p_max == 0)
    # still contains it, so it must not be reported as a significant result.
    if p_min > 0:
        print("There is a significant positive result")
    elif p_max < 0:
        print("There is a significant negative result")
    else:
        print("Confidence interval contains zero. We cannot reject null hypothesis that there is no difference between groups")

    # Practical significance: the interval is to the right of the minimum uplift.
    if d_min <= p_min:
        print(f"With {round(conf*100,1)}% confidence there is practical and significant effect greater than d_min ({d_min}). \nVerdict: LAUNCH the change")
    elif p_min >= -d_min and p_max <= d_min:
        print("There is no practical significance. \nVerdict: NO LAUNCH")
    elif p_min <= -d_min and p_max >= d_min:
        print ("Overlap. Results do not provide clarity. \nVerdict: Additional test may bring more solid results")
    elif p_min >= -d_min and p_max >= d_min:
        print ("Intersection on positive side. Results do not provide clarity. \nVerdict: Additional test may bring more solid results")
    else:
        print("No Launch")
    print("-"*20)

In [7]:
n_tails=2
conf=0.95
d_min=0.02
#experiment --------------------------------------------------------------
#control
N_cont=10072 # pageviews
action_cont=974 # clicks, views, etc

#experiment (target)
N_exp=9886 # pageviews
action_exp=1242 #clicks, views, etc
# calculations --------------------------------------------------------------

# Pass conf and n_tails explicitly; otherwise the values configured above are
# silently ignored in favour of the function defaults.
ab_pooled_theor_sign(N_cont, action_cont, N_exp, action_exp, d_min, conf=conf, n_tails=n_tails)
        

Control 
 N      10072
 action 974
Experiment 
 N      9886
 action 1242
--------------------
P control:    0.0967
P experiment: 0.1256
P poopled:    0.111
--------------------
Minimal practical significance 0.02
Confidence interval [0.0202,0.0376]
--------------------
There is a significant positive result
With 95.0% confidence there is practical and significant effect greater than d_min (0.02). 
Verdict: LAUNCH the change
--------------------
