In [1]:
import pandas as pd
import numpy as np
import scipy
import math
import random
from scipy import stats
from random import sample

In [2]:
df = pd.read_csv("AB_test_data.csv")
df.head()
df.describe

<bound method NDFrame.describe of         purchase_TF Variant        date        id
0             False       A  2019-11-08  0x25b44a
1             False       B  2020-08-27  0x46271e
2             False       A  2020-06-11  0x80b8f1
3             False       B  2020-08-22  0x8d736d
4             False       A  2020-08-05  0x96c9c8
...             ...     ...         ...       ...
129995        False       A  2020-07-23  0x4089c2
129996        False       A  2020-06-24  0x6a5e3a
129997        False       A  2019-10-12  0x95e302
129998        False       A  2020-03-18  0x7c4afa
129999        False       A  2019-12-09  0x380071

[130000 rows x 4 columns]>

In [3]:
#Date may not be that influencial, so we remove the data column
df = df[df.columns.drop('date')]
df.head()

Unnamed: 0,purchase_TF,Variant,id
0,False,A,0x25b44a
1,False,B,0x46271e
2,False,A,0x80b8f1
3,False,B,0x8d736d
4,False,A,0x96c9c8


In [4]:
#Divide the control and treatment group
Group_A = df.loc[df['Variant'] == 'A']
Group_B = df.loc[df['Variant'] == 'B']

In [5]:
#Group_A = Control
Group_A.head()

Unnamed: 0,purchase_TF,Variant,id
0,False,A,0x25b44a
2,False,A,0x80b8f1
4,False,A,0x96c9c8
5,False,A,0x751c24
6,False,A,0x60d2bd


In [6]:
#Group_B = Treatment
Group_B.head()

Unnamed: 0,purchase_TF,Variant,id
1,False,B,0x46271e
3,False,B,0x8d736d
59,False,B,0x3ff83f
74,False,B,0x138d19
103,False,B,0x966e6a


In [7]:
A_list = list(Group_A['purchase_TF'])
B_list = list(Group_B['purchase_TF'])

### 1. Conduct an A/B test to determine whether Alternative B improved conversion rates (site users book the property) over alternative A.

In [8]:
print("The mean for group A is:", np.mean(A_list))
print("The mean for group B is:", np.mean(B_list))

The mean for group A is: 0.149616
The mean for group B is: 0.1766


According to the mean comparison, the mean for treatment group is higher (increased by 18.04%).

Seems that the treatment group is effective. Now we'd like to use T-test to double-check.

In [9]:
norm = scipy.stats.norm()

We choose T-test method although the sample size is large: As sample size keeps growing, the results for t-test and z-test will gradually become similar.

We use equal variances/One-tailed strategy(only to see improvement) and write a function for the purpose.

In [10]:
def t_test(List_A,List_B,confidence_level):
    Mean_A = np.mean(List_A)
    Mean_B = np.mean(List_B)
    s = 0
    
    for i in range(len(List_A)):
        s = s + (List_A[i] - Mean_A)**2
    for j in range(len(List_B)):
        s = s + (List_B[j] - Mean_B)**2
        
    s = np.sqrt(s /(len(List_A) + len(List_B) - 2))
    t_stats = (Mean_B-Mean_A) / (s*np.sqrt(1/len(List_A) + 1/len(List_B)))
    
    if t_stats >= norm.ppf(1-(1-confidence_level)/2):
        print('Improvment of treatment group B is siginificant.')
    else:
        print('Improvment of treatment group B is not quite siginificant.')
        
    print('The T-score is: ', t_stats)
    return t_stats

In [11]:
#At 95% confidence interval
t_test(A_list,B_list,0.95)

Improvment of treatment group B is siginificant.
The T-score is:  5.2309979343658375


5.2309979343658375

### 2. Calculate the optimal sample size for a 95% confidence rate and test with 80% power. 

### Conduct the test 10 times using samples of the optimal size. Report results.

To calculate optimal sample size for the given confidence level/power, we write a function as follows.

In [50]:
def sample_size_estimation(List_A,List_B,confidence_level,power):
    Mean_A = np.mean(List_A)
    Mean_B = np.mean(List_B)
    Var_A = np.var(List_A)
    Var_B = np.var(List_B)

    mde = Mean_A-Mean_B
    print(mde)
    p = (Mean_A+Mean_B)/2
    n = (((norm.ppf(1-(1-confidence_level)/2)*np.sqrt(2*p*(1-p)) + norm.ppf(power)*np.sqrt(Mean_A*(1-Mean_A)+Mean_B*(1-Mean_B))))**2)/(mde**2)
    
    return n

In [54]:
optimal_size = sample_size_estimation(A_list,B_list,0.95,0.8)
optimal_size = int(optimal_size)
print("The optimal size at this confidence level/power should be:", optimal_size)

-0.026984000000000008
The optimal size at this confidence level/power should be: 2941


Next, we'd like to use the optimal size to conduct the test 10 times. Again, we write a function for it.

In [65]:
def t_test_multitimes(List_A,List_B,confidence_level,sample_size,n_times):
    for i in range(n_times):
        #random sampling
        Size_A = sample(List_A,sample_size)
        Size_B = sample(List_A,sample_size)
        #using the new sample to run the t_test again
        print("For round ",i+1,", the result is:")
        t_test(Size_A,Size_B,confidence_level)
        print(" ")
    return

In [71]:
t_test_multitimes(A_list,B_list,0.95,optimal_size,10)

For round  1 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  0.11053412387732954
 
For round  2 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  -1.774108925328981
 
For round  3 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  -1.6977171252909788
 
For round  4 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  -0.9631467785646283
 
For round  5 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  -0.7989193353757855
 
For round  6 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  0.8402841880106529
 
For round  7 , the result is:
Improvment of treatment group B is siginificant.
The T-score is:  2.474636011826321
 
For round  8 , the result is:
Improvment of treatment group B is not quite siginificant.
The T-score is:  -2.324513000

### 3. Conduct a sequential test for the 10 samples. For any of the samples, were you able to stop the test prior to using the full sample? What was the average number of iterations required to stop the test?

In [72]:
def t_test_sequential(List_A,List_B,confidence_level,sample_size,n_times,power):
    ttl_iteration = []
    
    for i in range(n_times):
        Size_A = sample(List_A,sample_size)
        Size_B = sample(List_A,sample_size)
        ln_A = np.log(1/(1-confidence_level))
        ln_B = np.log(1-power)
        total_sample = Size_A + Size_B
        random.shuffle(total_sample)
        p_A = np.mean(Size_A)
        p_B = np.mean(Size_B)
        accumulative_log_lamda = 0
        t=0
        
        while (accumulative_log_lamda>ln_B) and (accumulative_log_lamda <ln_A):
            if total_sample[t]:
                accumulative_log_lamda += np.log(p_A/p_B)
            else:
                accumulative_log_lamda += np.log((1-p_A)/(1-p_B))
            t += 1
            if t >= len(total_sample):
                print('Cannot reject or accept H0')
                break
            else:
                continue
                
        if accumulative_log_lamda <= ln_B:
            print("Accept H0 in",t,"trials.")
    
        elif accumulative_log_lamda >= ln_A:
            print("Reject H0 in",t,"trials.")
            
        ttl_iteration.append(t)
        
    average_iteration = np.average(ttl_iteration)
    return np.average(average_iteration)

In [77]:
t_test_sequential(A_list,B_list,0.95,optimal_size,10,0.8)

Cannot reject or accept H0
Cannot reject or accept H0
Cannot reject or accept H0
Cannot reject or accept H0
Cannot reject or accept H0
Cannot reject or accept H0
Accept H0 in 94 trials.
Accept H0 in 1482 trials.
Accept H0 in 1301 trials.
Cannot reject or accept H0


4405.1