In [2]:
import pandas as pd
import numpy as np

In [3]:
import scipy.stats as scs
import pandas as pd

def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
                  test_label='B'):
    """Returns a pandas dataframe with fake CTR data
    Example:
    Parameters:
        N_A (int): sample size for control group
        N_B (int): sample size for test group
            Note: final sample size may not match N_A provided because the
            group at each row is chosen at random (50/50).
        p_A (float): conversion rate; conversion rate of control group
        p_B (float): conversion rate; conversion rate of test group
        days (int): optional; if provided, a column for 'ts' will be included
            to divide the data in chunks of time
            Note: overflow data will be included in an extra day
        control_label (str)
        test_label (str)
    Returns:
        df (df)
    """

    # initiate empty container
    data = []

    # total amount of rows in the data
    N = N_A + N_B

    # distribute events based on proportion of group size
    group_bern = scs.bernoulli(N_A / (N_A + N_B))

    # initiate bernoulli distributions from which to randomly sample
    A_bern = scs.bernoulli(p_A)
    B_bern = scs.bernoulli(p_B)

    for idx in range(N):
        # initite empty row
        row = {}
        # for 'ts' column
        if days is not None:
            if type(days) == int:
                row['ts'] = idx // (N // days)
            else:
                raise ValueError("Provide an integer for the days parameter.")
        # assign group based on 50/50 probability
        row['group'] = group_bern.rvs()

        if row['group'] == 0:
            # assign conversion based on provided parameters
            row['converted'] = A_bern.rvs()
        else:
            row['converted'] = B_bern.rvs()
        # collect row into data container
        data.append(row)

    # convert data into pandas dataframe
    df = pd.DataFrame(data)

    # transform group labels of 0s and 1s to user-defined group labels
    df['group'] = df['group'].apply(
        lambda x: control_label if x == 0 else test_label)

    return df

In [4]:
# code examples presented in Python
bcr = 0.10  # baseline conversion rate
d_hat = 0.02  # difference between the groups

# A is control; B is test
N_A = 1000
N_B = 1000

ab_data = generate_data(N_A, N_B, bcr, d_hat)

In [5]:
ab_data.head()

Unnamed: 0,group,converted
0,A,0
1,A,0
2,A,0
3,A,0
4,A,0


In [6]:
ab_summary = ab_data.pivot_table(values='converted', index='group', aggfunc=np.sum)

# add additional columns to the pivot table
ab_summary['total'] = ab_data.pivot_table(values='converted', index='group', aggfunc=lambda x: len(x))
ab_summary['rate'] = ab_data.pivot_table(values='converted', index='group')

In [10]:
ab_summary

Unnamed: 0_level_0,converted,total,rate
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,97,998,0.097194
B,21,1002,0.020958


In [6]:
import plotly.express as px
import chart_studio.plotly as py

fig = px.scatter(ab_summary, x="gdpPercap", y="lifeExp", animation_frame="year", animation_group="country",
           size="pop", color="continent", hover_name="country",
           log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])

fig.write_html("plotly.html")

pandas.core.frame.DataFrame

In [18]:
import math

A_half = math.ceil(ab_summary.loc['A','converted']/2)
if ab_summary.loc['A','converted']%2 ==0:
    total_range = ab_summary.loc['A','converted']
else:
    total_range = ab_summary.loc['A','converted']+1

xA = np.linspace(ab_summary.loc['A','converted']- (A_half-1), 
                     ab_summary.loc['A','converted']+ A_half, total_range)
yA = scs.binom(ab_summary.loc['A','total'], ab_summary.loc['A','rate']).pmf(xA)
xB = np.linspace(ab_summary.loc['B','converted']- (A_half-1), 
                 ab_summary.loc['B','converted']+ A_half, total_range)
yB = scs.binom(ab_summary.loc['B','total'], ab_summary.loc['B','rate']).pmf(xB)

In [19]:
#math.floor(ab_summary.loc['A','converted']/2)
#math.ceil(ab_summary.loc['A','converted']/2)
xB

array([-27., -26., -25., -24., -23., -22., -21., -20., -19., -18., -17.,
       -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,  -7.,  -6.,
        -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,   4.,   5.,
         6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,  15.,  16.,
        17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,  26.,  27.,
        28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,
        39.,  40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,  49.,
        50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.,  60.,
        61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,  70.])

In [20]:
display_data = pd.DataFrame(xA)

display_data.columns =['x']
display_data['y'] =yA

display_data['test_group'] = 'A'
display_data

Unnamed: 0,x,y,test_group
0,49.0,8.061240e-09,A
1,50.0,1.647195e-08,A
2,51.0,3.296326e-08,A
3,52.0,6.462844e-08,A
4,53.0,1.241898e-07,A
...,...,...,...
93,142.0,1.262854e-06,A
94,143.0,8.138380e-07,A
95,144.0,5.202218e-07,A
96,145.0,3.298567e-07,A


In [21]:
data2 = pd.DataFrame(xB)
data2.columns =['x']
data2['y'] =yB
data2['test_group'] = 'B'
display_data = display_data.append(data2, ignore_index=True)

display_data.columns =['converted', 'probability', 'test_group']

In [22]:
import plotly.express as px
import plotly.graph_objs as go

fig = px.bar(display_data, x="converted", y="probability", color="test_group"
   #, animation_frame=display_data.index , animation_group="test_group"
            )
fig.show()

In [39]:
display_data.describe()

Unnamed: 0,x,y
count,200.0,200.0
mean,69.5,0.009999997
std,48.642372,0.01752984
min,-19.0,0.0
25%,30.75,2.366984e-07
50%,69.5,0.0002018884
75%,108.25,0.01282478
max,158.0,0.07373105
