<a href="https://colab.research.google.com/github/tania-z/ab_test/blob/main/Sample_Size_for_A_B_Test_for_CR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Go to "Runtime --> Run All"

import pandas as pd
from scipy import stats 
from scipy.stats import norm
import statsmodels.stats.api as sms
import seaborn as sns
import plotly.graph_objects as go
import os
from datetime import datetime
import math

def calculate_sample_size(p1, p2, control_size, test_size, alpha, beta):
    """Return the minimum per-group sample sizes for a two-proportion test.

    Uses the normal-approximation sample-size formula for comparing two
    proportions with unequal group allocation; ``alpha`` is split over both
    tails (two-sided test).

    Args:
        p1: Conversion rate of the control group, as a fraction (e.g. 0.18).
        p2: Expected conversion rate of the test group, as a fraction.
        control_size: Share of traffic assigned to control, in percent.
        test_size: Share of traffic assigned to test, in percent.
        alpha: Significance level.
        beta: Type II error rate (1 - power).

    Returns:
        Tuple ``(sample_size_control, sample_size_test)``; both values are
        rounded up to whole observations.

    Raises:
        ValueError: If the two shares do not add up to 100, if either share
            is not positive, or if ``p1`` equals ``p2``.
    """
    # Tolerant comparison so fractional percentages are not rejected because
    # of floating-point rounding in the sum.
    if not math.isclose(test_size + control_size, 100):
        raise ValueError('Alarm! test+control must be 100% in total')
    # A 0% group would otherwise surface as a bare ZeroDivisionError below.
    if control_size <= 0 or test_size <= 0:
        raise ValueError('control_size and test_size must both be greater than 0')
    # The effect size is the denominator; zero effect needs infinite samples.
    if p1 == p2:
        raise ValueError('p1 and p2 must differ: a zero uplift cannot be detected')

    ratio = test_size / control_size
    z_score_alpha = norm.ppf(1 - (alpha / 2))
    z_score_beta = norm.ppf(1 - beta)

    # Pooled rate, weighted by the allocation ratio.
    p_bar = (p1 + ratio * p2) / (1 + ratio)
    q_bar = 1 - p_bar

    numerator = (z_score_alpha * math.sqrt(p_bar * q_bar * (1 + 1 / ratio))) \
                + (z_score_beta * math.sqrt(p1 * (1 - p1) + p2 * (1 - p2) / ratio))
    denominator = p1 - p2

    sample_size_control = math.ceil((numerator / denominator) ** 2)
    sample_size_test = math.ceil(sample_size_control * ratio)
    return sample_size_control, sample_size_test

## ⤵ But first, set your parameters here

In [None]:
# Experiment identity and traffic
test_name = "/lp12"     # label shown in the summary and used for the report file name
daily_traffic = 8_000   # users per day, used to turn sample size into days

# Conversion rates
p1 = 0.1805             # control group CR (historical), as a fraction
uplift = 0.025          # expected relative uplift of the test CR over control

# Traffic allocation, in percent
control_size = 80       # share of traffic sent to control
test_size = 20          # share of traffic sent to test

# Error rates
alpha = 0.05            # significance level
beta = 0.2              # 1 - power; usually left unchanged


In [None]:
#@title Results
# Headline scenario: the test CR is the control CR raised by the expected
# relative uplift.
p2 = p1 * (1+uplift)
sample_size_control, sample_size_test = calculate_sample_size(
    p1, p2, control_size, test_size, alpha, beta
)

# Days needed to collect both groups at the configured daily traffic,
# rounded up to whole days.
required_total = sample_size_control + sample_size_test
duration = math.ceil(required_total / daily_traffic)

# Plain-text report; printed below and reused by the HTML export cell.
summary_text = f"""
{test_name}
control CR = {p1 * 100:.2f}%
expeted CR = {p2 * 100:.2f}%
uplift     = {(p2/p1-1) * 100:.2f}%
for {control_size}/{test_size} traffic allocation we need to collect
sample_size_control: {sample_size_control:,}
sample_size_test:    {sample_size_test:,}
--------------------------------------------------------------------
TOTAL SIZE: {sample_size_control + sample_size_test:,}  || Approx. {duration:,} days for daily {daily_traffic:,} users 
--------------------------------------------------------------------
Only for binominal distibution, two-sided test
"""

print(summary_text)

## Iterate over UPLIFTS
# Required sample sizes for a range of relative uplifts (2% .. 15%), used by
# the charts and the table further down.
deltas = [0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15]
sample_sizes_control = []
sample_sizes_test = []
for delta in deltas:
    # Loop-local names are used so the headline `p2`, `sample_size_control`
    # and `sample_size_test` globals computed earlier in this cell are not
    # overwritten. The configured `alpha` / `beta` are passed through (rather
    # than hard-coded 0.05 / 0.2) so the curves agree with the summary.
    size_control, size_test = calculate_sample_size(
        p1, p1 * (1 + delta), control_size, test_size, alpha, beta
    )
    sample_sizes_control.append(size_control)
    sample_sizes_test.append(size_test)

# Combined size per uplift, and the whole days needed to collect it.
sum_list = [control + test for control, test in zip(sample_sizes_control, sample_sizes_test)]
total_list = [math.ceil(total / daily_traffic) for total in sum_list]


## Chart size
# Minimum sample size per group as a function of the relative uplift.
fig = go.Figure(
    data=[
        go.Scatter(x=deltas, y=sample_sizes_control, name='Control Group'),
        go.Scatter(x=deltas, y=sample_sizes_test, name='Test Group'),
    ]
)
fig.update_layout(
    title=f'Minimum required sample size for detectable delta for CR metric for {control_size}/{test_size} traffic allocation',
    xaxis_title='Uplift (Delta)',
    yaxis_title='Sample Size',
    template="plotly_white",
    width=1200,
    height=450,
)
fig.show()


## Chart duration
# Days of traffic needed to reach the combined sample size for each uplift.
figd = go.Figure(
    data=[
        go.Scatter(x=deltas, y=total_list, name='Duration Days', showlegend=True),
    ]
)
figd.update_layout(
    title=f'Required days for minimum detectable delta for CR metric for {control_size}/{test_size} traffic allocation',
    xaxis_title='Uplift (Delta)',
    yaxis_title='Days',
    template="plotly_white",
    width=1200,
    height=450,
)
figd.show()

figtable = go.Figure()

# Tabular view of the per-uplift results: one row per uplift in `df`,
# one column per uplift in `transposed_df`.
data = {
    'Deltas': deltas,
    'Sample Sizes Control': sample_sizes_control,
    'Sample Sizes Test': sample_sizes_test
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data).reset_index(drop=True)
df['Total Size'] = df['Sample Sizes Control'] + df['Sample Sizes Test']

# Convert to object dtype before transposing so each cell keeps its own type:
# the 'Deltas' row stays fractional and the size rows stay integers.
# Transposing the mixed float/int frame directly upcasts everything to float,
# and the previous column-wise `.iloc[:, 1:].astype(int)` then truncated every
# delta after the first column to 0.
transposed_df = df.astype(object).transpose()





/lp12
control CR = 18.05%
expeted CR = 18.50%
uplift = 2.50%
for 80/20 traffic allocation we need to collect
sample_size_control: 287,183
sample_size_test:    71,796
--------------------------------------------------------------------
TOTAL SIZE: 358,979  || Approx. 45 days for daily 8,000 users 
--------------------------------------------------------------------
Only for binominal distibution, two-sided test



In [None]:
#@title Save to HTML. Find in Folder on the Left
# Writes the text summary and both charts into one standalone HTML report
# under ./data.
current_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
summary_text_html = summary_text.replace('\n', '<br>')
title_html = f'{test_name} Summary // {current_timestamp}'

# Define the CSS styles
css_styles = '''
<style>
    body {
        font-family: 'Arial', sans-serif;
    }

    table {
        border-collapse: collapse;
        width: 100%;
    }

    th, td {
        text-align: left;
        padding: 8px;
        border-bottom: 1px solid #ddd;
    }

    th {
        background-color: #f2f2f2;
    }
</style>
'''

# Render the charts as fragments rather than complete documents: the default
# `to_html()` returns a full <html> page, which would be nested inside the
# <div>s below. plotly.js is embedded once, with the first chart; the second
# chart reuses it.
fig_html = fig.to_html(full_html=False, include_plotlyjs=True)
figd_html = figd.to_html(full_html=False, include_plotlyjs=False)

# Define the HTML content
html_content = f'''
<html>
<head>
    <meta charset="utf-8">
    <title>{title_html}</title>
    {css_styles}
</head>
<body>
    <h1>{title_html}</h1>
    <pre>{summary_text_html}</pre>
    <div>{fig_html}</div>
    <div>{figd_html}</div>
</body>
</html>
'''

# Create the output folder; no error if it already exists.
os.makedirs('data', exist_ok=True)

# Make the test name safe for use in a file name: path separators become
# underscores and doubled underscores are collapsed.
normalized_test_name = (
    os.path.normpath(test_name)
    .replace('/', '_')
    .replace('\\', '_')
    .replace('__', '_')
    .replace('__', '_')
)
# Round away float noise (0.07 * 100 == 7.000000000000001) so it does not leak
# into the file name.
uplift_percent = round(uplift * 100, 6)
filename = f"AB_SampleSize_{normalized_test_name}_uplift_{uplift_percent}.html"
file_path = os.path.join('data', filename)

# Save the HTML content to the file. The encoding is explicit so the result
# does not depend on the platform default and matches the declared charset.
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_content)

print(f"HTML file '{file_path}' has been saved.")

HTML file 'data/AB_SampleSize__lp12_uplift_2.5.html' has been saved.
