# <font color=blue>A/B Test Data Generator</font>
### <font color=#4D4D4D>Author: Warren Silva</font>
---
This is a tool used to create sample data suitable for A/B testing.

In [1]:
# libraries
import pandas as pd
import numpy as np
import csv
import random
import string
from datetime import datetime, timedelta
from faker import Faker
fake = Faker()
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.5f}'.format
import scipy.stats as stats

## <font color=blue>Settings</font>

In [2]:
# ---- general ----
record_count = 100_000   # number of rows to generate
export = False           # True writes the generated frame to a CSV file
binary_outcome = False   # True -> 0/1 outcome, False -> continuous outcome

# ---- ids ----
prefix_length = 2        # random lowercase letters placed before the numeric part
start_id = 11303         # first numeric id value
skip_chance = 0.3        # chance that a generated id is left out of the list
repeat_chance = 0.009    # chance that an id is emitted an extra time

# ---- dates ----
start_date = datetime(2023, 8, 1)    # earliest date handed to the date generator
end_date = datetime(2023, 12, 31)    # latest date handed to the date generator

# ---- groups ----
group_names = ['control', 'treatment']   # [first group, second group]
group_ratio = 0.517                      # chance a row lands in the second group

# ---- binary outcome ----
outcomes = [0, 1]        # [value for a miss, value for a hit]
rate_1 = 0.165           # hit rate for the first group
rate_2 = 0.210           # hit rate for the second group

# ---- continuous outcome ----
zero_pct_1 = 0.962       # chance of a zero outcome in the first group
zero_pct_2 = 0.966       # chance of a zero outcome in the second group
min_val = 5              # lower bound of the uniform draw for non-zero outcomes
max_val = 175            # upper bound of the uniform draw for non-zero outcomes
treatment_scale = 0.9905 # multiplier applied to non-zero outcomes in the second group

# ---- outliers (continuous outcome only) ----
outlier_pct = 0.00009    # share of rows overwritten with an outlier value
outlier_min_val = 800    # lower bound of the uniform draw for outliers
outlier_max_val = 2500   # upper bound of the uniform draw for outliers

## <font color=blue>Functions</font>

In [3]:
# functions
def id_prefix(num_characters=prefix_length):
    """Return a string of `num_characters` randomly chosen lowercase ASCII letters."""
    letters = []
    for _ in range(num_characters):
        # one independent draw per position
        letters.append(random.choice(string.ascii_lowercase))
    return ''.join(letters)

def create_ids(start=start_id, records=record_count, skip_rate=skip_chance, repeat_rate=repeat_chance):
    """Build a list of exactly `records` id strings with gaps and occasional repeats.

    Every id is a fresh random prefix from ``id_prefix()`` followed by a running
    integer that begins at `start`.

    Args:
        start: first value of the running integer.
        records: number of ids to return.
        skip_rate: chance that a candidate id is not added on its first draw.
        repeat_rate: chance that a candidate id is added an extra time; when
            that happens the running integer is not advanced for the next
            candidate.

    Returns:
        A list of `records` id strings.

    Raises:
        ValueError: if the rates make it impossible to ever emit an id
            (``skip_rate >= 1`` together with ``repeat_rate <= 0``) while
            ``records`` is positive; without this check the loop never ends.
    """
    if records > 0 and skip_rate >= 1 and repeat_rate <= 0:
        raise ValueError('skip_rate >= 1 with repeat_rate <= 0 can never produce an id')

    id_list = []
    current_val = start
    while len(id_list) < records:
        id_val = id_prefix() + str(current_val)
        # use the function's own parameters, not the module-level settings
        if random.random() > skip_rate:
            id_list.append(id_val)
        if random.random() < repeat_rate:
            # guard: a repeat on the last iteration must not push the list
            # past `records`, otherwise the other columns no longer line up
            if len(id_list) < records:
                id_list.append(id_val)
        else:
            current_val += 1
    return id_list

def create_dates(start=start_date, end=end_date, records=record_count):
    """Return a list of `records` random dates drawn between `start` and `end`.

    Args:
        start: lower bound passed to Faker's ``date_between_dates``.
        end: upper bound passed to Faker's ``date_between_dates``.
        records: number of dates to generate.

    Returns:
        A list with one value from ``fake.date_between_dates`` per record.
    """
    date_list = []
    while len(date_list) < records:
        # honour the caller-supplied bounds rather than the module-level settings
        date_list.append(fake.date_between_dates(start, end))
    return date_list

def create_binary(records=record_count, ratio=group_ratio, rate_1=rate_1, rate_2=rate_2):
    """Generate group labels and binary outcomes for `records` rows.

    Args:
        records: number of rows to generate.
        ratio: threshold for group assignment; a uniform draw above it selects
            ``group_names[0]``, any other draw selects ``group_names[1]``.
        rate_1: chance of ``outcomes[1]`` for rows in ``group_names[0]``.
        rate_2: chance of ``outcomes[1]`` for rows in ``group_names[1]``.

    Returns:
        A ``(group_list, outcome_list)`` tuple of equal-length lists.
    """
    group_list, outcome_list = [], []
    while len(group_list) < records:
        # first draw: which group the row belongs to, and that group's hit rate
        if random.random() > ratio:
            label, hit_rate = group_names[0], rate_1
        else:
            label, hit_rate = group_names[1], rate_2
        group_list.append(label)
        # second draw: above the hit rate is a miss (outcomes[0]), otherwise a hit
        outcome_list.append(outcomes[0] if random.random() > hit_rate else outcomes[1])
    return group_list, outcome_list

def create_continuous(records=record_count, ratio=group_ratio, zero_rt_1=zero_pct_1, zero_rt_2=zero_pct_2,
                      min_value=min_val, max_value=max_val, group2_scale=treatment_scale,
                      outlier_pct=outlier_pct, outlier_min=outlier_min_val, outlier_max=outlier_max_val):
    """Generate group labels and zero-inflated continuous outcomes for `records` rows.

    Args:
        records: number of rows to generate.
        ratio: threshold for group assignment; a uniform draw above it selects
            ``group_names[0]``, any other draw selects ``group_names[1]``.
        zero_rt_1: chance of a 0 outcome for rows in ``group_names[0]``.
        zero_rt_2: chance of a 0 outcome for rows in ``group_names[1]``.
        min_value: lower bound of the uniform draw for non-zero outcomes.
        max_value: upper bound of the uniform draw for non-zero outcomes.
        group2_scale: multiplier applied to non-zero outcomes of ``group_names[1]``.
        outlier_pct: share of rows to overwrite; ``int(records * outlier_pct)``
            positions are replaced.
        outlier_min: lower bound of the uniform draw for outlier values.
        outlier_max: upper bound of the uniform draw for outlier values.

    Returns:
        A ``(group_list, outcome_list)`` tuple of equal-length lists.
    """
    group_list, outcome_list = [], []

    # how many rows will later be overwritten with an outlier value
    num_outliers = int(records * outlier_pct)

    while len(group_list) < records:
        # first draw: group membership, with that group's zero rate and scaling flag
        if random.random() > ratio:
            label, zero_rate, scaled = group_names[0], zero_rt_1, False
        else:
            label, zero_rate, scaled = group_names[1], zero_rt_2, True
        group_list.append(label)

        # second draw: most rows get a plain 0 outcome
        if random.random() < zero_rate:
            outcome_list.append(0)
            continue

        # remaining rows get a uniform value; only the second group is rescaled
        value = np.random.uniform(min_value, max_value)
        if scaled:
            value = value * group2_scale
        outcome_list.append(value)

    # overwrite distinct, randomly chosen positions with outliers,
    # regardless of the group or the value already stored there
    outlier_indices = np.random.choice(records, num_outliers, replace=False)
    for position in outlier_indices:
        outcome_list[position] = np.random.uniform(outlier_min, outlier_max)

    return group_list, outcome_list



## <font color=blue>Populate Dataframe</font>

In [4]:
# generate one list per column; ids first, then dates, then group/outcome
id_vals = create_ids()
date_vals = create_dates()
# the outcome generator is picked by the binary_outcome setting
group_vals, outcome_vals = create_binary() if binary_outcome else create_continuous()

# assemble the columns into a single frame
df = pd.DataFrame(
    {
        'id': id_vals,
        'date': date_vals,
        'group': group_vals,
        'outcome': outcome_vals,
    }
)

# show the first rows as the cell output
df.head()

Unnamed: 0,id,date,group,outcome
0,eu11303,2023-10-24,treatment,0.0
1,zf11305,2023-08-19,treatment,0.0
2,is11309,2023-10-26,control,0.0
3,cv11310,2023-10-01,control,0.0
4,rh11311,2023-08-23,treatment,0.0


## <font color=blue>Export</font>

In [5]:
# minute-resolution timestamp that becomes part of the export file name
datestring = datetime.now().strftime("%Y%m%d_%H%M")

if not export:
    print('No file saved. Set export=True to save results')
else:
    # the file name prefix reflects which outcome type was generated
    if binary_outcome:
        out_file = 'binary_sample_' + datestring + '.csv'
    else:
        out_file = 'continuous_sample_' + datestring + '.csv'
    df.to_csv(out_file, index=False)
    print(out_file + ' successfully exported')

No file saved. Set export=True to save results
