In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import seaborn as sns
import datetime

## Get Mock 5 Day Data

In [2]:
# Load the early export and convert `bucketing_date` to datetimes in one
# chained expression so the raw string column never lingers on `data`.
data = pd.read_csv('../early_data.csv').assign(
    bucketing_date=lambda frame: pd.to_datetime(frame['bucketing_date'])
)
data.head(3)

Unnamed: 0,requester_id,bucketing_date,gmv
0,6349788,2019-08-28 17:43:25,56.0
1,12543081,2019-08-28 04:50:55,110.0
2,12581997,2019-08-30 12:45:37,90.0


#### Estimate samples per day based on last 7 days

In [3]:
# Start of the trailing 7-day window, measured back from the latest bucketing timestamp.
last_7 = data['bucketing_date'].max() - datetime.timedelta(days=7)
# Count the rows inside the window (True values of the mask), average over
# the 7 days, and truncate to a whole number of samples per day.
samples_per_day = int((data['bucketing_date'] >= last_7).sum() / 7)
samples_per_day

2712

In [4]:
def sample_data(df, metric, samples, simulations, random_state=None):
    """
    Bootstrap the sampling distribution of the mean of `df[metric]`.

    Each simulation draws `samples` values (with replacement) from the
    `metric` column and records the mean of that draw; this is repeated
    `simulations` times.

    Args:
        df (pd.DataFrame): frame containing the column `metric`
        metric (str): name of the column holding the metric of interest
        samples (int): number of values drawn per simulation
        simulations (int): number of simulated draws
        random_state (int, np.random.RandomState or np.random.Generator, optional):
            seed or generator that makes the simulation reproducible. The
            default (None) keeps the previous behaviour of drawing from
            NumPy's global random state.

    Returns:
        dict with the keys
            'meanOfMeans' (float): mean of the simulated sample means
            'SEOfMeans' (float): standard deviation of the simulated sample
                means (NumPy default, ddof=0)
            'samples' (int): the `samples` argument, echoed back
    """
    metric_series = df[metric]
    # An integer seed is turned into ONE shared generator. Handing the raw
    # seed to every `.sample` call would reproduce the same draw in each
    # simulation and collapse the spread of the means to zero.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state
    means = np.array([
        metric_series.sample(n=samples, replace=True, random_state=rng).mean()
        for _ in range(simulations)
    ])
    return {'meanOfMeans': means.mean(), 'SEOfMeans': means.std(), 'samples': samples}

In [5]:
# Seed NumPy's global RNG so the bootstrap below yields the same numbers on
# every Restart & Run All (pandas' `Series.sample` draws from that global
# state when it is not given a `random_state`).
np.random.seed(42)
# Per-simulation sample size for the 5-day mock:
# samples_per_day * 5 days * (2/3) * (1/2).
# NOTE(review): the 2/3 and 1/2 factors are presumably traffic-allocation
# shares (portion of traffic in the experiment, then split across arms) -- confirm.
dist_stats = sample_data(data, 'gmv', int(samples_per_day*5*(2/3)*(1/2)), 1000)

In [6]:
# Unpack the bootstrap summary into the short names used by the cells below.
mean, se, samples = (
    dist_stats[field] for field in ('meanOfMeans', 'SEOfMeans', 'samples')
)
dist_stats

{'meanOfMeans': 44.13574193141593,
 'SEOfMeans': 1.3994443133725878,
 'samples': 4520}

In [9]:
# Control and treatment are assumed to share the same standard error `se`,
# so the standard error of the difference is sqrt(se^2 + se^2).
combined_se = (2 * se**2) ** 0.5
# Breakeven points to test against the lower bound of the confidence interval.
breakevens = [.131, .209]
# Welch-Satterthwaite degrees of freedom for two groups with equal SE and equal
# size (https://stattrek.com/estimation/difference-in-means.aspx).
# Not used in the arithmetic below; it is the dof quoted for the t_score lookup.
dof = (2 * se**2) ** 2 / (2 * (se**4 / (samples - 1)))
# welch_t_stat = (obs - control) / se_combined
# Lower-tail critical value, looked up by hand for p = .025, dof = 9038
# (https://stattrek.com/online-calculator/t-distribution.aspx).
t_score = -1.96

In [10]:
def calc_observed_mean_2(mean_1, conf_lower, combined_se, t_score):
    """Solve for the observed relative difference that puts the confidence
    interval's lower bound exactly at `conf_lower`.

    The result is expressed as a fraction of `mean_1`. Control and
    treatment are assumed to have the same standard error.

    Args:
        mean_1: mean of the control
        conf_lower: target lower bound of the confidence interval, as a
            fraction of `mean_1`
        combined_se: combined standard error of control and treatment
        t_score: critical t value for the interval; the notebook passes the
            negative lower-tail value (-1.96 for a 95% interval)

    Returns:
        The observed difference, as a fraction of `mean_1`, at which the
        interval's lower bound equals `conf_lower`.
    """
    # Lower bound converted from a fraction of the control mean to metric units.
    lower_bound_abs = conf_lower * mean_1
    # Signed margin of error; negative when t_score is the lower-tail value.
    signed_margin = t_score * combined_se
    return (lower_bound_abs - signed_margin) / mean_1

# For every breakeven point, the relative difference we would need to observe
# for the confidence interval's lower bound to land exactly on that breakeven.
observed_means = list(map(
    lambda lower_bound: calc_observed_mean_2(mean, lower_bound, combined_se, t_score),
    breakevens,
))
observed_means

[0.2188893513658918, 0.2968893513658918]

### For the 4% treatment group, if we see the breakeven point (13.1%) drop out of the 95% confidence interval, we should turn off that treatment. Based on historical data used to estimate 5 days' worth of data, that will happen if our observed mean is 21.9% lower than the control (assuming the treatment does not change the standard deviation).
### For the 7% treatment group, if we see the breakeven point (20.9%) drop out of the 95% confidence interval, we should turn off that treatment. Based on historical data used to estimate 5 days' worth of data, that will happen if our observed mean is 29.7% lower than the control (assuming the treatment does not change the standard deviation).