In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import seaborn as sns
import datetime

## Get 180 Day Data

In [3]:
data = pd.read_csv('../data.csv')
data['bucketing_date'] = pd.to_datetime(data['bucketing_date'])
data.head(3)

Unnamed: 0,requester_id,bucketing_date,gmv
0,7755479,2018-10-08 13:40:40,598.0
1,6015079,2018-10-08 19:27:38,102.0
2,7163831,2018-10-09 12:24:43,332.0


#### Estimate samples per day based on last 7 days

In [4]:
samples_per_day = 2712 #from guardrails.ipynb

In [5]:
def sample_data(df, metric, samples, simulations):
    """
    given a dataframe `df` with a metric of interst column `metric`,
    select `samples` number of samples,
    repeat for `simulations` number of simulations
    
    Returns:
        meanOfMeans (float)
        seOfMeans (float)
    """
    metric_series = df[metric]
    means = np.array([metric_series.sample(n=samples, replace=True).mean() for i in range(simulations)])
    return {'meanOfMeans': means.mean(), 'SEOfMeans': means.std(), 'samples': samples}

In [6]:
days = [30,45,60]
dist_stats = [sample_data(data, 'gmv', int(samples_per_day*day*(2/3)*(1/2)), 1000) for day in days]

In [7]:
dist_stats

[{'meanOfMeans': 146.89160374483774,
  'SEOfMeans': 1.6655195821035258,
  'samples': 27120},
 {'meanOfMeans': 146.88601817748275,
  'SEOfMeans': 1.3901359866296743,
  'samples': 40680},
 {'meanOfMeans': 146.85727731213126,
  'SEOfMeans': 1.1999269009669387,
  'samples': 54240}]

In [18]:
from statsmodels.stats.power import tt_ind_solve_power

In [34]:
def calculate_mde(dist_stats):
    nobs = dist_stats['samples']
    mean = dist_stats['meanOfMeans']
    se = dist_stats['SEOfMeans']
    combined_se = ((se**2)+(se**2))**.5
    value = tt_ind_solve_power(effect_size=None, 
                               nobs1=nobs, 
                               alpha=.05, 
                               power=.95, 
                               ratio=1.0, 
                               alternative='two-sided')
    drop_pct = 1 - (mean-(value*(combined_se*(nobs**0.5))))/mean 
    return drop_pct

In [35]:
drop_pct = [calculate_mde(stat) for stat in dist_stats]
drop_pct

[0.08175306627048029, 0.06823367511508271, 0.05890372034649405]

In [31]:
pd.DataFrame({'days_of_bucketing': days, 'samples': [x['samples'] for x in dist_stats]
              , 'minimum_detectable_effect':[round(x*100,2) for x in drop_pct]})

Unnamed: 0,days_of_bucketing,samples,minimum_detectable_effect
0,30,27120,8.18
1,45,40680,6.82
2,60,54240,5.89
