In [1]:
import sys
sys.path.append('../')
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from roverdata.db import DataWarehouse
from taxes_increase import utils

## Get LTV Data

In [2]:
LTV = pd.read_csv('../LTV.csv')
LTV180 = LTV.loc[LTV.dates == 180]
new = LTV180.loc[LTV.new_repeat == 'new']
new_account = LTV180.loc[LTV.new_account == 'new']

## Get Sample Size Data

In [6]:
samples = pd.read_csv('../retrans.csv')
samples_new = samples.loc[samples['new_repeat']=='new'][['service','num_owners']].groupby(['service']).sum()
samples_new_account = samples.loc[samples['new_account']=='new'][['service','num_owners']].set_index('service')

In [7]:
def project_30_day_samples(df, samples_col):
    """from df with column `service` and inputted samples column (containing 30 day samples)
    , project out 45 and 60 days"""
    df = df[samples_col].reset_index().rename(columns={samples_col:'30_day'})
    df['45_day'] = (df['30_day']*1.5).astype(int)
    df['60_day'] = (df['30_day']*2).astype(int)
    return df.set_index('service')


expected_xx_day_samples_new = project_30_day_samples(df = samples_new, samples_col = 'num_owners')
expected_xx_day_samples_new_account = project_30_day_samples(df = samples_new_account, samples_col = 'num_owners')

## Estimate Distribution of Sample Statistic

#### Estimate mean of means and standard error of means of sample statistic through simulation
http://blog.analytics-toolkit.com/2017/statistical-significance-non-binomial-metrics-revenue-time-site-pages-session-aov-rpu/

In [8]:
def sample_data(df, metric, samples, simulations):
    """
    given a dataframe `df` with a metric of interst column `metric`,
    select `samples` number of samples,
    repeat for `simulations` number of simulations
    
    Returns:
        meanOfMeans (float)
        seOfMeans (float)
    """
    metric_series = df[metric]
    means = np.array([metric_series.sample(n=samples, replace=True).mean() for i in range(simulations)])
    return means.mean(), means.std()

In [9]:
def run_sample_data(df, expected_xx_day_samples):
    """loop through the services, metrics, and sample sizes and estimate mean of means and std of means"""
    dist_stats_dict = {}
    
    for service in df.service.unique():
        df_service = df.loc[df['service'] == service]
        samples = expected_xx_day_samples.loc[service].to_dict()
        for days, samples in samples.items():
            for metric in ['gmv', 'nrt', 'stays']:
                mom, seom = sample_data(df_service, metric, samples, 1000)
                dist_stats_dict[(service, metric, days)] = (samples, mom, seom)  
                
    return pd.DataFrame.from_dict(dist_stats_dict, orient='index', columns=['samples','meanOfMeans','SEOfMeans'])

In [10]:
dist_stats_new = run_sample_data(
    df=new, 
    expected_xx_day_samples = expected_xx_day_samples_new
)
dist_stats_new_account = run_sample_data(
    df=new_account, 
    expected_xx_day_samples = expected_xx_day_samples_new_account
)

## Calculate minimum effect size for 95% power at 95% significance

In [17]:
from statsmodels.stats.power import tt_ind_solve_power

In [41]:
def dict_to_df(d, col_name):
    """convert dictionary with key = (service, metric, sample_days) to a pandas dataframe 
    with columns = col_name"""
    df = pd.DataFrame.from_dict(d, orient='index').rename(columns={0:col_name})
    midx = pd.MultiIndex.from_tuples(df.index.values.tolist(), names =('service', 'metric', 'samples')) 
    df = df.set_index(midx)
    df = df.pivot_table(index = ['service','samples'],  columns = ['metric'], values = col_name)
    return df

In [83]:
def calculate_mde(dist_stats):
    """Given the mean of means and standard deviation of means,
    Calculated the difference in means that would yeild 95% power at 95% significance
    """
    drop_dict = {}
    value = None

    for row in dist_stats.iterrows():
        index = row[0]
        data = row[1]
        nobs = int(data.samples*(2/3)*(1/2))
        mean = data.meanOfMeans
        se = data.SEOfMeans
        value = tt_ind_solve_power(effect_size=None, 
                                   nobs1=nobs, 
                                   alpha=.05, 
                                   power=.95, 
                                   ratio=1.0, 
                                   alternative='two-sided')
        drop_pct = 1 - (mean-(value*(se*(nobs**0.5))))/mean 
        drop_dict[index] = drop_pct
        
    drop_df = dict_to_df(drop_dict, 'drop')
    return drop_df*100


### MDE: New Customers

In [84]:
calculate_mde(dist_stats_new)

Unnamed: 0_level_0,metric,gmv,nrt,stays
service,samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
*,30_day,5.703426,5.675866,5.964617
*,45_day,4.560308,4.679963,4.89431
*,60_day,4.062493,4.147909,4.454377
dog-walking,30_day,15.06062,15.544428,13.4615
dog-walking,45_day,12.594551,12.797581,11.137616
dog-walking,60_day,11.187965,10.734342,9.406112
drop-in,30_day,13.154898,12.87038,12.875805
drop-in,45_day,10.782057,10.439293,10.791877
drop-in,60_day,9.787818,9.169203,9.067103
overnight,30_day,7.031245,6.943036,7.151522


### MDE: New Accounts

In [80]:
calculate_mde(dist_stats_new_account)

Unnamed: 0_level_0,metric,gmv,nrt,stays
service,samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
*,30_day,7.618577,7.350411,7.958499
*,45_day,6.19891,6.329809,6.578455
*,60_day,5.591691,5.460661,5.61584
dog-walking,30_day,19.978164,19.470232,17.23253
dog-walking,45_day,15.623693,15.207359,13.025398
dog-walking,60_day,13.63041,13.417691,12.431043
drop-in,30_day,17.061975,16.493623,16.372328
drop-in,45_day,13.344504,13.71864,13.171841
drop-in,60_day,11.694684,11.626603,11.678117
overnight,30_day,9.544478,9.123178,9.063442


## Calculate power for 6% drop at 95% significance

In [89]:
def calculate_power(dist_stats):
    """Given the mean of means and standard deviation of means,
    Calculated the power given a minimum effect size of 6% and 95% significance
    """

    power_dict = {}
    value = None

    for row in dist_stats.iterrows():
        index = row[0]
        data = row[1]
        nobs = int(data.samples*(2/3)*(1/2))
        mean = data.meanOfMeans
        se = data.SEOfMeans
        drop_pct = .06
        standardized_effect = (mean-mean*(1-drop_pct))/(se*(nobs**0.5))
        value = tt_ind_solve_power(effect_size=standardized_effect, 
                                   nobs1=nobs, 
                                   alpha=.05, 
                                   power=None, 
                                   ratio=1.0, 
                                   alternative='two-sided')
        power_dict[index] = value
    
    power_df = dict_to_df(power_dict, 'power')
    return power_df*100


### Power: New Customers

In [87]:
calculate_power(dist_stats_new)

Unnamed: 0_level_0,metric,gmv,nrt,stays
service,samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
*,30_day,96.65548,96.790256,95.217731
*,45_day,99.730831,99.611472,99.304282
*,60_day,99.961557,99.943132,99.810664
dog-walking,30_day,30.053251,28.523285,36.213002
dog-walking,45_day,40.42561,39.37517,49.286531
dog-walking,60_day,48.938805,52.195376,63.28924
drop-in,30_day,37.623653,39.008737,38.981612
drop-in,45_day,51.840622,54.458745,51.767898
drop-in,60_day,59.866463,65.503976,66.477237
overnight,30_day,86.782651,87.600729,85.643534


### Power: New Accounts

In [88]:
calculate_power(dist_stats_new_account)

Unnamed: 0_level_0,metric,gmv,nrt,stays
service,samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
*,30_day,81.029252,83.708485,77.569085
*,45_day,93.689036,92.744564,90.789042
*,60_day,97.180972,97.729824,97.071777
dog-walking,30_day,19.132596,19.898332,24.110768
dog-walking,45_day,28.284962,29.574571,38.243845
dog-walking,60_day,35.472295,36.410959,41.303431
drop-in,30_day,24.499959,25.882908,26.196141
drop-in,45_day,36.742562,35.093741,37.544302
drop-in,60_day,45.607631,46.036995,45.711606
overnight,30_day,62.025725,65.938734,66.509742
