In [1]:
import sys
sys.path.append('../')
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from roverdata.db import DataWarehouse
from taxes_increase import utils

## Get LTV Data

In [2]:
# Load the LTV table. Later cells filter it on `dates`, `new_repeat` and
# `new_account`, and read the `service`, `gmv`, `nrt` and `stays` columns.
LTV = pd.read_csv('../LTV.csv')

#### 180 Day LTV Data

In [3]:
# Keep only the rows whose `dates` value is 180.
LTV180 = LTV[LTV['dates'] == 180]

#### 180 Day LTV Data for New Customers

In [4]:
# Rows of the 180-day table flagged as `new` in the `new_repeat` column.
new = LTV180[LTV180['new_repeat'] == 'new']

#### 180 Day LTV Data for New Account Customers

In [5]:
# Rows of the 180-day table flagged as `new` in the `new_account` column.
new_account = LTV180[LTV180['new_account'] == 'new']

## Get Sample Size Data

In [6]:
# Sample-size table; `num_owners` is summed/selected per `service` below.
samples = pd.read_csv('../retrans.csv')

# New customers: total `num_owners` per service.
samples_new = (
    samples
    .loc[samples['new_repeat'] == 'new', ['service', 'num_owners']]
    .groupby(['service'])
    .sum()
)

# New accounts: `num_owners` indexed by service.
# NOTE(review): unlike `samples_new`, this is not aggregated, so it presumably
# relies on there being one row per service when new_account == 'new' -- confirm.
samples_new_account = (
    samples
    .loc[samples['new_account'] == 'new', ['service', 'num_owners']]
    .set_index('service')
)

In [7]:
def project_30_day_samples(df, samples_col):
    """Project 30-day sample counts out to 45 and 60 days.

    Args:
        df: frame whose index is named `service` and which holds the 30-day
            sample counts in column `samples_col`.
        samples_col: name of the column containing the 30-day counts.

    Returns:
        DataFrame indexed by `service` with columns `30_day`, `45_day`
        (30-day count x 1.5) and `60_day` (30-day count x 2); the projected
        columns are truncated to integers.
    """
    projected = df[samples_col].reset_index().rename(columns={samples_col: '30_day'})
    for label, factor in (('45_day', 1.5), ('60_day', 2)):
        projected[label] = (projected['30_day'] * factor).astype(int)
    return projected.set_index('service')


# Projected 30/45/60-day sample sizes for each cohort.
expected_xx_day_samples_new = project_30_day_samples(samples_new, 'num_owners')
expected_xx_day_samples_new_account = project_30_day_samples(samples_new_account, 'num_owners')

## Estimate Distribution of Sample Statistic

#### Estimate mean of means and standard error of means of sample statistic through simulation
http://blog.analytics-toolkit.com/2017/statistical-significance-non-binomial-metrics-revenue-time-site-pages-session-aov-rpu/

In [8]:
def sample_data(df, metric, samples, simulations, random_state=None):
    """Bootstrap the sampling distribution of the mean of `metric`.

    Draws `samples` values with replacement from ``df[metric]``, takes their
    mean, and repeats this `simulations` times.

    Args:
        df: dataframe containing the metric of interest.
        metric: name of the column to resample.
        samples: number of values drawn (with replacement) per simulation.
        simulations: number of resampling rounds.
        random_state: optional seed (or ``numpy.random.RandomState``) that makes
            the simulation reproducible. With the default ``None`` the draws use
            pandas' default random source, exactly as before.

    Returns:
        meanOfMeans (float): mean of the simulated sample means.
        seOfMeans (float): standard deviation (ddof=0) of the simulated sample means.
    """
    metric_series = df[metric]

    # Build one generator up front so successive draws advance the same stream;
    # re-seeding on every iteration would produce identical samples each round.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    means = np.array([
        metric_series.sample(n=samples, replace=True, random_state=rng).mean()
        for _ in range(simulations)
    ])
    return means.mean(), means.std()

In [9]:
def run_sample_data(df, expected_xx_day_samples, metrics=('gmv', 'nrt', 'stays'), simulations=1000):
    """Estimate the mean of means and std of means for every service/metric/sample size.

    Args:
        df: dataframe with a `service` column and one column per metric.
        expected_xx_day_samples: frame indexed by service whose columns map a
            horizon label to the number of samples to draw for that horizon.
        metrics: metric columns to simulate (defaults to gmv, nrt and stays).
        simulations: number of resampling rounds per combination (default 1000).

    Returns:
        DataFrame keyed by ``(service, metric, days)`` with columns
        `samples`, `meanOfMeans` and `SEOfMeans`.
    """
    dist_stats_dict = {}

    for service in df.service.unique():
        df_service = df.loc[df['service'] == service]
        # Distinct names for the per-service dict and the per-horizon count; the
        # loop variable must not shadow the dict it is iterating over.
        samples_by_days = expected_xx_day_samples.loc[service].to_dict()
        for days, n_samples in samples_by_days.items():
            for metric in metrics:
                mom, seom = sample_data(df_service, metric, n_samples, simulations)
                dist_stats_dict[(service, metric, days)] = (n_samples, mom, seom)

    return pd.DataFrame.from_dict(dist_stats_dict, orient='index', columns=['samples','meanOfMeans','SEOfMeans'])

In [10]:
# Simulated sampling-distribution statistics for each cohort
# (one row per service / metric / horizon).
dist_stats_new = run_sample_data(new, expected_xx_day_samples_new)
dist_stats_new_account = run_sample_data(new_account, expected_xx_day_samples_new_account)

## Find control-variant differences that would be significant (alpha = 0.05, i.e. 95% confidence)

In [11]:
from scipy.stats import t

def independent_ttest(mean, se, samples, alpha, diff):
    """Test whether a relative drop of `diff` from `mean` is statistically significant.

    The comparison mean is ``mean * (1 - diff)``. The t statistic is the
    difference between the two means divided by `se`, evaluated against a
    Student's t distribution with ``samples - 1`` degrees of freedom.

    Args:
        mean: baseline mean.
        se: standard error used as the denominator of the t statistic. It is
            used exactly as given; nothing is pooled or rescaled here.
        samples: sample size; the degrees of freedom are ``samples - 1``.
        alpha: tail probability for the critical value, which is computed as
            ``t.ppf(1.0 - alpha, df)``. Passing a significance level (e.g. 0.05)
            gives the upper-tail critical value; passing a confidence level
            such as 0.95 gives the lower-tail value instead.
        diff: relative drop applied to `mean` (0.02 means 2% lower).

    Returns:
        tuple: ``(t_stat, df, cv, pval)`` -- the t statistic, the degrees of
        freedom, the critical value and the two-sided p-value.
    """
    baseline, reduced = mean, mean * (1 - diff)
    t_stat = (reduced - baseline) / se
    df = samples - 1
    cv = t.ppf(1.0 - alpha, df)
    # Use the survival function rather than `1 - cdf`: for large |t| the cdf
    # rounds to 1.0 and the subtraction collapses the p-value to exactly 0.
    pval = t.sf(abs(t_stat), df) * 2.0
    return t_stat, df, cv, pval

In [12]:
def run_independent_ttest(dist_stats):
    """Find the smallest relative drop in the mean that tests as significant.

    For each row of `dist_stats` (keyed by a ``(service, metric, <label>)``
    tuple, with columns `samples`, `meanOfMeans` and `SEOfMeans`), relative
    drops from 0.5% up to (but excluding) 15% are tried in 0.1% steps. The
    first drop whose two-sided p-value is below 0.05 is recorded. Rows for
    which no tried drop is significant are omitted.

    Returns:
        DataFrame indexed by ``(samples, service)`` with one column per metric
        holding the minimum significant drop as a fraction. The `samples`
        level holds the third element of each row key; it keeps that name
        because callers join on it. An empty frame with the same index and
        column names is returned when nothing reaches significance.

    NOTE(review): `SEOfMeans` is passed through unchanged while the degrees of
    freedom use a reduced size (``row.samples * (2/3) * (1/2)``). If the SE was
    estimated at the full `row.samples` for a single group, it probably needs
    rescaling to the per-group size and combining across both groups --
    confirm, since this would change the reported thresholds.
    """
    records = []

    for index, row in dist_stats.iterrows():
        # Unpack the key directly rather than round-tripping it through a
        # frame's `index` column, which depends on how pandas stores tuple keys.
        service, metric, label = index
        samples = row.samples*(2/3)
        mean = row.meanOfMeans
        se = row.SEOfMeans
        for diff in np.arange(.005, .15, .001):
            # Only the p-value is used; alpha affects just the discarded critical value.
            _, _, _, pval = independent_ttest(
                mean = mean,
                se = se,
                samples = samples*(1/2),
                alpha = .95,
                diff = diff
            )
            if pval < 0.05:
                records.append({
                    'service': service,
                    'metric': metric,
                    'samples': label,
                    'drop': diff,
                    'pval': pval,
                })
                break

    if not records:
        # Nothing was significant in the tested range: return an empty frame
        # shaped like the pivot so that scaling and joining still work.
        return pd.DataFrame(
            index=pd.MultiIndex.from_arrays([[], []], names=['samples', 'service']),
            columns=pd.Index([], name='metric'),
        )

    result = pd.DataFrame(records, columns=['service', 'metric', 'samples', 'drop', 'pval'])
    return result.pivot_table(index=['samples', 'service'], columns=['metric'], values = 'drop')


## New Customers % Drops for Significance

In [13]:
# Name the column axis so the stacked index level is called `samples`.
expected_xx_day_samples_new.columns.name = 'samples'
reshaped_samples_new = (
    expected_xx_day_samples_new
    .stack()
    .sort_index()
    .to_frame('expected_samples')
)

In [14]:
# Minimum significant drops, scaled from fractions to percentages, next to the
# expected sample size for each service / horizon.
sig_drops_new = reshaped_samples_new.join(
    run_independent_ttest(dist_stats_new).mul(100),
    on=['samples', 'service'],
)
sig_drops_new

Unnamed: 0_level_0,Unnamed: 1_level_0,expected_samples,gmv,nrt,stays
service,samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
*,30_day,36150,2.3,2.3,2.4
*,45_day,54225,1.9,1.9,1.9
*,60_day,72300,1.7,1.6,1.7
dog-walking,30_day,3766,6.0,6.2,5.3
dog-walking,45_day,5649,4.8,5.0,4.2
dog-walking,60_day,7532,4.3,4.2,3.7
drop-in,30_day,6403,5.3,5.1,5.3
drop-in,45_day,9604,4.2,4.2,4.0
drop-in,60_day,12806,3.6,3.6,3.6
overnight,30_day,24635,2.7,2.7,2.7


## New Accounts % Drops for Significance

In [15]:
# Name the column axis so the stacked index level is called `samples`.
expected_xx_day_samples_new_account.columns.name = 'samples'
reshaped_samples_new_account = (
    expected_xx_day_samples_new_account
    .stack()
    .sort_index()
    .to_frame('expected_samples')
)

In [16]:
# Minimum significant drops, scaled from fractions to percentages, next to the
# expected sample size for each service / horizon.
sig_drops_new_account = reshaped_samples_new_account.join(
    run_independent_ttest(dist_stats_new_account).mul(100),
    on=['samples', 'service'],
)
sig_drops_new_account

Unnamed: 0_level_0,Unnamed: 1_level_0,expected_samples,gmv,nrt,stays
service,samples,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
*,30_day,22334,3.0,2.9,3.1
*,45_day,33501,2.4,2.5,2.6
*,60_day,44668,2.2,2.1,2.2
dog-walking,30_day,2459,7.7,7.5,6.7
dog-walking,45_day,3688,6.1,5.9,5.1
dog-walking,60_day,4918,5.3,5.2,4.8
drop-in,30_day,4132,6.6,6.4,6.3
drop-in,45_day,6198,5.2,5.3,5.1
drop-in,60_day,8264,4.5,4.5,4.5
overnight,30_day,14797,3.7,3.6,3.5
