In [9]:
import sys
sys.path.append('../')
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from roverdata.db import DataWarehouse
from taxes_increase import utils

## Get LTV Data

In [10]:
# Load the LTV data from the CSV that sits one directory above this notebook.
LTV = pd.read_csv('../LTV.csv')

#### 180 Day LTV Data

In [11]:
# Keep only the rows whose `dates` value is 180.
LTV180 = LTV.loc[LTV['dates'].eq(180)]

#### 180 Day LTV Data for New Customers

In [12]:
# Build the mask from LTV180 itself (not the parent LTV frame) so the boolean
# indexer and the frame being filtered share the same index.
new = LTV180.loc[LTV180['new_repeat'] == 'new']

#### 180 Day LTV Data for New Account Customers

In [13]:
# Build the mask from LTV180 itself (not the parent LTV frame) so the boolean
# indexer and the frame being filtered share the same index.
new_account = LTV180.loc[LTV180['new_account'] == 'new']

In [14]:
# Preview the first rows of the new-customer subset.
new.head()

Unnamed: 0,requester_id,service,new_repeat,new_account,first_observed_stay_added,dates,stays,nrt,gmv
1,7312458,dog-walking,new,old,2018-08-25 13:47:58.000000,180,25,355.05,1315.0
14,7330304,*,new,old,2018-08-27 23:06:30.000000,180,16,234.9,870.0
119,7400397,dog-walking,new,new,2018-09-01 13:46:51.000000,180,1,18.9,70.0
120,7347851,*,new,new,2018-08-27 02:07:27.000000,180,22,513.27,1901.0
131,7377501,*,new,new,2018-08-29 21:47:50.000000,180,1,12.42,46.0


## Simulate 3 Bucket Experiment

In [25]:
def zeros(df, diff, random_state=None):
    """
    Zero-out spending from randomly chosen customers until the total has
    dropped by approximately the inputted difference.

    The number of customers to zero is estimated as
    ``total * diff / mean(non-zero values)`` and truncated to an integer, so
    the realised drop only approximates ``diff``.

    Args:
        df (pandas.Series): simulated "variation" values for one metric
            (the simulation passes a single column of the variation frame)
        diff (float): fraction to drop the variation by (e.g. 0.02 for 2%)
        random_state (int, optional): seed forwarded to ``Series.sample`` so a
            run can be reproduced; ``None`` keeps the draw unseeded

    Returns:
        pandas.Series: a copy of the input with the sampled customers set to
            0; the input itself is left untouched
    """
    # Work on a copy: the input is a slice of the caller's data and must not
    # be modified as a side effect.
    result = df.copy()
    spenders = result[result != 0]
    if spenders.empty:
        # Nothing to zero out; also avoids int(NaN) from the mean of an empty set.
        return result

    removal_estimate = result.sum() * diff / spenders.mean()
    # Never ask for more rows than there are non-zero customers.
    n_removed = min(int(removal_estimate), len(spenders))
    chosen = spenders.sample(n=n_removed, random_state=random_state)

    # `.loc` is the label-based accessor that accepts many labels at once;
    # `.at` is only defined for a single scalar label.
    result.loc[chosen.index] = 0
    return result

In [16]:
def flat(df, diff):
    """
    Scale every customer's spend down uniformly by the fraction `diff`.

    Args:
        df (pandas.DataFrame): contains simulated "variation" data
        diff (float): fraction to drop the variation by (e.g. 0.02 for 2%)

    Returns:
        pandas.DataFrame: the variation data multiplied by (1 - diff)
    """
    retained_share = 1 - diff
    return df * retained_share

In [21]:
def simulate_mann_whitney(df, metric, split, diff, drop_method, simulations, alpha=0.05):
    """
    given historical data, metric, and split proportions,
    estimate the power of a one-sided mann-whitney u-test by simulation

    For every service the data is re-split `simulations` times. In each split,
    bucket 1 is the control and every other bucket is a variation whose metric
    is reduced with `drop_method` before being tested against the control
    (alternative: control is greater). Power is the share of those tests with
    a p-value at or below `alpha`.

    Args:
        df (pandas.DataFrame): historical data, should have column `service` and metric (see below)
        metric (string): metric to be used to mann-whitney test, should be a column in df
        split (list): list of split proportions, one entry per bucket
        diff (float): difference to drop the variation by
        drop_method (function): either `zeros` or `flat`,
            describes the strategy by which the variation will be reduced by diff%
        simulations (int): number of random splits to simulate per service
        alpha (float): p-value threshold at or below which a test counts as
            significant; defaults to 0.05

    Returns:
        (dictionary): for each service, the estimated power of the test at the
            given alpha. Every service in `df` gets an entry, including those
            with no significant result (power 0).
    """
    n_tests = simulations * (len(split) - 1)
    significant_values = defaultdict(int)
    for service in df.service.unique():
        df_service = df.loc[df['service'] == service]
        rejections = 0
        for _ in range(simulations):
            df_dict = utils.split_data(df_service, split)
            # The control bucket is the same for every variation in this split.
            control = df_dict[1][metric]
            for bucket in range(2, len(split) + 1):
                treated = drop_method(df_dict[bucket][metric], diff)
                _, pvalue = mannwhitneyu(control, treated, alternative='greater')
                if pvalue <= alpha:
                    rejections += 1
        # Count first and divide once: avoids float drift from repeated
        # addition, and always writes an entry so a service that never
        # rejects shows 0 rather than being absent from the result.
        significant_values[service] = rejections / n_tests if n_tests else 0
    return significant_values

## Estimate Power

In [22]:
# Columns of the LTV data to run the power simulations on.
metrics = ['gmv', 'nrt']

#### New Customers, Power for 2% Drop Flat Rate

In [23]:
# New customers, 2% flat drop: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.02, drop_method=flat, simulations=1000)
    for metric in metrics
]
new_df = pd.DataFrame(power_data, index=metrics)
new_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.073,0.1095,0.0795,0.0905
nrt,0.074,0.115,0.066,0.088


#### New Customers, Power for 2% Drop Zeros

In [26]:
# New customers, 2% drop via zeroed-out customers: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.02, drop_method=zeros, simulations=1000)
    for metric in metrics
]
new_df = pd.DataFrame(power_data, index=metrics)
new_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.1535,0.4405,0.1605,0.299
nrt,0.182,0.461,0.1755,0.2865


#### New Customers, Power for 5% Drop Flat Rate

In [27]:
# New customers, 5% flat drop: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.05, drop_method=flat, simulations=1000)
    for metric in metrics
]
new_df = pd.DataFrame(power_data, index=metrics)
new_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.123,0.237,0.118,0.1525
nrt,0.1225,0.242,0.111,0.151


#### New Customers, Power for 5% Drop Zeros

In [28]:
# New customers, 5% drop via zeroed-out customers: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.05, drop_method=zeros, simulations=1000)
    for metric in metrics
]
new_df = pd.DataFrame(power_data, index=metrics)
new_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.5185,0.988,0.5475,0.8755
nrt,0.51,0.982,0.563,0.8885


#### New Accounts, Power for 2% Drop Flat Rate

In [29]:
# New accounts, 2% flat drop: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new_account, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.02, drop_method=flat, simulations=1000)
    for metric in metrics
]
new_account_df = pd.DataFrame(power_data, index=metrics)
new_account_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.066,0.08,0.073,0.081
nrt,0.0695,0.0815,0.0605,0.0745


#### New Accounts, Power for 2% Drop Zeros

In [30]:
# New accounts, 2% drop via zeroed-out customers: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new_account, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.02, drop_method=zeros, simulations=1000)
    for metric in metrics
]
new_account_df = pd.DataFrame(power_data, index=metrics)
new_account_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.136,0.3105,0.149,0.2105
nrt,0.1145,0.314,0.138,0.202


#### New Accounts, Power for 5% Drop Flat Rate

In [33]:
# New accounts, 5% flat drop: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new_account, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.05, drop_method=flat, simulations=1000)
    for metric in metrics
]
new_account_df = pd.DataFrame(power_data, index=metrics)
new_account_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.0925,0.158,0.1035,0.1295
nrt,0.0805,0.169,0.1035,0.125


#### New Accounts, Power for 5% Drop Zeros

In [34]:
# New accounts, 5% drop via zeroed-out customers: three equal buckets, 1000 simulated splits per service.
power_data = [
    simulate_mann_whitney(df=new_account, metric=metric, split=[1/3, 1/3, 1/3],
                          diff=0.05, drop_method=zeros, simulations=1000)
    for metric in metrics
]
new_account_df = pd.DataFrame(power_data, index=metrics)
new_account_df

Unnamed: 0,dog-walking,*,drop-in,overnight
gmv,0.334,0.897,0.3915,0.676
nrt,0.3615,0.904,0.398,0.683
