In [1]:
import sys
sys.path.append('..')

import copy
import os
from datetime import datetime
from typing import Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

from scripts import ab_test, perform_ab

DATA_DIR = os.path.join('..', 'data')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#Grab the February outcomes for employees enrolled at the baseline only. This is our confirmatory data

enrolled_baseline = pd.read_csv(os.path.join(DATA_DIR,'Feb_enrolled_at_baseline.csv'), dtype={'EMPLID': str})

  interactivity=interactivity, compiler=compiler, result=result)


### Prep for Bootstrapping

In [3]:
#create two functions to be applied to our dataframe

def calculate_annual_contribution(df: pd.DataFrame) -> pd.Series:
    """
    Calculate the total annualized contribution from the data frame.
    Some people contribute a flat amount per paycheck. Some people
    contribute a flat amount per paycheck, so we have to do some
    switching:
        * If percent contributed per paycheck is positive,
          return percent * salary
        * If the number of paychecks with a contribution is <= 26,
          we assume the person hit the maximum contribution before
          the end of the year, and so return the maximum contribution.
        * If the number of paychecks is more than 26, we're bleeding over
          the data and so return 26 * the flat rate people pay.
    """
        
    #Kevin's
    answer_pct = df['PERCENTAGE_0218'] / 100 * df['Annual Rt']
    answer_max = df['max_contribution']
    answer_flat = df['FLATAMOUNT_0218'] * 26
    
    answer = answer_pct
    done_filter = df['PERCENTAGE_0218'] > 0
    answer *= done_filter
    
    this_filter = (df['num_paychecks_0218'] == 0) & ~done_filter
    answer[this_filter] = 0
    done_filter |= this_filter
    
    this_filter = (df['num_paychecks_0218'] <= 26) & ~done_filter
    answer += answer_max * this_filter
    done_filter |= this_filter
    
    this_filter = ~done_filter
    answer += answer_flat * this_filter
    return answer

          
def calculate_savings_rt(df: pd.DataFrame) -> pd.Series:
    """
    Return the total savings rate based on salary *at the time
    we pulled the data*.
    """
    answer = (df['PERCENTAGE_0218'] / 100) * (df['PERCENTAGE_0218'] > 0)
    answer += (df['ANNUAL_FLAT_AMT'] / df['Annual Rt']) * (df['PERCENTAGE_0218'] <= 0) * 100
    return answer

In [4]:
enrolled_baseline['PERCENTAGE_0218'] = enrolled_baseline['PERCENTAGE_0218'].fillna(0)
# Apply the functions to our dataframe
enrolled_baseline['ANNUAL_FLAT_AMT'] = calculate_annual_contribution(enrolled_baseline)
enrolled_baseline['SAVINGS_RATE'] = calculate_savings_rt(enrolled_baseline)

In [5]:
def split_groups(df: pd.DataFrame, combine_arms = False) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the data into treatment and control groups
    
    If combine_arms = False, returns tx arms separately
    If combine_arms = True, returns combined arms
    
    Returns:
        Either a two tuple of data frames: control and treatment, or
        A three tuple of data frames: control, basic, simple_choice
    """
    control = df[df.treatment_real == 0]
    basic = df[df.treatment_real == 1]
    simple_choice = df[df.treatment_real == 2]
    if combine_arms == False:
        return control, basic, simple_choice
    elif combine_arms == True:
        treatment = pd.concat([basic, simple_choice])
        return control, treatment


In [6]:
#split data into treatment and control
control_2, treatment_2 = perform_ab.split_only2_groups(enrolled_baseline)

control_3, basic_3, simple_choice_3 =  perform_ab.split_groups(enrolled_baseline)

In [7]:
treatment_2.shape
control_2.shape
control_3.shape

(9228, 104)

(4627, 104)

(4627, 104)

# Test Time

##### 2 functions here: 

    - get_estimate
          - A: control 
          - B: treatment
          - draws: how many samples are drawn
          - percentile: what percentile we wanted to draw from
          
    - run_continuous_ab_no_distrib
          - draws: how many samples are drawn 
          - n_iterations: Big "S" in our equation. how many times we want to run the trials
          - percentile: what percentile we wanted to draw from
          - col_of_interest: wanted the ability to plug in either 'SAVINGS_RATE' or 'ANNUAL_FLAT_AMT' (didn't decide yet)
          - n_treat_groups: just put in how many groups there are, for the purposes of spitting out info
              2 - for control group versus either email
              3 - for basic email versus simplified choice email

In [8]:
def get_estimate(A: np.ndarray, B: np.ndarray,
                 draws: int, percentile: float,
                 n_iterations: int,
                 seed: Optional[int] = 1234
                 ) -> Tuple[np.ndarray, np.ndarray]:
    """
    Draw `draws` number of samples (with replacement) from `A` (control)
    and `B` (treatment) and return the `percentile` value from those draws.
    Does this `n_iterations` times and returns two arrays of size
    `n_iterations` with the answers.
    """
    
    random = np.random.RandomState(seed)
    a_ = random.choice(A, replace=True, size=(draws, n_iterations))
    b_ = random.choice(B, replace=True, size=(draws, n_iterations))

    # From the x samples (a_, b_) as drawn from our orignal set, get the percentile
    r_a = np.percentile(a_, percentile, axis=1)
    r_b = np.percentile(b_, percentile, axis=1)
    
    return r_a, r_b

In [9]:
def run_continuous_ab_no_distrib(draws: int, 
                                 n_iterations: int, 
                                 percentile: float,
                                 col_of_interest: str, 
                                 n_treat_groups: int) -> pd.DataFrame:
    
    #Determine what is considered treatment and control groups
    if n_treat_groups == 2:
        control = control_2
        treatment = treatment_2
        print('A: Control group, no email')
        print('B: Treatment group, either email')
        
    else:
        control = basic_3
        treatment = simple_choice_3
        print('A is the basic email')
        print('B is the simplified choice email')
        
    print()
    print('Length of A', len(control))
    print('Length of B', len(treatment))
    print()
    print('Number of Draws:', draws)
   
    
    ### ------------------------------------- DRAWS ------------------------------------------- ###    
    # initialize the vectors for which we will be dumping our data into
    a_array, b_array = get_estimate(A=control[col_of_interest].values, 
                                    B=treatment[col_of_interest].values, 
                                    draws=draws, 
                                    percentile=percentile,
                                    n_iterations=n_iterations)

    # Convert results to dataframe
    df = pd.DataFrame({'control': a_array, 'treatment': b_array})

    # Create a column of booleans for the whether b is greater than a 
    df['b>a'] = df.treatment > df.control
    df['a>b'] = df.control > df.treatment # just out of curiousity
    
    print()
    print('Probability that B > A:', np.mean(df['b>a']))
    print('Average Savings, A:', round(np.mean(df.control), 5))
    print('Average Savings, B:', round(np.mean(df.treatment), 5))
    
    return df

# Savings Rate

#### RQ5: Among employees continuing their enrollments in 457(b) accounts, how likely is it that the median savings is greater among those who receive a behaviorally-informed email than among those who do not?

In [10]:
RQ5_flat = run_continuous_ab_no_distrib(draws=len(control_2), 
                                        n_iterations=10000, 
                                        percentile=50, 
                                        col_of_interest='FLATAMOUNT_0218', 
                                        n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.0
Average Savings, A: 100.0
Average Savings, B: 100.0


In [11]:
RQ5_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=50, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.402204452129
Average Savings, A: 3.32845
Average Savings, B: 3.32014


In [12]:
RQ5_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=75, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.726820834234
Average Savings, A: 6.69272
Average Savings, B: 6.77638


In [13]:
RQ5_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=80, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.567970607305
Average Savings, A: 8.02572
Average Savings, B: 8.04934


In [17]:
RQ5_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=90, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.881996974281
Average Savings, A: 12.37536
Average Savings, B: 12.71483


In [18]:
RQ5_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=85, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.523449319213
Average Savings, A: 9.85528
Average Savings, B: 9.85961


In [19]:
RQ5_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=95, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4627
Length of B 9228

Number of Draws: 4627

Probability that B > A: 0.630646207046
Average Savings, A: 16.53077
Average Savings, B: 16.63191


#### RQ6: Among employees continuing their enrollments in 457(b) accounts, how likely is it that the median savings is greater among those who receive an email with a simplified choice rather than one without the simplification?

In [43]:
RQ6_flat = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=50, 
                                      col_of_interest='FLATAMOUNT_0218', 
                                      n_treat_groups = 3)

A is the basic email
B is the simplified choice email

Length of A 4603
Length of B 4629

Number of Draws: 4818

Probability that B > A: 0.0
Average Savings, A: 100.0
Average Savings, B: 100.0


In [44]:
RQ6_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=50, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 3)

A is the basic email
B is the simplified choice email

Length of A 4603
Length of B 4629

Number of Draws: 4818

Probability that B > A: 0.272312162723
Average Savings, A: 3.33458
Average Savings, B: 3.30449


#### RQ7: Among employees continuing their enrollments in 457(b) accounts, how likely is it that the 25th percentile of savings is greater among those who receive a behaviorally-informed email than among those who do not?

In [45]:
RQ7_flat = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=25, 
                                      col_of_interest='FLATAMOUNT_0218', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4818
Length of B 9232

Number of Draws: 4818

Probability that B > A: 0.0267745952677
Average Savings, A: 49.90118
Average Savings, B: 49.97014


In [46]:
RQ7_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=25, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 2)

A: Control group, no email
B: Treatment group, either email

Length of A 4818
Length of B 9232

Number of Draws: 4818

Probability that B > A: 0.682855956829
Average Savings, A: 1.59226
Average Savings, B: 1.61003


#### RQ8: Among employees continuing their enrollments in 457(b) accounts, how likely is it that the 25th percentile of savings is greater among those who receive an email with simplified choice rather than one without the simplification?

In [47]:
RQ8_flat = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=25, 
                                      col_of_interest='FLATAMOUNT_0218', 
                                      n_treat_groups = 3)

A is the basic email
B is the simplified choice email

Length of A 4603
Length of B 4629

Number of Draws: 4818

Probability that B > A: 0.00249066002491
Average Savings, A: 49.99108
Average Savings, B: 49.93982


In [48]:
RQ8_rate = run_continuous_ab_no_distrib(draws=len(control_2), 
                                      n_iterations=10000, 
                                      percentile=25, 
                                      col_of_interest='SAVINGS_RATE', 
                                      n_treat_groups = 3)

A is the basic email
B is the simplified choice email

Length of A 4603
Length of B 4629

Number of Draws: 4818

Probability that B > A: 0.579078455791
Average Savings, A: 1.60584
Average Savings, B: 1.61425
