In [1]:
import sys
sys.path.append('..')
import os.path
import copy
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

DATA_DIR = os.path.join('..', 'data')

In [2]:
# Grab the February outcomes. This is our confirmatory data
confirmatory = pd.read_csv(os.path.join(DATA_DIR, 'confirmatory_outcomes_public.csv'))
exploratory = pd.read_csv(os.path.join(DATA_DIR, 'exploratory_outcomes_public.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#Take a look at what February enrollment data looks like
pd.crosstab(confirmatory.treatment_real, confirmatory.FEB18THENROLLMENT, dropna=False, margins=True, margins_name='Total')

FEB18THENROLLMENT,ElectedFeb18th,Not Elected,Total
treatment_real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4627,5893,10520
1.0,4600,5861,10461
2.0,4627,5854,10481
Total,13854,17608,31462


In [4]:
#Take a look at what Mar enrollment data looks like
pd.crosstab(exploratory.treatment_real, exploratory.MAR04THENROLLMENT, dropna=False, margins=True, margins_name='Total')

MAR04THENROLLMENT,ElectedMar04th,Not Elected,Total
treatment_real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4610,5876,10486
1.0,4584,5850,10434
2.0,4611,5829,10440
Total,13805,17555,31360


# Data Preparation

### Generate outcome variables for whether or not employee enrolled

In [5]:
def generateOutcomes(df: pd.DataFrame,
                     outcome_bool: str, 
                     enroll_col: str,
                     indicator: str):
    
    '''
    1. Takes in a df, expecting either 'confirmatory' or 'exploratory'
    2. Create a new column named whatever you assign to 'outcome_bool' to store our outcome variable (True if Elected on Date)
    3. Give the column name that gives you enrollment information. Try these: ['FEB18THENROLLMENT', 'MAR04THENROLLMENT']
    4. Give the indicator that is listed. Try these ['ElectedFeb18th', 'ElectedMar04th']
    '''
    #Generate booleans
    #whether they were enrolled at baseline
    df['enrolled_0122'] = df.JAN22NDENROLLMENT == 'ElectedJan22nd'
    
    #whether they were enrolled in treatment month
    df[outcome_bool] = df[enroll_col] == indicator
    
    #whether they were enrolled in Jan but NOT enrolled in treatment month
    df['unenrolled'] = (df['enrolled_0122']) & (df[outcome_bool] == False)
    
    #whether they were not enrolled in Jan but enrolled in treatment month
    df['newly_enrolled'] = (df['enrolled_0122'] == False) & (df[outcome_bool])

In [6]:
# Apply to Feb outcomes
generateOutcomes(df= confirmatory,
                 outcome_bool = 'enrolled_0218', 
                 enroll_col = 'FEB18THENROLLMENT',
                 indicator='ElectedFeb18th')

#Apply to Mar outcomes
generateOutcomes(df = exploratory,
                 outcome_bool = 'enrolled_0304', 
                 enroll_col = 'MAR04THENROLLMENT',
                 indicator='ElectedMar04th')

## Checks
print("There should be no participants classified as both unenrolled and newly enrolled: ")
pd.crosstab(confirmatory.unenrolled, confirmatory.newly_enrolled)
print("This is confirmed (only 1 participant unenrolled and no one newly enrolled)")


print("-----------------------------------------------------------------------------------------")

print("There should be no participants classified as both unenrolled and newly enrolled in the later month: ")
pd.crosstab(exploratory.unenrolled, exploratory.newly_enrolled)
print("This is confirmed (only 1 participant unenrolled and no one newly enrolled)")

There should be no participants classified as both unenrolled and newly enrolled: 
This is confirmed (only 1 participant unenrolled and no one newly enrolled)
-----------------------------------------------------------------------------------------
There should be no participants classified as both unenrolled and newly enrolled in the later month: 
This is confirmed (only 1 participant unenrolled and no one newly enrolled)


### Split datasets into enrolled and not enrolled at the baseline. 

In [7]:
feb_enrolled_baseline = confirmatory.loc[confirmatory.enrolled_0122].copy().reset_index(drop = True)
feb_not_enrolled_baseline = confirmatory.loc[~confirmatory.enrolled_0122].copy().reset_index(drop = True)
mar_enrolled_baseline = exploratory.loc[exploratory.enrolled_0122].copy().reset_index(drop = True)
mar_not_enrolled_baseline = exploratory.loc[~exploratory.enrolled_0122].copy().reset_index(drop = True)

In [8]:
print('Jan Percent of Salary')
print('How many people do pre tax contributions?', sum(feb_enrolled_baseline.PERCENTAGEBEFORETAX_0122 > 0))
print('How many people do post tax contributions?', sum(feb_enrolled_baseline.PERCENTAGEAFTERTAX_0122 > 0))
print()

print('Feb Percent of Salary')
print('How many people do pre tax contributions?', sum(feb_enrolled_baseline.PERCENTAGEBEFORETAX_0218 > 0))
print('How many people do post tax contributions?', sum(feb_enrolled_baseline.PERCENTAGEAFTERTAX_0218 > 0))

print('\n-----------------------------------------------\n')

print('Jan Flat Amount')
print('How many people do pre tax contributions?', sum(feb_enrolled_baseline.FLATAMOUNTBEFORETAX_0122 > 0))
print('How many people do post tax contributions?', sum(feb_enrolled_baseline.FLATAMOUNTAFTERTAX_0122 > 0))

print()
print('Feb Flat Amount')
print('How many people do pre tax contributions?', sum(feb_enrolled_baseline.FLATAMOUNTBEFORETAX_0218 > 0))
print('How many people do post tax contributions?', sum(feb_enrolled_baseline.FLATAMOUNTAFTERTAX_0218 > 0))

Jan Percent of Salary
How many people do pre tax contributions? 55
How many people do post tax contributions? 9

Feb Percent of Salary
How many people do pre tax contributions? 55
How many people do post tax contributions? 9

-----------------------------------------------

Jan Flat Amount
How many people do pre tax contributions? 12633
How many people do post tax contributions? 963

Feb Flat Amount
How many people do pre tax contributions? 12625
How many people do post tax contributions? 970


### Clean



We clean up data for those who contribute $18,000+ or 100% in a paycheck for 2 reasons:

    1) No one in DC government actually earns that much in a pay period
    2) We assume that this is a human error; that when employees choose to contribute that much that they mean that they want to contribute that amount annually, not per paycheck. It is unclear how this is actually treated in PeopleSoft.

In [9]:
#Calculate the maximum allowable contributions
def calculateMaxContribution(df: pd.DataFrame):
    df['max_contribution'] = np.where(df['age_group'] != '50+ yrs', 18500, 24500)

calculateMaxContribution(feb_enrolled_baseline)
calculateMaxContribution(mar_enrolled_baseline)

In [10]:
## flags for people who's contributions are above a certain amount
def flagContributions(df: pd.DataFrame,
                      amount_threshold: int,
                      percentage_threshold: int,
                      baseline_date: str,
                      outcome_date = str):
    
    ## baselines
    df["assume_amounterror_" + baseline_date] = \
        np.where(df['FLATAMOUNTBEFORETAX_' + baseline_date] >= amount_threshold, 1, 0)
    df["assume_percerror_" + baseline_date] = \
        np.where(df['PERCENTAGEBEFORETAX_' + baseline_date] >= percentage_threshold, 1, 0)
                                   
    ## outcomes
    df["assume_amounterror_" + outcome_date] = \
        np.where(df['FLATAMOUNTBEFORETAX_' + outcome_date] >= amount_threshold, 1, 0)
    df["assume_percerror_" + outcome_date] = \
        np.where(df['PERCENTAGEBEFORETAX_' + outcome_date] >= percentage_threshold, 1, 0)
    

In [11]:
flagContributions(mar_enrolled_baseline,
                  amount_threshold = 18000,
                  percentage_threshold = 100,
                  baseline_date = "0122",
                  outcome_date = '0304')

flagContributions(feb_enrolled_baseline,
                  amount_threshold = 18000,
                  percentage_threshold = 100,
                  baseline_date = "0122",
                  outcome_date = '0218')

In [12]:
'''
There were a few people who contributed over $18000 per paycheck. Making the assumption here that 
he/she meant to contribute $18,000 in a year, not in a paycheck. People only do this for the flat 
amounts BEFORE tax, not AFTER. 

For flat tax amounts, if the amount contributed is greater than or equal to $18000, 
take that amount and divide by 26 paychecks, bc I assume that they want that total amount contributed.

Adding a 'b' suffix, because we want to drop them later as sensitivity analysis
'''

def fixContributions(df: pd.DataFrame,
                     baseline_date: str,
                     outcome_date: str):
    
    
    ## Fix flat amounts based on either type of error 
    df['FLATAMOUNTBEFORETAX_' + baseline_date + "b"] = \
        np.where(df["assume_amounterror_" + baseline_date] == 1, 
        round(df['FLATAMOUNTBEFORETAX_' + baseline_date]/26,2), 
        np.where(df["assume_percerror_" + baseline_date] == 1, 
        round(df.max_contribution/26,2),
        df['FLATAMOUNTBEFORETAX_' + baseline_date]))

    df['FLATAMOUNTBEFORETAX_' + outcome_date +'b'] =  \
        np.where(df["assume_amounterror_" + outcome_date] == 1, 
        round(df['FLATAMOUNTBEFORETAX_' + outcome_date]/26,2), 
        np.where(df["assume_percerror_" + outcome_date] == 1, 
        round(df.max_contribution/26,2),
        df['FLATAMOUNTBEFORETAX_' + outcome_date]))
    
    ## Fix percentages for those who are coded as percentage error
    ## code them to 0 percent contribution (after fixing their flat amount)
    df['PERCENTAGEBEFORETAX_' + baseline_date + "b"] = \
        np.where(df["assume_percerror_" + baseline_date] == 1, 
        0,
        df['PERCENTAGEBEFORETAX_' + baseline_date])

    df['PERCENTAGEBEFORETAX_' + outcome_date +'b']= \
        np.where(df["assume_percerror_" + outcome_date] == 1, 
        0,
        df['PERCENTAGEBEFORETAX_' + outcome_date])
    
    #fill in missing values with 0 for after tax. Only in non-Jan data
    df['FLATAMOUNTAFTERTAX_' + outcome_date] = df['FLATAMOUNTAFTERTAX_' + outcome_date].fillna(0)
    df['FLATAMOUNTBEFORETAX_' + outcome_date] = df['FLATAMOUNTBEFORETAX_' + outcome_date].fillna(0)

In [13]:
# Apply to Feb and March

fixContributions(mar_enrolled_baseline,
                 baseline_date = "0122",
                 outcome_date = '0304')

fixContributions(feb_enrolled_baseline,
                 baseline_date = "0122",
                 outcome_date = '0218')

### Did anyone increase contribution amounts?

In [14]:
def calculateContributions(df: pd.DataFrame,
                           date: str):
    '''
    Set up for whether employees increased their salaries.
    We are ignoring the difference between before tax contribution and after 
    tax contributions for percentage contributions because we would have to 
    make assumptions about the type of filers and exemptions that the employee 
    is claiming, which we do not have any information about.
    '''
    
    #Percentage of Salary
    df['PERCENTAGE_0122'] = df['PERCENTAGEBEFORETAX_0122b'] + df['PERCENTAGEAFTERTAX_0122']
    df['PERCENTAGE_' + date] = df['PERCENTAGEBEFORETAX_' + date + 'b'] + df['PERCENTAGEAFTERTAX_' + date]
    #True if Feb/Mar > Jan
    df['increased_pct_amt'] = df['PERCENTAGE_' + date] > df['PERCENTAGE_0122']
    
    
    #Do the same for Flat amounts
    df['FLATAMOUNT_0122'] = df['FLATAMOUNTBEFORETAX_0122b'] + df['FLATAMOUNTAFTERTAX_0122']
    df['FLATAMOUNT_' + date] = df['FLATAMOUNTBEFORETAX_' + date +'b'] + df['FLATAMOUNTAFTERTAX_' + date]
    df['FLATAMOUNT_' + date] = df['FLATAMOUNT_' + date].fillna(0)


    #Calculate the number of paychecks
    #If the number of paychecks is less than 26, then that means that the employee would hit their contribution early 
    #Which means that even if they increased their contribution, they would still get the same amount
    df['num_paychecks_0122'] = \
        np.where(df['FLATAMOUNT_0122'] > 0, 
        np.round((df['max_contribution']/df['FLATAMOUNT_0122']),2),
        0)
    
    df['num_paychecks_' + date] = \
        np.where(df['FLATAMOUNT_' + date] > 0, 
        np.round((df['max_contribution']/df['FLATAMOUNT_' + date]),2),
        0)

In [15]:
def increased(df: pd.DataFrame, 
              date: str):
        
    '''
    There are employees who choose to contribute more of their salary early on. As an interest earning-thing

    Rule of 26:

    - If the number of paychecks is less than 26, then that means that the employee would hit their contribution early 
    - Which means that even if they increased their contribution, they would still get the same amount
    '''
    
    #if they are the same, False
    #this takes care of the 0s as well
    if (df['FLATAMOUNT_0122'] == df['FLATAMOUNT_' + date]):
        return False
    #if Feb is less than Jan, False
    elif df['FLATAMOUNT_' + date] < df['FLATAMOUNT_0122']:
        return False
    #if Feb contribution is greater than Jan
    #then if the flat amount is 0 but the number of paychecks is less than 26, then it's still true, 
    #bc they increased their contribution from nothing
    #However, if both Jan and Feb num of paychecks are less than 26, that means that they hit their contribution early
    elif df['FLATAMOUNT_' + date] > df['FLATAMOUNT_0122']:
        if ((df['FLATAMOUNT_0122'] == 0) & (df['num_paychecks_' + date] < 26)):
            return True
        elif ((df['num_paychecks_0122'] < 26) & (df['num_paychecks_' + date] < 26)):
            return False
        elif ((df['num_paychecks_0122'] == 26) & (df['num_paychecks_' + date] < 26)):
            return False
        else:
            return True

In [16]:
#Not the most necessary; but I wanted to make sure I ended up with the same same numbers. 

def calculateAnnualContribution(df: pd.DataFrame, 
                                  date: str) -> pd.Series:
    """
    Calculate the total annualized contribution from the data frame.
    Some people contribute a flat amount per paycheck. Some people
    contribute a flat amount per paycheck, so we have to do some
    switching:
        * If percent contributed per paycheck is positive,
          return percent * salary
        * If the number of paychecks with a contribution is <= 26,
          we assume the person hit the maximum contribution before
          the end of the year, and so return the maximum contribution.
        * If the number of paychecks is more than 26, we're bleeding over
          the data and so return 26 * the flat rate people pay.
    """
    
    # Note: The "Annual Rt" is removed from the dataset, because The Lab decided that while it's not 
    # technically PII, we want to ensure we're doing all we can to protect the privacy of District employees. 
    
    # The only thing we use "Annual Rt" for is to calculate the annual 457b contributions. 
    # These are included in the output files that result from running this script (last cell). 
    
    answer_pct = df['PERCENTAGE_' + date] / 100 * df['Annual Rt']
    answer_max = df['max_contribution']
    answer_flat = df['FLATAMOUNT_' + date] * 26
    
    answer = answer_pct
    done_filter = df['PERCENTAGE_' + date] > 0
    answer *= done_filter
    
    this_filter = (df['num_paychecks_' + date] == 0) & ~done_filter
    answer[this_filter] = 0
    done_filter |= this_filter
    
    this_filter = (df['num_paychecks_' + date] <= 26) & ~done_filter
    answer += answer_max * this_filter
    done_filter |= this_filter
    
    this_filter = ~done_filter
    answer += answer_flat * this_filter
    return answer

In [17]:
#Apply the function to our dataframes
calculateContributions(feb_enrolled_baseline, '0218')
calculateContributions(mar_enrolled_baseline, '0304')

#Apply the increased functions to our dataframe
feb_enrolled_baseline['increased_flat_amt'] = \
    feb_enrolled_baseline.apply(lambda x: increased(df = x, date = '0218'), axis = 1) 
mar_enrolled_baseline['increased_flat_amt'] = \
    mar_enrolled_baseline.apply(lambda x: increased(df = x, date = '0304'), axis = 1) 


#For some reason, this person shows up as true, even though contributed amounts are equal
for df in [feb_enrolled_baseline, mar_enrolled_baseline]:
    df.loc[(df['hashed_id']=='f72831d94107bd584bec348b50db4b6f0d365436fc285e03c52e32c4f9d7a4db'),
           'increased_flat_amt'] = False

In [18]:
#Apply to Feb
feb_enrolled_baseline['ANNUAL_FLAT_AMT_0122'] = calculateAnnualContribution(feb_enrolled_baseline, '0122')
feb_enrolled_baseline['ANNUAL_FLAT_AMT_0218'] = calculateAnnualContribution(feb_enrolled_baseline, '0218')

#Apply annual amount contributions to 
mar_enrolled_baseline['ANNUAL_FLAT_AMT_0122'] = calculateAnnualContribution(mar_enrolled_baseline, '0122')
mar_enrolled_baseline['ANNUAL_FLAT_AMT_0304'] = calculateAnnualContribution(mar_enrolled_baseline, '0304')


feb_enrolled_baseline['increased_annual'] = \
    feb_enrolled_baseline['ANNUAL_FLAT_AMT_0218'] > feb_enrolled_baseline['ANNUAL_FLAT_AMT_0122']

mar_enrolled_baseline['increased_annual'] = \
    mar_enrolled_baseline['ANNUAL_FLAT_AMT_0304'] > mar_enrolled_baseline['ANNUAL_FLAT_AMT_0122']

#For some reason, this person shows up as true, even though contributed amounts are equal
for df in [feb_enrolled_baseline, mar_enrolled_baseline]:
    df.loc[(df['hashed_id']=='f72831d94107bd584bec348b50db4b6f0d365436fc285e03c52e32c4f9d7a4db'), 
           'increased_annual'] = False

In [19]:
pd.crosstab(feb_enrolled_baseline.treatment_real, feb_enrolled_baseline.increased_flat_amt)

increased_flat_amt,False,True
treatment_real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,4488,139
1.0,4413,188
2.0,4449,178


In [20]:
pd.crosstab(mar_enrolled_baseline.treatment_real, mar_enrolled_baseline.increased_annual)

increased_annual,False,True
treatment_real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,4425,185
1.0,4349,236
2.0,4377,234


In [21]:
print('How many people increased percent contributions? Feb:', sum(feb_enrolled_baseline.increased_pct_amt))
print('How many people increased percent contributions? Mar:', sum(mar_enrolled_baseline.increased_pct_amt))

How many people increased percent contributions? Feb: 0
How many people increased percent contributions? Mar: 0


### Checks for accuracy

In [22]:
#Data quality checks
feb_enrolled_baseline['same'] = \
    feb_enrolled_baseline['FLATAMOUNT_0122'] == feb_enrolled_baseline['FLATAMOUNT_0218']
feb_enrolled_baseline['increased'] = \
    feb_enrolled_baseline['FLATAMOUNT_0122'] < feb_enrolled_baseline['FLATAMOUNT_0218']
feb_enrolled_baseline['decreased'] = \
    feb_enrolled_baseline['FLATAMOUNT_0122'] > feb_enrolled_baseline['FLATAMOUNT_0218']

#To see if we did this correctly, create two new dfs
#One where we increased the flat amount, and the other where we did not
check_increase = feb_enrolled_baseline[feb_enrolled_baseline.increased_flat_amt]
check_false = feb_enrolled_baseline[feb_enrolled_baseline.increased_flat_amt == False]

print('For increase_flat_amt == True')
print('Does the number of increases equal the number of True values in the increased_flat_amt column?', 
      sum(check_increase['increased']) == sum(feb_enrolled_baseline.increased_flat_amt))

print('Does the number of sames equal 0?', sum(check_increase['same']) == 0)

print('Does the number of decreases equal 0?', sum(check_increase['decreased']) == 0)

print()
print('For increased_flat_amt == False')
print('Does the number of Falses equal the number of sames or decreases?', 
      sum((check_false['same']) | (check_false['decreased'])) == len(check_false))
print("What's the difference?",sum((check_false['same']) | (check_false['decreased'])) - len(check_false))

print("Why? See below:")

check_false[~((check_false.same)|(check_false.decreased))][['FLATAMOUNT_0122','FLATAMOUNT_0218']]


For increase_flat_amt == True
Does the number of increases equal the number of True values in the increased_flat_amt column? True
Does the number of sames equal 0? True
Does the number of decreases equal 0? True

For increased_flat_amt == False
Does the number of Falses equal the number of sames or decreases? False
What's the difference? -11
Why? See below:


Unnamed: 0,FLATAMOUNT_0122,FLATAMOUNT_0218
1609,1000.0,1500.0
2007,692.31,692.31
2910,5000.0,6000.0
4252,1423.0,1779.0
4874,950.0,1000.0
5271,1000.0,1100.0
6633,950.0,1020.0
9176,900.0,1000.0
9957,1150.0,1225.0
13178,900.0,1000.0


In [23]:
#Everything looks good, so drop them from our df
feb_enrolled_baseline.drop(columns=['same', 'increased', 'decreased'], axis = 1, inplace=True)

In [24]:
# Booleans if employee increased dollars or percentage of income
feb_enrolled_baseline['increased'] = feb_enrolled_baseline.increased_flat_amt | feb_enrolled_baseline.increased_pct_amt
mar_enrolled_baseline['increased'] = mar_enrolled_baseline.increased_flat_amt | mar_enrolled_baseline.increased_pct_amt

### Consider looking at those who do not withhold anything as "unenrolled"

The goal was to change behavior. We want to see if participants did actually change behavior. 

In [25]:
#Do we want to consider those who do not withold anything as "unenrolled"?

#For alt_enrolled_0107 & alt_enrolled_0218, if all of the contribution types, including flat amounts before/after tax
#and percentage bfore/after tax, are 0, then they are considered not enrolled, otherwise, they follow the same enrollment as
#detailed in enrolled_0107. 

def alternateEnrollment(df: pd.DataFrame, 
                        date: str):
    df['alt_enrolled_0122'] = np.where((df['FLATAMOUNTBEFORETAX_0122'] == 0) & \
                                          (df['FLATAMOUNTAFTERTAX_0122'] == 0) & \
                                          (df['PERCENTAGEBEFORETAX_0122'] == 0) & 
                                          (df['PERCENTAGEAFTERTAX_0122'] == 0), 
                                         False,  
                                         df['enrolled_0122'])

    df['alt_enrolled_' + date] = np.where((df['FLATAMOUNTBEFORETAX_' + date] == 0) & \
                                          (df['FLATAMOUNTAFTERTAX_' + date] == 0) & \
                                          (df['PERCENTAGEBEFORETAX_' + date] == 0) & 
                                          (df['PERCENTAGEAFTERTAX_' + date] == 0), 
                                         False,  
                                         df['enrolled_' + date])

#Apply to data
alternateEnrollment(df = feb_enrolled_baseline, date = '0218')
alternateEnrollment(df = mar_enrolled_baseline, date = '0304')
    

print('Feb confirmatory')
print('If we consider people who did not withold anything to be "unenrolled":\n',
      len(feb_enrolled_baseline[(feb_enrolled_baseline.enrolled_0122) & \
                                (feb_enrolled_baseline.alt_enrolled_0122 == False)]), 
      'more employees would be considered "unenrolled" in January, and \n',
      len(feb_enrolled_baseline[(feb_enrolled_baseline.enrolled_0218) & \
                                (feb_enrolled_baseline.alt_enrolled_0218 == False)]), 
      'more employees would be considered "unenrolled" in February')

print()

print('March exploratory')
print('If we consider people who did not withold anything to be "unenrolled":\n',
      len(mar_enrolled_baseline[(mar_enrolled_baseline.enrolled_0122) & \
                                (mar_enrolled_baseline.alt_enrolled_0122 == False)]), 
      'more employees would be considered "unenrolled" in January, and \n',
      len(mar_enrolled_baseline[(mar_enrolled_baseline.enrolled_0304) & \
                                (mar_enrolled_baseline.alt_enrolled_0304 == False)]), 
      'more employees would be considered "unenrolled" in March')

Feb confirmatory
If we consider people who did not withold anything to be "unenrolled":
 688 more employees would be considered "unenrolled" in January, and 
 690 more employees would be considered "unenrolled" in February

March exploratory
If we consider people who did not withold anything to be "unenrolled":
 686 more employees would be considered "unenrolled" in January, and 
 693 more employees would be considered "unenrolled" in March


## Write files to csvs

In [26]:
# What are the column names originally in the file?

dfs = [feb_enrolled_baseline, feb_not_enrolled_baseline, 
       mar_enrolled_baseline, mar_not_enrolled_baseline, 
       confirmatory, exploratory]    

for df in dfs: 
    print(df.columns)

Index(['hashed_id', 'Agency Code', 'Agency Name', 'Grade', 'Position Title',
       'Annual Rt', 'State', 'Enrollment', 'Eligible', 'Enroll',
       'Contribution', 'Field 2', 'Account', 'Field 1', 'Field 3', 'block',
       'Click0118_1', 'Click0118_2', 'Click0123_1', 'Click0123_2',
       'Click0123_3', 'Click0212_1', 'Click0212_2', 'Click0212_3',
       'Click0212_4', 'Click0212_5', 'Click0301_1', 'Click0301_2',
       'Click0301_3', 'Click0301_4', 'Click0301_5', 'Clicks0118', 'Clicks0123',
       'Clicks0212', 'Clicks0301', 'Delivery Status_0118',
       'Delivery Status_0123', 'Delivery Status_0212', 'Delivery Status_0301',
       'Failure Message_0118', 'Failure Message_0123', 'Failure Message_0212',
       'Failure Message_0301', 'Opens0118', 'Opens0123', 'Opens0212',
       'Opens0301', 'in_0123', 'in_0212', 'in_0301', 'treatment_real',
       'Click_ess', 'Open_ess', 'JAN07THENROLLMENT',
       'FLATAMOUNTBEFORETAX_0107', 'FLATAMOUNTAFTERTAX_0107',
       'PERCENTAGEBEFORETAX_

In [27]:
# First, remove columns that could potentially be senstive
cols_to_drop = ['Grade', 'Position Title', 'Annual Rt']

for df in dfs:
    if df.columns.isin(cols_to_drop).any():
        df.drop(columns = cols_to_drop, inplace = True)

In [28]:
# For an extra layer of security, we round the calculated flat amounts to the nearest $25

feb_to_edit = ['0122', '0218']
mar_to_edit = ['0122', '0304']

for date in feb_to_edit: 
    feb_enrolled_baseline['ANNUAL_FLAT_AMT_' + date] = \
        np.where(feb_enrolled_baseline['PERCENTAGE_' + date] > 0, 
                25 * round((feb_enrolled_baseline['ANNUAL_FLAT_AMT_' + date])/25), 
                 feb_enrolled_baseline['ANNUAL_FLAT_AMT_' + date])

for date in mar_to_edit: 
    mar_enrolled_baseline['ANNUAL_FLAT_AMT_' + date] = \
        np.where(mar_enrolled_baseline['PERCENTAGE_' + date] > 0, 
                25 * round((mar_enrolled_baseline['ANNUAL_FLAT_AMT_' + date])/25), 
                mar_enrolled_baseline['ANNUAL_FLAT_AMT_' + date])

In [29]:
feb_enrolled_baseline.to_csv(os.path.join(DATA_DIR,'Feb_enrolled_at_baseline.csv'), index = False)
feb_not_enrolled_baseline.to_csv(os.path.join(DATA_DIR, 'Feb_not_enrolled_at_baseline.csv'), index = False)

mar_enrolled_baseline.to_csv(os.path.join(DATA_DIR, 'Mar_enrolled_at_baseline.csv'), index = False)
mar_not_enrolled_baseline.to_csv(os.path.join(DATA_DIR, 'Mar_not_enrolled_at_baseline.csv'), index = False)


# Overwrite original files if sensitive columns still in there
# confirmatory.to_csv(os.path.join(DATA_DIR, 'confirmatory_outcomes_public.csv'), index = False)
# exploratory.to_csv(os.path.join(DATA_DIR, 'exploratory_outcomes_public.csv'), index = False)