In [1]:
import re
import json
import boto3
import pandas as pd
#import mysql.connector
from collections import OrderedDict
from datetime import datetime, timedelta

##### Connect to S3 buckets

In [2]:
def get_s3_keys(bucket):
    """Get a list of every object key in an S3 bucket.

    A single ``list_objects_v2`` call only returns one page of results, so the
    listing is repeated with the continuation token until the response is no
    longer marked as truncated.

    Uses the module-level ``s3`` client.

    Args:
        bucket: Name of the bucket to list.

    Returns:
        list: Object keys in the order the service returned them; an empty
        list when the bucket holds no objects.
    """
    keys = []
    request = {'Bucket': bucket}
    while True:
        resp = s3.list_objects_v2(**request)
        # 'Contents' is missing from the response when there is nothing to list.
        keys.extend(obj['Key'] for obj in resp.get('Contents', []))
        if not resp.get('IsTruncated'):
            break
        request['ContinuationToken'] = resp['NextContinuationToken']
    return keys

In [3]:
# Module-level S3 client; get_s3_keys and the later get_object calls read this global.
s3 = boto3.client('s3')

In [4]:
# Bucket whose keys are listed below and from which the enrollment CSVs are fetched.
bucket = 'scc-enrollment-data'

In [5]:
# Object keys found in the bucket; the merge loop further down walks this list in reverse.
files = get_s3_keys(bucket)

##### Import JSON file with unique contract_id/SSA Codes

In [45]:
# Parse the local contract_id.json file; the result is turned into a DataFrame
# (one row per top-level key) in the next cell.
with open('contract_id.json', 'r') as file:
    data = json.load(file)

In [46]:
# One row per top-level JSON key; the key itself is kept as the `unique_id` column.
df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index(level=0)
    .rename(columns={'index': 'unique_id'})
)

# Missing FIPS codes get the placeholder 99999; every code is then rendered as
# text and left-padded with zeros to five characters.
fips_text = df['FIPS Code'].fillna(99999).astype(int).round(0).astype(str)
df['FIPS Code'] = fips_text.map(lambda code: code.zfill(5))

In [47]:
# Start the master table as an independent copy of df.  A plain `master_df = df`
# only creates a second name for the same object, so any in-place edit made
# through master_df before it is rebound would silently change df as well.
master_df = df.copy()
master_df.head()

Unnamed: 0,unique_id,County,State,Contract ID,Organization,Type,Plan Type,SSA Code,FIPS Code
0,900911000,Autauga,AL,90091,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,HCPP - 1833 Cost,HCPP - 1833 Cost,1000,1001
1,9009110000,Alachua,FL,90091,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,HCPP - 1833 Cost,HCPP - 1833 Cost,10000,12001
2,9009110020,Bay,FL,90091,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,HCPP - 1833 Cost,HCPP - 1833 Cost,10020,12005
3,9009110040,Brevard,FL,90091,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,HCPP - 1833 Cost,HCPP - 1833 Cost,10040,12009
4,9009110050,Broward,FL,90091,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,HCPP - 1833 Cost,HCPP - 1833 Cost,10050,12011


In [48]:
# Enrollment keys look like "SCC_Enrollment_MA_<YYYY>_<MM>.csv".  The dot before
# the extension is escaped so that it only matches a literal ".".
enrollment_key_pattern = re.compile(r"SCC_Enrollment_MA_(\d{4})_(\d{2})\.csv")

# Value substituted for '.', which signifies < 10 in the 'Enrolled' column.
suppressed_enrollment = 3

for file in reversed(files):
    key_match = enrollment_key_pattern.search(file)
    if key_match is None:
        # Any other object in the bucket is skipped instead of aborting the
        # whole run with an IndexError on the date lookup.
        print("Skipping (not an enrollment file): {}".format(file))
        continue
    print(file)

    # Grab the date from the filename and format it as MM-01-YYYY
    year, month = key_match.groups()
    date_formatted = month + "-01-" + year

    # Import the enrollment file
    obj = s3.get_object(Bucket=bucket, Key=file)
    df_enrollments = pd.read_csv(obj['Body'])

    # Build the join key (Contract ID followed by SSA Code, both as text) and
    # drop every descriptive column; only unique_id and 'Enrolled' are merged.
    df_enrollments['unique_id'] = (
        df_enrollments['Contract ID'].astype(str) + df_enrollments['SSA Code'].astype(str)
    )
    df_enrollments = df_enrollments.drop(
        columns=['County', 'State', 'Contract ID', 'Organization Name',
                 'Organization Type', 'Plan Type', 'SSA Code', 'FIPS Code']
    )

    # Convert 'Enrolled' to numeric.  Only cells that consist of the '.' marker
    # are substituted; a blanket str.replace('.', '3') would also rewrite the
    # decimal point inside values such as "15.0" (-> "1530").
    if df_enrollments['Enrolled'].dtype == object:
        enrolled = df_enrollments['Enrolled']
        is_suppressed = enrolled.astype(str).str.strip() == '.'
        df_enrollments['Enrolled'] = pd.to_numeric(
            enrolled.mask(is_suppressed, suppressed_enrollment)
        )

    # Rename the 'Enrolled' column with the date of the file
    df_enrollments = df_enrollments.rename(columns={'Enrolled': date_formatted})

    # Left-merge so every row of the master dataframe is kept
    master_df = master_df.merge(df_enrollments, on='unique_id', how='left')

# Rows without a match in a given month get 0.  Rebinding (instead of
# inplace=True) keeps any other reference to the original frame untouched.
master_df = master_df.fillna(0)
# Wrap each code as ="01001"; presumably so spreadsheet software keeps the
# leading zeros - confirm with the consumer of the CSV.
master_df['FIPS Code'] = master_df['FIPS Code'].apply('="{}"'.format)

SCC_Enrollment_MA_2020_02.csv
SCC_Enrollment_MA_2020_01.csv
SCC_Enrollment_MA_2019_12.csv
SCC_Enrollment_MA_2019_11.csv
SCC_Enrollment_MA_2019_10.csv
SCC_Enrollment_MA_2019_09.csv
SCC_Enrollment_MA_2019_08.csv
SCC_Enrollment_MA_2019_07.csv
SCC_Enrollment_MA_2019_06.csv
SCC_Enrollment_MA_2019_05.csv
SCC_Enrollment_MA_2019_04.csv
SCC_Enrollment_MA_2019_03.csv
SCC_Enrollment_MA_2019_02.csv
SCC_Enrollment_MA_2019_01.csv
SCC_Enrollment_MA_2018_12.csv
SCC_Enrollment_MA_2018_11.csv
SCC_Enrollment_MA_2018_10.csv
SCC_Enrollment_MA_2018_09.csv
SCC_Enrollment_MA_2018_08.csv
SCC_Enrollment_MA_2018_07.csv
SCC_Enrollment_MA_2018_06.csv
SCC_Enrollment_MA_2018_05.csv
SCC_Enrollment_MA_2018_04.csv
SCC_Enrollment_MA_2018_03.csv
SCC_Enrollment_MA_2018_02.csv
SCC_Enrollment_MA_2018_01.csv
SCC_Enrollment_MA_2017_12.csv
SCC_Enrollment_MA_2017_11.csv


In [49]:
# Keep only State, the FIPS code, the organization and the monthly columns,
# and give the two descriptive columns their output names.
master_df = (
    master_df
    .drop(columns=['unique_id', 'County', 'Contract ID', 'Type', 'SSA Code', 'Plan Type'])
    .rename(columns={'Organization': 'Parent_Organization', 'FIPS Code': 'FIPS'})
)

# Collapse each organization name to its payer label; values are passed through
# str() first, so PayerNames always receives text.
master_df['Parent_Organization'] = master_df['Parent_Organization'].map(
    lambda org: PayerNames(str(org))
)

In [50]:
# Column order for the export: same as now, except 'FIPS' moves to position 1.
cols = master_df.columns.tolist()
cols.remove('FIPS')
cols.insert(1, 'FIPS')

In [51]:
# Apply the column order prepared in `cols`.
master_df = master_df[cols]

In [52]:
# Preview the first rows of the reordered table before it is written out.
master_df.head()

Unnamed: 0,State,FIPS,Parent_Organization,02-01-2020,01-01-2020,12-01-2019,11-01-2019,10-01-2019,09-01-2019,08-01-2019,...,08-01-2018,07-01-2018,06-01-2018,05-01-2018,04-01-2018,03-01-2018,02-01-2018,01-01-2018,12-01-2017,11-01-2017
0,AL,"=""01001""",UNITEDHEALTHCARE,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,FL,"=""12001""",UNITEDHEALTHCARE,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FL,"=""12005""",UNITEDHEALTHCARE,13.0,12.0,12.0,12.0,13.0,13.0,13.0,...,13.0,13.0,13.0,15.0,15.0,15.0,14.0,14.0,14.0,13.0
3,FL,"=""12009""",UNITEDHEALTHCARE,31.0,32.0,31.0,30.0,30.0,30.0,30.0,...,30.0,31.0,30.0,29.0,30.0,29.0,29.0,29.0,29.0,28.0
4,FL,"=""12011""",UNITEDHEALTHCARE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Write the final table to a CSV in the working directory, without the row index.
# NOTE(review): the file name is hard-coded; "022020" looks like the latest month covered - confirm.
master_df.to_csv("SCC_Enrollments_022020.csv", index = False)

##### Test Code

In [125]:
# Scratch check: fetch a single monthly file to step through the loop's transformations by hand.
obj = s3.get_object(Bucket= bucket, Key= 'SCC_Enrollment_MA_2019_03.csv')

In [126]:
# Parse the fetched object's body as CSV.
dataframe = pd.read_csv(obj['Body'])

In [127]:
# Cast both key parts to text so they can be concatenated into the join key.
dataframe = dataframe.astype({'Contract ID': str, 'SSA Code': str})

# Missing FIPS codes become 99999; codes are rendered as zero-padded 5-character text.
fips_text = dataframe['FIPS Code'].fillna(99999).astype(int).round(0).astype(str)
dataframe['FIPS Code'] = fips_text.map(lambda code: code.zfill(5))

# Join key: Contract ID immediately followed by SSA Code.
dataframe['unique_id'] = dataframe['Contract ID'] + dataframe['SSA Code']

# Remove the descriptive columns (including the FIPS code formatted above).
dataframe = dataframe.drop(
    columns=['County', 'State', 'Contract ID', 'Organization Name',
             'Organization Type', 'Plan Type', 'SSA Code', 'FIPS Code']
)

In [57]:
# Left-join the single test month onto the contract lookup; every row of df is kept.
updated_df = df.merge(dataframe, on='unique_id', how='left')

In [128]:
# Convert 'Enrolled' to numeric when it was read as text.  The converted Series
# is stored back on the frame (previously it was computed and thrown away), and
# only cells that consist of the '.' marker are replaced with 3, so a decimal
# point inside a value such as "15.0" is left alone.
if dataframe['Enrolled'].dtype == object:
    suppressed_rows = dataframe['Enrolled'].astype(str).str.strip() == '.'
    dataframe['Enrolled'] = pd.to_numeric(dataframe['Enrolled'].mask(suppressed_rows, 3))
else:
    print('Pass')

In [120]:
# Show the dtype of every column in the test frame.
# NOTE(review): the recorded output still lists the columns that the cell above drops,
# so it appears to come from an earlier run - re-execute to refresh.
dataframe.dtypes

County                object
State                 object
Contract ID           object
Organization Name     object
Organization Type     object
Plan Type             object
SSA Code               int64
FIPS Code            float64
Enrolled             float64
dtype: object

In [44]:
def PayerNames(name):
    """Map an organization name to a canonical payer label.

    The name is lower-cased and checked against a fixed, ordered list of
    keywords; the label of the first keyword found anywhere in the name is
    returned.

    Args:
        name: Organization name.

    Returns:
        None when ``name`` is not exactly of type ``str``; the matching label
        when a keyword is found; otherwise ``name`` unchanged.
    """
    if type(name) is not str:
        return None

    # (keyword, label) pairs, tested top to bottom - order matters when a name
    # contains more than one keyword.
    # NOTE(review): matching is by plain substring, so e.g. every name containing
    # "united" or "pace" is relabelled - confirm this breadth is intended.
    keyword_labels = (
        ('aetna', 'AETNA'),
        ('humana', 'HUMANA'),
        ('kaiser', 'KAISER'),
        ('united', 'UNITEDHEALTHCARE'),
        ('cigna', 'CIGNA'),
        ('brighthealth', 'BRIGHTHEALTH'),
        ('wellcare', 'WELLCARE'),
        ('pace', 'PACE'),
        ('molina', 'MOLINA'),
        ('longevity', 'LONGEVITY'),
        ('innovage', 'INNOVAGE'),
        ('healthspring', 'HEALTHSPRING'),
        ('coventry', 'COVENTRY'),
        ('bright health', 'BRIGHT HEALTH'),
        ('anthem', 'ANTHEM'),
    )

    lowered = name.lower()
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return name

In [34]:
# Spot check of the 'bright health' branch; with the definition above this call
# returns 'BRIGHT HEALTH', so the recorded output below comes from an older version.
PayerNames('BRIGHT HEALTH is the best company ever')

'Bright Health'