In [171]:
import io
import boto3
import sqlalchemy
import pandas as pd
import mysql.connector
from datetime import datetime, timedelta
from collections import OrderedDict

#### Connect to S3 and download CSV files as Pandas Dataframe

In [265]:
def get_s3_keys(bucket):
    """Get a list of keys in an S3 bucket."""
    keys = []
    resp = s3.list_objects_v2(Bucket=bucket)
    for obj in resp['Contents']:
        keys.append(obj['Key'])
    return keys

In [266]:
s3 = boto3.client('s3')

In [267]:
bucket = 'cms-data-ze'

In [268]:
files = get_s3_keys(bucket)

In [269]:
dates = ["2016-01-01", "2019-09-02"]
start, end = [datetime.strptime(_, "%Y-%m-%d") for _ in dates]
file_months = list(OrderedDict(((start + timedelta(_)).strftime(r"%b-%y"), None) for _ in range((end - start).days)).keys())

In [270]:
file_months[0]

'Jan-16'

In [271]:
datetime.strptime(file_months[0], "%b-%y")

datetime.datetime(2016, 1, 1, 0, 0)

In [272]:
obj = s3.get_object(Bucket= bucket, Key= files[0]) 

In [273]:
df = pd.read_csv(obj['Body'], encoding='latin-1')
#initial_df = pd.read_csv(obj['Body'], skiprows=1) # 'Body' is a key word

#### Data preprocessing steps

#### 1. Fill all blank values with the column name + '_blank_' so we know that this value was missing 

In [104]:
[df[df.columns[i]].fillna(df.columns[i]+'_blank', inplace=True) for i, _ in enumerate(df.columns)]

[None, None, None, None, None, None, None, None, None, None, None, None]

#### 2. Create a unique ID which is 'Contract_ID' + 'Plan_ID' and call that column 'Unique_ID'

In [105]:
unique_id = df["Contract ID"].map(str) + '-' + df["Plan ID"].map(str)
df.insert(loc=0, column='Unique_ID', value=unique_id)

#### 3. Put it all together and append dataframe to master contract list

In [274]:
i = 0
rows = 0
appended_data = []
duplicate_data = []
for file in files:
    print('File name: ', file)
    obj = s3.get_object(Bucket= bucket, Key= file)
    df = pd.read_csv(obj['Body'], encoding='latin-1')
    df['Date'] = datetime.strptime(file_months[i], "%b-%y")
    i += 1
    print('Number of rows: ', df.shape[0])
    rows += df.shape[0]
    print('Filling NA values in columns')
    [df[df.columns[i]].fillna(df.columns[i]+'_blank', inplace=True) for i, _ in enumerate(df.columns)]
    print('Create Unique_ID column')
    unique_id = df["Contract ID"].map(str) + '-' + df["Plan ID"].map(str)
    df.insert(loc=0, column='Unique_ID', value=unique_id.str.split('.').str[0])
    print('Find the duplicated data: ', len(df['Unique_ID'])-len(df['Unique_ID'].drop_duplicates()))
    duplicate_data.append(df[df['Unique_ID'].duplicated()])
    appended_data.append(df)
    
appended_data = pd.concat(appended_data)
duplicate_data = pd.concat(duplicate_data)

print('appended_data size: ', appended_data.shape)
print('Counted number of rows: ', rows)

File name:  CPSC_Contract_Info_2016_01.csv
Number of rows:  4936
Filling NA values in columns
Create Unique_ID column
Find the duplicated data:  0
File name:  CPSC_Contract_Info_2016_02.csv
Number of rows:  4938
Filling NA values in columns
Create Unique_ID column
Find the duplicated data:  0
File name:  CPSC_Contract_Info_2016_03.csv
Number of rows:  4940
Filling NA values in columns
Create Unique_ID column
Find the duplicated data:  0
File name:  CPSC_Contract_Info_2016_04.csv
Number of rows:  4942
Filling NA values in columns
Create Unique_ID column
Find the duplicated data:  0
File name:  CPSC_Contract_Info_2016_05.csv
Number of rows:  4940
Filling NA values in columns
Create Unique_ID column
Find the duplicated data:  0
File name:  CPSC_Contract_Info_2016_06.csv
Number of rows:  4943
Filling NA values in columns
Create Unique_ID column
Find the duplicated data:  0
File name:  CPSC_Contract_Info_2016_07.csv
Number of rows:  4943
Filling NA values in columns
Create Unique_ID column


#### 4. Drop duplicates (keeping the latest entry for duplicate Unique_ID values) and keep the set of clean contract data

In [275]:
appended_data.columns

Index(['Unique_ID', 'Contract ID', 'Plan ID', 'Organization Type', 'Plan Type',
       'Offers Part D', 'SNP Plan', 'EGHP', 'Organization Name',
       'Organization Marketing Name', 'Plan Name', 'Parent Organization',
       'Contract Effective Date', 'Date'],
      dtype='object')

In [276]:
clean_data = appended_data.sort_values('Date').drop_duplicates('Unique_ID',keep='last')

In [277]:
clean_data.to_csv('clean_cms_contract_data.csv', index=False)

##### Find any duplicated values

In [223]:
print('Number of Duplicates: ', len(clean_data['Unique_ID'])-len(clean_data['Unique_ID'].drop_duplicates()))
print('Duplicated values: ', clean_data[clean_data['Unique_ID'].duplicated()])

Number of Duplicates:  0
Duplicated values:  Empty DataFrame
Columns: [Unique_ID, Contract ID, Plan ID, Organization Type, Plan Type, Offers Part D, SNP Plan, EGHP, Organization Name, Organization Marketing Name, Plan Name, Parent Organization, Contract Effective Date, Date]
Index: []


In [230]:
appended_data[appended_data['Unique_ID'] == 'E0654-801.0']

Unnamed: 0,Unique_ID,Contract ID,Plan ID,Organization Type,Plan Type,Offers Part D,SNP Plan,EGHP,Organization Name,Organization Marketing Name,Plan Name,Parent Organization,Contract Effective Date,Date
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-01-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-02-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-03-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-04-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-05-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-06-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-07-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-08-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-09-01
1,E0654-801.0,E0654,801,Employer/Union Only Direct Contract PDP,Employer/Union Only Direct Contract PDP,Yes,No,Yes,IBT VOLUNTARY EMPLOYEE BENEFITS TRUST,TEAMStar Medicare Part D Prescription Drug Pro...,IBT Voluntary Employee Benefits Trust (Employe...,IBT Voluntary Employee Benefits Trust,01/01/2007 0:00:00,2016-10-01


In [217]:
subset_data[subset_data['Unique_ID'] == 'H2354-801.0']

Unnamed: 0,Unique_ID,Contract ID,Plan ID,Plan Type,Parent Organization
1167,H2354-801.0,H2354,801,HMO/HMOPOS,HealthPlus of Michigan
1167,H2354-801.0,H2354,801,HMO/HMOPOS,Henry Ford Health System
