<p>Class:  USC Viterbi Data Analytics Bootcamp</p>
<p>Team:  Analyticus (aka Team 5)</p>
<p>Module:  pull_cdc_national_10_years.py</p>
<p>Version:  March 31, 2018</p>
<p>Input:  CDC Influenza-Like-Illness CSV File containing ten years of data.</p>
<p>Output:  CDC json file containing ten years of CDC data normalized to HHS flu season.</p>

In [2]:
# Import dependencies.
import json
import csv
import pandas as pd

In [3]:
# Read the CDC CSV file into a dataframe.
# The first line of the file (line index 0) is skipped, so the column
# names are taken from the second line.
df = pd.read_csv(
    filepath_or_buffer='../data/cdc_national_10_years.csv',
    skiprows=[0],
)

In [4]:
# Validate the CDC dataframe by displaying its first rows.
df.head()

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 25-49,AGE 25-64,AGE 5-24,AGE 50-64,AGE 65,ILITOTAL,NUM. OF PROVIDERS,TOTAL PATIENTS
0,National,X,2008,40,1.06828,1.02284,1535,X,1476,2352,X,324,5687,1435,555999
1,National,X,2008,41,1.00756,1.03017,1861,X,1485,2291,X,322,5959,1500,578446
2,National,X,2008,42,0.979399,1.01245,1869,X,1467,2219,X,341,5896,1525,582351
3,National,X,2008,43,1.04943,1.08359,1976,X,1668,2474,X,367,6485,1527,598473
4,National,X,2008,44,1.13218,1.16578,2119,X,1869,2547,X,452,6987,1545,599343


In [6]:
# Keep only the columns needed for the analysis: year, week and total
# ILI cases.  This is a column selection; no rows are reordered here.
df2 = df[['YEAR', 'WEEK', 'ILITOTAL']]

In [7]:
# Inspect the first rows of the reduced (year, week, total) dataframe.
df2.head()

Unnamed: 0,YEAR,WEEK,ILITOTAL
0,2008,40,5687
1,2008,41,5959
2,2008,42,5896
3,2008,43,6485
4,2008,44,6987


In [8]:
# Inspect the last rows of the reduced (year, week, total) dataframe.
df2.tail()

Unnamed: 0,YEAR,WEEK,ILITOTAL
490,2018,8,52348
491,2018,9,35079
492,2018,10,28221
493,2018,11,24251
494,2018,12,21379


In [9]:
# Normalize the CDC year and week to the HHS flu-season year and week.
#
# CDC week 40 becomes season week 1, and the season is labelled with the
# calendar year in which it starts.  CDC weeks 1-39 therefore belong to
# the season that started in the previous calendar year.
#
# A CDC (MMWR) year can have 53 weeks rather than 52.  With a fixed "+ 13"
# offset, week 53 of such a year and week 1 of the following year would
# both land on season week 14.  To keep season weeks unique, the offset
# for weeks 1-39 is derived from the number of weeks observed in the
# previous CDC year.
cdc_year = df2['YEAR']
cdc_week = df2['WEEK']
in_starting_year = cdc_week > 39

# Highest week number seen for each CDC year in the data.
weeks_in_year = df2.groupby('YEAR')['WEEK'].max()

# Length of the previous CDC year for every row.  Fall back to 52 when the
# previous year is absent from the data or only partially present.
previous_year = cdc_year - 1
weeks_in_previous_year = (
    previous_year.map(weeks_in_year).fillna(52).clip(lower=52)
)

df3 = pd.DataFrame({
    'YEAR': cdc_year.where(in_starting_year, previous_year),
    'WEEK': (cdc_week - 39).where(
        in_starting_year, cdc_week + weeks_in_previous_year - 39
    ),
    'ILITOTAL': df2['ILITOTAL'],
}).astype(float)  # Float dtype is kept; a later cell casts these columns to int.
    

In [10]:
# Inspect the first rows of the season-normalized data.
df3.head()

Unnamed: 0,YEAR,WEEK,ILITOTAL
0,2008.0,1.0,5687.0
1,2008.0,2.0,5959.0
2,2008.0,3.0,5896.0
3,2008.0,4.0,6485.0
4,2008.0,5.0,6987.0


In [11]:
# Order the normalized rows by season year, then by season week,
# to make the data easier to analyze.
df4 = df3.sort_values(by=['YEAR', 'WEEK'])

In [12]:
# Inspect the first rows of the sorted, normalized data.
df4.head()

Unnamed: 0,YEAR,WEEK,ILITOTAL
0,2008.0,1.0,5687.0
1,2008.0,2.0,5959.0
2,2008.0,3.0,5896.0
3,2008.0,4.0,6485.0
4,2008.0,5.0,6987.0


In [13]:
# Inspect the last rows of the sorted, normalized data.
df4.tail()

Unnamed: 0,YEAR,WEEK,ILITOTAL
490,2017.0,21.0,52348.0
491,2017.0,22.0,35079.0
492,2017.0,23.0,28221.0
493,2017.0,24.0,24251.0
494,2017.0,25.0,21379.0


In [14]:
# Cast the year, week and case-total columns from float to int.
df5 = df4[['YEAR', 'WEEK', 'ILITOTAL']].astype(int)

In [15]:
# Inspect the first rows after the integer conversion.
df5.head()

Unnamed: 0,YEAR,WEEK,ILITOTAL
0,2008,1,5687
1,2008,2,5959
2,2008,3,5896
3,2008,4,6485
4,2008,5,6987


In [17]:
# Total the weekly case counts for each season year.
# These per-year sums are the denominators for the percentage calculation.
df6 = df5.groupby('YEAR')[['ILITOTAL']].sum()

In [18]:
# Inspect the first rows of the per-year case sums (indexed by YEAR).
df6.head()

Unnamed: 0_level_0,ILITOTAL
YEAR,Unnamed: 1_level_1
2008,646101
2009,824107
2010,644978
2011,515225
2012,728957


In [19]:
# Check the code needed to access the case sum for a single year.
# Only the last expression of the cell is displayed, so the
# `df6.columns` line produces no visible output here.
df6.columns
df6.loc[2011, 'ILITOTAL']

515225

In [20]:
# Calculate each week's share of its year's cases, in percent:
# the week's case count divided by the year's total from df6, times 100.
# Each row's YEAR is looked up in df6 (indexed by YEAR) in one vectorized
# step instead of reading and writing one cell at a time.
year_total = df5['YEAR'].map(df6['ILITOTAL'])
df5['FLU_PERCENT'] = (df5['ILITOTAL'] / year_total) * 100

In [21]:
# Sanity check: the weekly percentages within each year should total 100.
df5.groupby('YEAR')[['FLU_PERCENT']].sum()

Unnamed: 0_level_0,FLU_PERCENT
YEAR,Unnamed: 1_level_1
2008,100.0
2009,100.0
2010,100.0
2011,100.0
2012,100.0
2013,100.0
2014,100.0
2015,100.0
2016,100.0
2017,100.0


In [22]:
# Persist the year, week, case-total and percentage columns as JSON.
df5.to_json(path_or_buf='../data/cdc_national_10_years.json')

In [24]:
# Read the just-written JSON file back into a dataframe for validation.
df7 = pd.read_json(path_or_buf='../data/cdc_national_10_years.json')

In [26]:
# Order the reloaded rows by year, then week, so they can be compared
# with the data that was written.
df8 = df7.sort_values(['YEAR', 'WEEK'])

In [27]:
# Validate the reloaded CDC data: display the first rows.
df8.head()

Unnamed: 0,FLU_PERCENT,ILITOTAL,WEEK,YEAR
0,0.880203,5687,1,2008
1,0.922302,5959,2,2008
2,0.912551,5896,3,2008
3,1.003713,6485,4,2008
4,1.08141,6987,5,2008


In [28]:
# Validate the reloaded CDC data: display the last rows.
df8.tail()

Unnamed: 0,FLU_PERCENT,ILITOTAL,WEEK,YEAR
490,5.486395,52348,21,2017
491,3.676497,35079,22,2017
492,2.957736,28221,23,2017
493,2.541655,24251,24,2017
494,2.240652,21379,25,2017
