<p>Class:  USC Viterbi Data Analytics Bootcamp</p>
<p>Team:  Analyticus (aka Team 5)</p>
<p>Module:  merge_plot_national_10_years.py</p>
<p>Version:  March 31, 2018</p>
<p>Input 1:  CDC Influenza-Like-Illness Json File containing ten years of data.</p>
<p>Input 2:  HHS Vaccinations Json File containing five years of data.</p>
<p>Output:  Merged inputs based on year (flu season) and week (normalized to HHS week).</p>


In [1]:
# Import dependencies.
import json
import pandas as pd

In [2]:
# Read the CDC national data file (cdc_national_10_years.json) into a pandas DataFrame.
df_cdc = pd.read_json(path_or_buf='../data/cdc_national_10_years.json')

In [3]:
# Put the CDC rows in (YEAR, WEEK) order so the data is easier to inspect.
df_cdc = df_cdc.sort_values(['YEAR', 'WEEK'])

In [4]:
# Give the CDC columns lower-case names; 'year' and 'week' then match the
# HHS column names and serve as the merge keys.
df_cdc = df_cdc.rename(
    columns={
        "FLU_PERCENT": "flu_percent",
        "ILITOTAL": "flu_cases",
        "WEEK": "week",
        "YEAR": "year",
    }
)

In [5]:
# Show the first five CDC rows to check the sort and the renamed columns.
df_cdc.head(n=5)

Unnamed: 0,flu_percent,flu_cases,week,year
0,0.880203,5687,1,2008
1,0.922302,5959,2,2008
2,0.912551,5896,3,2008
3,1.003713,6485,4,2008
4,1.08141,6987,5,2008


In [6]:
# Show the last five CDC rows to check where the data ends.
df_cdc.tail(n=5)

Unnamed: 0,flu_percent,flu_cases,week,year
490,5.486395,52348,21,2017
491,3.676497,35079,22,2017
492,2.957736,28221,23,2017
493,2.541655,24251,24,2017
494,2.240652,21379,25,2017


In [7]:
# Read the HHS national vaccination data file into a pandas DataFrame.
df_hhs = pd.read_json(path_or_buf='../data/hhs_national_10_years.json')

In [8]:
# Put the HHS rows in (year, week) order so the data is easier to inspect.
df_hhs = df_hhs.sort_values(['year', 'week'])

In [9]:
# Show the first five HHS rows to check where the data starts.
df_hhs.head(n=5)

Unnamed: 0,vac_pct_week,vac_percent,vaccinations,week,year
0,0.0,51.2392,26043833,43,2012
1,0.0009,51.2401,26043833,44,2012
2,0.0008,51.2409,26043833,45,2012
3,0.0006,51.2415,26043833,46,2012
4,0.0004,51.2419,26043833,47,2012


In [10]:
# Show the last five HHS rows to check where the data ends.
df_hhs.tail(n=5)

Unnamed: 0,vac_pct_week,vac_percent,vaccinations,week,year
238,0.2095,45.8187,31422333,21,2017
239,0.1009,45.9196,31422333,22,2017
240,0.0651,45.9847,31422333,23,2017
241,0.0026,45.9873,31422333,24,2017
242,0.0,45.9873,31422333,25,2017


In [11]:
# Combine CDC and HHS on the shared (year, week) keys.
# The outer join keeps rows that appear in only one of the two sources.
df = df_cdc.merge(df_hhs, how='outer', on=['year', 'week'])

In [12]:
# Put the merged rows in (year, week) order so the data is easier to inspect.
df = df.sort_values(['year', 'week'])

In [13]:
# Replace nulls with zero.
# Nulls come from the outer merge: a (year, week) found in only one source
# has no values for the other source's columns.
# A zero in those columns therefore means "no data", not a measured zero.
df = df.fillna(0)

In [14]:
# Show the first five rows of the merged dataframe.
df.head(n=5)

Unnamed: 0,flu_percent,flu_cases,week,year,vac_pct_week,vac_percent,vaccinations
0,0.880203,5687,1,2008,0.0,0.0,0.0
1,0.922302,5959,2,2008,0.0,0.0,0.0
2,0.912551,5896,3,2008,0.0,0.0,0.0
3,1.003713,6485,4,2008,0.0,0.0,0.0
4,1.08141,6987,5,2008,0.0,0.0,0.0


In [15]:
# Show the last five rows of the merged dataframe.
df.tail(n=5)

Unnamed: 0,flu_percent,flu_cases,week,year,vac_pct_week,vac_percent,vaccinations
490,5.486395,52348,21,2017,0.2095,45.8187,31422333.0
491,3.676497,35079,22,2017,0.1009,45.9196,31422333.0
492,2.957736,28221,23,2017,0.0651,45.9847,31422333.0
493,2.541655,24251,24,2017,0.0026,45.9873,31422333.0
494,2.240652,21379,25,2017,0.0,45.9873,31422333.0


In [16]:
# Save the merged dataframe as JSON for the plotting step.
df.to_json(path_or_buf='../data/plot_national_10_years.json')

In [17]:
# Load the JSON file just written, to validate the saved output.
df_plot = pd.read_json(path_or_buf='../data/plot_national_10_years.json')

In [18]:
# Put the reloaded rows in (year, week) order so the data is easier to inspect.
df_plot = df_plot.sort_values(by=['year', 'week'])

In [19]:
# Show the first five rows of the reloaded plot data.
df_plot.head(n=5)

Unnamed: 0,flu_cases,flu_percent,vac_pct_week,vac_percent,vaccinations,week,year
0,5687,0.880203,0.0,0.0,0,1,2008
1,5959,0.922302,0.0,0.0,0,2,2008
2,5896,0.912551,0.0,0.0,0,3,2008
3,6485,1.003713,0.0,0.0,0,4,2008
4,6987,1.08141,0.0,0.0,0,5,2008


In [20]:
# Show the last five rows of the reloaded plot data.
df_plot.tail(n=5)

Unnamed: 0,flu_cases,flu_percent,vac_pct_week,vac_percent,vaccinations,week,year
490,52348,5.486395,0.2095,45.8187,31422333,21,2017
491,35079,3.676497,0.1009,45.9196,31422333,22,2017
492,28221,2.957736,0.0651,45.9847,31422333,23,2017
493,24251,2.541655,0.0026,45.9873,31422333,24,2017
494,21379,2.240652,0.0,45.9873,31422333,25,2017
