<p>Class:  USC Viterbi Data Analytics Bootcamp</p>
<p>Team:  Analyticus (aka Team 5)</p>
<p>Module:  pull_hhs_national_data.py</p>
<p>Version:  March 31, 2018</p>
<p>Input:  National HHS vaccination rates pulled from the HHS flu vaccine API for the 2017 flu season.</p>
<p>Output:  JSON file of HHS vaccination data for the 2017 flu season (../data/hhs_national.json).</p>

In [1]:
# Dependencies
import requests
import json
import numpy as np
import pandas as pd

In [2]:
# Download the 2017 national vaccination-rate trend from the HHS flu vaccine API.
# The query string (ethnicity=T, medicare_status=A) is kept exactly as in the
# original request.
url = "https://fluvaccineapi.hhs.gov/api/v2/vaccination_rates/trends/2017/national.json?ethnicity=T&medicare_status=A"

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() makes an HTTP error fail here rather than letting an error
# payload be parsed as if it were vaccination data.
response = requests.get(url, timeout=30)
response.raise_for_status()
national_dict = response.json()

In [3]:
# Build the working dataframe directly from the parsed API payload.
df = pd.DataFrame(data=national_dict)

In [4]:
# Preview the first five rows of the raw HHS dataframe.
df.head(n=5)

Unnamed: 0,count,disparity,ethnicity,fips,medicare_status,name,percentage,short_name,week,week_start,year
0,31422333,0.0,T,0,A,National,0.001697,US,1,05AUG,2017
1,31422333,0.0,T,0,A,National,0.005075,US,2,12AUG,2017
2,31422333,0.0,T,0,A,National,0.010479,US,3,19AUG,2017
3,31422333,0.0,T,0,A,National,0.020088,US,4,26AUG,2017
4,31422333,0.0,T,0,A,National,0.036383,US,5,02SEP,2017


In [5]:
# Narrow the dataframe to the count, percentage, and week columns.
df = df.loc[:, ['count', 'percentage', 'week']]

In [6]:
# Give the count and percentage columns the names used in the CDC data.
cdc_column_names = {"count": "vaccinations", "percentage": "vac_percent"}
df = df.rename(columns=cdc_column_names)

In [7]:
# Create another dataframe.
# This is a checkpoint for restarting the process: take an independent copy so
# that `df` stays unchanged while df2 is modified in later cells.
# (Plain assignment, `df2 = df`, would only bind a second name to the same
# object, so every later change to df2 would also change df.)
df2 = df.copy()

In [8]:
# Preview the checkpoint dataframe to confirm the selected and renamed columns.
df2.head(n=5)

Unnamed: 0,vaccinations,vac_percent,week
0,31422333,0.001697,1
1,31422333,0.005075,2
2,31422333,0.010479,3
3,31422333,0.020088,4
4,31422333,0.036383,5


In [9]:
# Convert the vaccination percentage from a ratio to a percentage.
# Bracket indexing is used for the assignment: attribute-style assignment
# (df2.vac_percent = ...) only updates a column that already exists and would
# otherwise silently set a plain Python attribute on the object.
# NOTE: this cell is not idempotent -- re-running it multiplies by 100 again.
df2['vac_percent'] = df2['vac_percent'] * 100

In [10]:
# Preview the first five rows to verify that vac_percent was rescaled.
df2.head(n=5)

Unnamed: 0,vaccinations,vac_percent,week
0,31422333,0.169688,1
1,31422333,0.507467,2
2,31422333,1.04786,3
3,31422333,2.00878,4
4,31422333,3.63828,5


In [11]:
# Add the "vaccination percentage per week" column, starting from a float
# placeholder value in every row.
initial_weekly_pct = 0.0
df2['vac_pct_week'] = initial_weekly_pct

In [12]:
# Calculate the vaccination rate per week as the week-over-week difference of
# the cumulative vaccination rate (vac_percent).
# This is done positionally and in one vectorized step, so it does not depend
# on the index labels being exactly 0..n-1 the way a label-based .loc loop does.
cumulative_pct = df2['vac_percent']

# Previous row's cumulative value; the first row has no predecessor, so it is
# treated as 0.0, which makes the first weekly value equal its cumulative value.
previous_pct = cumulative_pct.shift(1)
previous_pct.iloc[:1] = 0.0

df2['vac_pct_week'] = cumulative_pct - previous_pct

In [13]:
# Preview the first five rows to check the new vac_pct_week values.
df2.head(n=5)

Unnamed: 0,vaccinations,vac_percent,week,vac_pct_week
0,31422333,0.169688,1,0.169688
1,31422333,0.507467,2,0.337779
2,31422333,1.04786,3,0.540393
3,31422333,2.00878,4,0.96092
4,31422333,3.63828,5,1.6295


In [14]:
# Persist the HHS national dataframe as a JSON file in the data directory.
df2.to_json(path_or_buf='../data/hhs_national.json')

In [16]:
# Validation step: read the JSON file written above back into a new dataframe.
df3 = pd.read_json(path_or_buf='../data/hhs_national.json')

In [18]:
# Order the reloaded rows by week so they are easier to inspect.
df4 = df3.sort_values(by='week')

In [19]:
# Preview the first five weeks of the reloaded HHS data.
df4.head(n=5)

Unnamed: 0,vac_pct_week,vac_percent,vaccinations,week
0,0.169688,0.169688,31422333,1
1,0.337779,0.507467,31422333,2
2,0.540393,1.04786,31422333,3
3,0.96092,2.00878,31422333,4
4,1.6295,3.63828,31422333,5


In [20]:
# Preview the last five weeks of the reloaded HHS data.
df4.tail(n=5)

Unnamed: 0,vac_pct_week,vac_percent,vaccinations,week
20,0.2095,45.8187,31422333,21
21,0.1009,45.9196,31422333,22
22,0.0651,45.9847,31422333,23
23,0.0026,45.9873,31422333,24
24,0.0,45.9873,31422333,25
