# Health Statistics

The state data is sourced from CDC's COVID-19 Death Data and Resources database. The data link is the SODA API which is updated on a daily basis. The program filters the obtained data for US states, and focuses on mainly COVID-19 deaths. The County data is sourced from COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University. The data is updated on a daily basis.

In [1]:
# Change the directory
def change_dir(folder):
    get_path = sys.path[0].split("\\")      
    del get_path[-1]                                                   
    get_path.append(folder)                    
    path = "\\".join(get_path)              
    os.chdir(path) 

## State data

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
import json
import requests
import os
import sys
import urllib.request
sys.path.append("../")
from pandas.tseries.offsets import MonthEnd

In [3]:
def get_cdcdata(i):
    api_query = "https://data.cdc.gov/resource/r8kw-7aab.json"
    api_query = api_query + "?month=" + str(i)
    response = requests.get(api_query)
    formattedResponse = json.loads(response.text)
    return formattedResponse

In [4]:
appended_data = pd.DataFrame()
for i in range(1,13):
    df = pd.DataFrame(get_cdcdata(i))
    appended_data = pd.concat([appended_data, df])
appended_data

Unnamed: 0,data_as_of,start_date,end_date,group,year,month,state,covid_19_deaths,total_deaths,percent_of_expected_deaths,pneumonia_deaths,pneumonia_and_covid_19_deaths,influenza_deaths,pneumonia_influenza_or_covid_19_deaths,footnote
0,2021-07-30T00:00:00.000,2020-01-01T00:00:00.000,2020-01-31T00:00:00.000,By Month,2020,1,United States,6,264680,98.00,17909,3,2124,20036,
1,2021-07-30T00:00:00.000,2021-01-01T00:00:00.000,2021-01-31T00:00:00.000,By Month,2021,1,United States,104904,372603,138.00,69919,55477,143,119427,
2,2021-07-30T00:00:00.000,2020-01-01T00:00:00.000,2020-01-31T00:00:00.000,By Month,2020,1,Alabama,,4729,94.00,282,0,35,318,One or more data cells have counts between 1-9...
3,2021-07-30T00:00:00.000,2021-01-01T00:00:00.000,2021-01-31T00:00:00.000,By Month,2021,1,Alabama,2395,7783,154.00,1159,886,,2672,One or more data cells have counts between 1-9...
4,2021-07-30T00:00:00.000,2020-01-01T00:00:00.000,2020-01-31T00:00:00.000,By Month,2020,1,Alaska,0,422,107.00,10,0,,13,One or more data cells have counts between 1-9...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,2021-07-30T00:00:00.000,2020-12-01T00:00:00.000,2020-12-31T00:00:00.000,By Month,2020,12,Washington,937,6118,122.00,811,551,0,1197,
50,2021-07-30T00:00:00.000,2020-12-01T00:00:00.000,2020-12-31T00:00:00.000,By Month,2020,12,West Virginia,755,2993,147.00,483,344,0,894,
51,2021-07-30T00:00:00.000,2020-12-01T00:00:00.000,2020-12-31T00:00:00.000,By Month,2020,12,Wisconsin,1771,6654,140.0,790,593,,1968,One or more data cells have counts between 1-9...
52,2021-07-30T00:00:00.000,2020-12-01T00:00:00.000,2020-12-31T00:00:00.000,By Month,2020,12,Wyoming,183,611,147.00,113,83,0,213,


In [5]:
appended_data = appended_data.fillna(0)

In [6]:
# Drop irrelevant columns
appended_data.drop(appended_data.columns.difference(['year','month','state','covid_19_deaths']), 1, inplace=True)
appended_data

Unnamed: 0,year,month,state,covid_19_deaths
0,2020,1,United States,6
1,2021,1,United States,104904
2,2020,1,Alabama,0
3,2021,1,Alabama,2395
4,2020,1,Alaska,0
...,...,...,...,...
49,2020,12,Washington,937
50,2020,12,West Virginia,755
51,2020,12,Wisconsin,1771
52,2020,12,Wyoming,183


In [7]:
# Go to the Input folder
change_dir('input')  

In [8]:
# Merge state codes
state_codes = pd.read_csv("State_msa_names.csv")
merged = pd.merge(appended_data, state_codes, left_on='state', right_on='state_proper')
merged

Unnamed: 0,year,month,state,covid_19_deaths,state_abbr,fips_state,state_name,state_proper
0,2020,1,United States,6,US,111,UNITED STATES,United States
1,2021,1,United States,104904,US,111,UNITED STATES,United States
2,2020,2,United States,19,US,111,UNITED STATES,United States
3,2021,2,United States,47313,US,111,UNITED STATES,United States
4,2020,3,United States,7159,US,111,UNITED STATES,United States
...,...,...,...,...,...,...,...,...
983,2020,8,Puerto Rico,198,PR,72,PUERTO RICO,Puerto Rico
984,2020,9,Puerto Rico,233,PR,72,PUERTO RICO,Puerto Rico
985,2020,10,Puerto Rico,174,PR,72,PUERTO RICO,Puerto Rico
986,2020,11,Puerto Rico,342,PR,72,PUERTO RICO,Puerto Rico


In [9]:
# Save the output csv
change_dir('output')
merged.to_csv('health_stats_states.csv')

## County data - Deaths

In [10]:
# Read the dataset in csv format
data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv')

In [11]:
data

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,7/20/21,7/21/21,7/22/21,7/23/21,7/24/21,7/25/21,7/26/21,7/27/21,7/28/21,7/29/21
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,113,113,113,113,113,113,113,114,114,114
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,324,324,324,324,325,325,325,326,328,328
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,60,60,60,60,61,61,61,61,61,61
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,65,65,65,65,65,65,65,65,65,65
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,139,139,139,139,139,139,139,139,139,139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,11,11,11,11,11,11,11,11,11,11
3338,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,13,13,13,13,13,13,13,14,14,14
3339,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
3340,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,26,26,26,26,26,26,26,26,26,26


In [12]:
# Transpose the date to clean up date columns
data_transpose = data.transpose().reset_index()

In [13]:
# Convert dates to a datetime format
data_transpose['index'][12:] = pd.to_datetime(data_transpose['index'][12:], infer_datetime_format=True).dt.date
dates = data_transpose.loc[12:]         # Save the dataframe as a separate dataframe
dates

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,3332,3333,3334,3335,3336,3337,3338,3339,3340,3341
12,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,2020-01-26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,2021-07-25,113,325,61,65,139,42,72,332,125,...,33,14,31,7,43,11,13,0,26,6
563,2021-07-26,113,325,61,65,139,42,72,332,125,...,33,14,31,7,43,11,13,0,26,6
564,2021-07-27,114,326,61,65,139,42,72,332,125,...,34,14,31,8,44,11,14,0,26,6
565,2021-07-28,114,328,61,65,139,42,72,332,125,...,34,14,31,8,44,11,14,0,26,6


In [14]:
# Find the end date of the month for every date
dates['End Date'] = pd.to_datetime(dates['index'], infer_datetime_format=True) + MonthEnd(0)
dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dates['End Date'] = pd.to_datetime(dates['index'], infer_datetime_format=True) + MonthEnd(0)


Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,3333,3334,3335,3336,3337,3338,3339,3340,3341,End Date
12,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-01-31
13,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-01-31
14,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-01-31
15,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-01-31
16,2020-01-26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-01-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,2021-07-25,113,325,61,65,139,42,72,332,125,...,14,31,7,43,11,13,0,26,6,2021-07-31
563,2021-07-26,113,325,61,65,139,42,72,332,125,...,14,31,7,43,11,13,0,26,6,2021-07-31
564,2021-07-27,114,326,61,65,139,42,72,332,125,...,14,31,8,44,11,14,0,26,6,2021-07-31
565,2021-07-28,114,328,61,65,139,42,72,332,125,...,14,31,8,44,11,14,0,26,6,2021-07-31


In [15]:
# Filter rows for the last date of the month
dates = dates[dates['index'] == dates['End Date']]
dates.head(20)

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,3333,3334,3335,3336,3337,3338,3339,3340,3341,End Date
21,2020-01-31,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-01-31
50,2020-02-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2020-02-29
81,2020-03-31,0,1,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,2020-03-31
111,2020-04-30,4,3,1,0,0,0,1,3,21,...,0,0,0,0,0,0,6,0,0,2020-04-30
142,2020-05-31,4,9,1,1,1,5,18,3,25,...,0,0,0,0,0,0,15,0,0,2020-05-31
172,2020-06-30,11,9,1,1,1,9,27,5,27,...,0,0,0,0,0,0,19,0,0,2020-06-30
203,2020-07-31,20,21,5,2,3,11,35,9,38,...,0,0,0,0,0,0,25,0,0,2020-07-31
234,2020-08-31,22,38,7,7,11,13,36,30,39,...,0,0,0,0,0,0,36,0,0,2020-08-31
264,2020-09-30,27,52,7,11,15,15,40,44,42,...,1,4,1,2,1,2,0,6,0,2020-09-30
295,2020-10-31,31,71,9,15,25,17,41,65,47,...,2,4,1,2,1,3,10,7,0,2020-10-31


In [16]:
# Delete the extra 'End Date' column
dates = dates.drop(['End Date'],axis=1)
dates

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,3332,3333,3334,3335,3336,3337,3338,3339,3340,3341
21,2020-01-31,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50,2020-02-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,2020-03-31,0,1,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
111,2020-04-30,4,3,1,0,0,0,1,3,21,...,0,0,0,0,0,0,0,6,0,0
142,2020-05-31,4,9,1,1,1,5,18,3,25,...,0,0,0,0,0,0,0,15,0,0
172,2020-06-30,11,9,1,1,1,9,27,5,27,...,0,0,0,0,0,0,0,19,0,0
203,2020-07-31,20,21,5,2,3,11,35,9,38,...,0,0,0,0,0,0,0,25,0,0
234,2020-08-31,22,38,7,7,11,13,36,30,39,...,0,0,0,0,0,0,0,36,0,0
264,2020-09-30,27,52,7,11,15,15,40,44,42,...,2,1,4,1,2,1,2,0,6,0
295,2020-10-31,31,71,9,15,25,17,41,65,47,...,3,2,4,1,2,1,3,10,7,0


In [17]:
# Retrieve the columns containing the other variables
variables = data_transpose.loc[1:11]
variables

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,3332,3333,3334,3335,3336,3337,3338,3339,3340,3341
1,iso2,US,US,US,US,US,US,US,US,US,...,US,US,US,US,US,US,US,US,US,US
2,iso3,USA,USA,USA,USA,USA,USA,USA,USA,USA,...,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA
3,code3,840,840,840,840,840,840,840,840,840,...,840,840,840,840,840,840,840,840,840,840
4,FIPS,1001.0,1003.0,1005.0,1007.0,1009.0,1011.0,1013.0,1015.0,1017.0,...,56029.0,56031.0,56033.0,56035.0,56037.0,56039.0,56041.0,90056.0,56043.0,56045.0
5,Admin2,Autauga,Baldwin,Barbour,Bibb,Blount,Bullock,Butler,Calhoun,Chambers,...,Park,Platte,Sheridan,Sublette,Sweetwater,Teton,Uinta,Unassigned,Washakie,Weston
6,Province_State,Alabama,Alabama,Alabama,Alabama,Alabama,Alabama,Alabama,Alabama,Alabama,...,Wyoming,Wyoming,Wyoming,Wyoming,Wyoming,Wyoming,Wyoming,Wyoming,Wyoming,Wyoming
7,Country_Region,US,US,US,US,US,US,US,US,US,...,US,US,US,US,US,US,US,US,US,US
8,Lat,32.539527,30.72775,31.868263,32.996421,33.982109,32.100305,31.753001,33.774837,32.913601,...,44.521575,42.132991,44.790489,42.765583,41.659439,43.935225,41.287818,0.0,43.904516,43.839612
9,Long_,-86.644082,-87.722071,-85.387129,-87.125115,-86.567906,-85.712655,-86.680575,-85.826304,-85.390727,...,-109.585283,-104.966331,-106.886239,-109.913092,-108.882788,-110.58908,-110.547578,0.0,-107.680187,-104.567488
10,Combined_Key,"Autauga, Alabama, US","Baldwin, Alabama, US","Barbour, Alabama, US","Bibb, Alabama, US","Blount, Alabama, US","Bullock, Alabama, US","Butler, Alabama, US","Calhoun, Alabama, US","Chambers, Alabama, US",...,"Park, Wyoming, US","Platte, Wyoming, US","Sheridan, Wyoming, US","Sublette, Wyoming, US","Sweetwater, Wyoming, US","Teton, Wyoming, US","Uinta, Wyoming, US","Unassigned, Wyoming, US","Washakie, Wyoming, US","Weston, Wyoming, US"


In [18]:
# Append the variables dataframe with the cleaned up dates dataframe
appended_df = variables.append(dates).transpose()
appended_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,264,295,325,356,387,415,446,476,507,537
index,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,...,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
0,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",...,27,31,42,48,69,91,99,107,110,113
1,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",...,52,71,98,161,224,283,301,305,311,315
2,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",...,7,9,11,32,40,51,55,56,59,60
3,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",...,11,15,17,46,52,60,58,63,64,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",...,1,1,2,4,6,9,9,9,9,11
3338,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",...,2,3,4,7,12,12,12,12,13,13
3339,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,"Unassigned, Wyoming, US",...,0,10,0,0,0,0,0,0,0,0
3340,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",...,6,7,8,19,25,26,26,26,26,26


In [19]:
# Clean up the headers
appended_df.columns = appended_df.iloc[0]
appended_df = appended_df.drop('index')
appended_df

index,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,...,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
0,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",...,27,31,42,48,69,91,99,107,110,113
1,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",...,52,71,98,161,224,283,301,305,311,315
2,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",...,7,9,11,32,40,51,55,56,59,60
3,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",...,11,15,17,46,52,60,58,63,64,64
4,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",...,15,25,40,63,100,127,131,135,139,139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",...,1,1,2,4,6,9,9,9,9,11
3338,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",...,2,3,4,7,12,12,12,12,13,13
3339,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,"Unassigned, Wyoming, US",...,0,10,0,0,0,0,0,0,0,0
3340,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",...,6,7,8,19,25,26,26,26,26,26


In [20]:
# States have unassigned county deaths - filter the actual counties in a separate dataframe
counties = appended_df[appended_df['Admin2'] != 'Unassigned']
counties_vars = counties[counties.columns[1:11]]

In [21]:
# Filter month columns for counties
counties_bymonth = counties[counties.columns[11:]]
counties_bymonth

index,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
0,0,0,0,4,4,11,20,22,27,31,42,48,69,91,99,107,110,113
1,0,0,1,3,9,9,21,38,52,71,98,161,224,283,301,305,311,315
2,0,0,0,1,1,1,5,7,7,9,11,32,40,51,55,56,59,60
3,0,0,0,0,1,1,2,7,11,15,17,46,52,60,58,63,64,64
4,0,0,0,0,1,1,3,11,15,25,40,63,100,127,131,135,139,139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,0,0,0,0,0,0,0,0,2,2,6,16,32,34,37,37,39,40
3337,0,0,0,0,0,0,0,0,1,1,2,4,6,9,9,9,9,11
3338,0,0,0,0,0,0,0,0,2,3,4,7,12,12,12,12,13,13
3340,0,0,0,0,0,0,0,0,6,7,8,19,25,26,26,26,26,26


In [22]:
# Calcuate the difference between each consecutive month to get deaths per month
counties_bymonth = counties_bymonth.diff(axis=1)
counties_bymonth

index,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
0,,0,0,4,0,7,9,2,5,4,11,6,21,22,8,8,3,3
1,,0,1,2,6,0,12,17,14,19,27,63,63,59,18,4,6,4
2,,0,0,1,0,0,4,2,0,2,2,21,8,11,4,1,3,1
3,,0,0,0,1,0,1,5,4,4,2,29,6,8,-2,5,1,0
4,,0,0,0,1,0,2,8,4,10,15,23,37,27,4,4,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,,0,0,0,0,0,0,0,2,0,4,10,16,2,3,0,2,1
3337,,0,0,0,0,0,0,0,1,0,1,2,2,3,0,0,0,2
3338,,0,0,0,0,0,0,0,2,1,1,3,5,0,0,0,1,0
3340,,0,0,0,0,0,0,0,6,1,1,11,6,1,0,0,0,0


In [23]:
merged_counties = pd.concat([counties_vars, counties_bymonth], axis=1)
merged_counties

index,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population,...,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
0,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869,...,5,4,11,6,21,22,8,8,3,3
1,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",223234,...,14,19,27,63,63,59,18,4,6,4
2,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",24686,...,0,2,2,21,8,11,4,1,3,1
3,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",22394,...,4,4,2,29,6,8,-2,5,1,0
4,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",57826,...,4,10,15,23,37,27,4,4,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3336,USA,840,56037.0,Sweetwater,Wyoming,US,41.659439,-108.882788,"Sweetwater, Wyoming, US",42343,...,2,0,4,10,16,2,3,0,2,1
3337,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.58908,"Teton, Wyoming, US",23464,...,1,0,1,2,2,3,0,0,0,2
3338,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,"Uinta, Wyoming, US",20226,...,2,1,1,3,5,0,0,0,1,0
3340,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,"Washakie, Wyoming, US",7805,...,6,1,1,11,6,1,0,0,0,0


In [24]:
# Filter the unassigned counties and merge it with the counties dataset
unassigned_counties = appended_df[appended_df['Admin2'] == 'Unassigned']
merged_all_counties = merged_counties.append(unassigned_counties)
merged_all_counties

index,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population,...,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30,iso2
0,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869,...,4,11,6,21,22,8,8,3,3,
1,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",223234,...,19,27,63,63,59,18,4,6,4,
2,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",24686,...,2,2,21,8,11,4,1,3,1,
3,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",22394,...,4,2,29,6,8,-2,5,1,0,
4,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",57826,...,10,15,23,37,27,4,4,4,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3134,USA,840,90051.0,Unassigned,Virginia,US,0.0,0.0,"Unassigned, Virginia, US",0,...,0,0,0,0,0,0,0,0,0,US
3180,USA,840,90053.0,Unassigned,Washington,US,0.0,0.0,"Unassigned, Washington, US",0,...,3,3,5,4,4,4,3,3,3,US
3235,USA,840,90054.0,Unassigned,West Virginia,US,0.0,0.0,"Unassigned, West Virginia, US",0,...,0,0,0,0,0,0,0,0,0,US
3306,USA,840,90055.0,Unassigned,Wisconsin,US,0.0,0.0,"Unassigned, Wisconsin, US",0,...,0,0,0,0,0,0,0,0,0,US


In [25]:
# Drop irrelevant columns
merged_all_counties.drop((['iso2','iso3','code3','Lat','Long_']), 1, inplace=True)
merged_all_counties

index,FIPS,Admin2,Province_State,Country_Region,Combined_Key,Population,2020-01-31,2020-02-29,2020-03-31,2020-04-30,...,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
0,1001.0,Autauga,Alabama,US,"Autauga, Alabama, US",55869,,0,0,4,...,5,4,11,6,21,22,8,8,3,3
1,1003.0,Baldwin,Alabama,US,"Baldwin, Alabama, US",223234,,0,1,2,...,14,19,27,63,63,59,18,4,6,4
2,1005.0,Barbour,Alabama,US,"Barbour, Alabama, US",24686,,0,0,1,...,0,2,2,21,8,11,4,1,3,1
3,1007.0,Bibb,Alabama,US,"Bibb, Alabama, US",22394,,0,0,0,...,4,4,2,29,6,8,-2,5,1,0
4,1009.0,Blount,Alabama,US,"Blount, Alabama, US",57826,,0,0,0,...,4,10,15,23,37,27,4,4,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3134,90051.0,Unassigned,Virginia,US,"Unassigned, Virginia, US",0,0,0,12,0,...,0,0,0,0,0,0,0,0,0,0
3180,90053.0,Unassigned,Washington,US,"Unassigned, Washington, US",0,0,0,0,0,...,2,3,3,5,4,4,4,3,3,3
3235,90054.0,Unassigned,West Virginia,US,"Unassigned, West Virginia, US",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3306,90055.0,Unassigned,Wisconsin,US,"Unassigned, Wisconsin, US",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Convert the output to csv
change_dir('output')
merged_all_counties.to_csv("health_stats_county.csv")

## County data - Vaccinations

In [27]:
# CDC Vaccination API
change_dir('input')
url = 'https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD'
urllib.request.urlretrieve(url, 'vaccinations.csv')
vaccines = pd.read_csv('vaccinations.csv')
vaccines

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Series_Complete_Pop_Pct,Series_Complete_Yes,Series_Complete_12Plus,Series_Complete_12PlusPop_Pct,Series_Complete_18Plus,...,Administered_Dose1_Recip_12PlusPop_Pct,Administered_Dose1_Recip_18Plus,Administered_Dose1_Recip_18PlusPop_Pct,Administered_Dose1_Recip_65Plus,Administered_Dose1_Recip_65PlusPop_Pct,SVI_CTGY,Series_Complete_Pop_Pct_SVI,Series_Complete_12PlusPop_Pct_SVI,Series_Complete_18PlusPop_Pct_SVI,Series_Complete_65PlusPop_Pct_SVI
0,07/30/2021,26047,30,Emmet County,MI,61.7,20610,20608,69.9,19646,...,0.0,,0.0,,0.0,Low,High VC/Low SVI,High VC/Low SVI,High VC/Low SVI,High VC/Low SVI
1,07/30/2021,37065,30,Edgecombe County,NC,34.4,17703,17701,40.3,17329,...,46.3,19814.0,49.7,7717.0,73.4,High,Low-Mod VC/High SVI,Mod-High VC/High SVI,Mod-High VC/High SVI,Mod-High VC/High SVI
2,07/30/2021,40079,30,Le Flore County,OK,27.7,13820,13820,32.8,13546,...,38.0,15575.0,41.1,6205.0,68.2,High,Low VC/High SVI,Low-Mod VC/High SVI,Low-Mod VC/High SVI,Mod-High VC/High SVI
3,07/30/2021,42119,30,Union County,PA,47.0,21132,21130,53.0,20356,...,60.3,23089.0,62.4,7598.0,91.1,Low-Mod,Mod-High VC/Low-Mod SVI,High VC/Low-Mod SVI,High VC/Low-Mod SVI,High VC/Low-Mod SVI
4,07/30/2021,49045,30,Tooele County,UT,39.7,28719,28717,50.2,26518,...,58.7,30612.0,62.4,6204.0,91.9,Low-Mod,Low-Mod VC/Low-Mod SVI,High VC/Low-Mod SVI,High VC/Low-Mod SVI,High VC/Low-Mod SVI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754613,12/13/2020,2282,51,Yakutat City and Borough,AK,0.0,0,0,0.0,0,...,0.0,,0.0,,0.0,Mod-High,,,,
754614,12/13/2020,48345,51,Motley County,TX,0.0,0,0,0.0,0,...,0.0,,0.0,,0.0,Mod-High,,,,
754615,12/13/2020,48087,51,Collingsworth County,TX,0.0,0,0,0.0,0,...,0.0,,0.0,,0.0,High,,,,
754616,12/13/2020,5141,51,Van Buren County,AR,0.0,0,0,0.0,0,...,0.0,,0.0,,0.0,Mod-High,,,,


In [28]:
vaccines.columns

Index(['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State',
       'Series_Complete_Pop_Pct', 'Series_Complete_Yes',
       'Series_Complete_12Plus', 'Series_Complete_12PlusPop_Pct',
       'Series_Complete_18Plus', 'Series_Complete_18PlusPop_Pct',
       'Series_Complete_65Plus', 'Series_Complete_65PlusPop_Pct',
       'Completeness_pct', 'Administered_Dose1_Recip',
       'Administered_Dose1_Pop_Pct', 'Administered_Dose1_Recip_12Plus',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18Plus',
       'Administered_Dose1_Recip_18PlusPop_Pct',
       'Administered_Dose1_Recip_65Plus',
       'Administered_Dose1_Recip_65PlusPop_Pct', 'SVI_CTGY',
       'Series_Complete_Pop_Pct_SVI', 'Series_Complete_12PlusPop_Pct_SVI',
       'Series_Complete_18PlusPop_Pct_SVI',
       'Series_Complete_65PlusPop_Pct_SVI'],
      dtype='object')

In [29]:
# Drop irrelevant columns
vaccines.drop(vaccines.columns.difference(['Date','FIPS','Recip_County','Recip_State','Series_Complete_Pop_Pct']), 1, inplace=True)
vaccines

Unnamed: 0,Date,FIPS,Recip_County,Recip_State,Series_Complete_Pop_Pct
0,07/30/2021,26047,Emmet County,MI,61.7
1,07/30/2021,37065,Edgecombe County,NC,34.4
2,07/30/2021,40079,Le Flore County,OK,27.7
3,07/30/2021,42119,Union County,PA,47.0
4,07/30/2021,49045,Tooele County,UT,39.7
...,...,...,...,...,...
754613,12/13/2020,2282,Yakutat City and Borough,AK,0.0
754614,12/13/2020,48345,Motley County,TX,0.0
754615,12/13/2020,48087,Collingsworth County,TX,0.0
754616,12/13/2020,5141,Van Buren County,AR,0.0


In [30]:
vaccines['Date'] = pd.to_datetime(vaccines['Date'], infer_datetime_format=True)

In [31]:
# Find the end date of the month for every date
vaccines['End Date'] = pd.to_datetime(vaccines['Date'], infer_datetime_format=True) + MonthEnd(0)
vaccines

Unnamed: 0,Date,FIPS,Recip_County,Recip_State,Series_Complete_Pop_Pct,End Date
0,2021-07-30,26047,Emmet County,MI,61.7,2021-07-31
1,2021-07-30,37065,Edgecombe County,NC,34.4,2021-07-31
2,2021-07-30,40079,Le Flore County,OK,27.7,2021-07-31
3,2021-07-30,42119,Union County,PA,47.0,2021-07-31
4,2021-07-30,49045,Tooele County,UT,39.7,2021-07-31
...,...,...,...,...,...,...
754613,2020-12-13,2282,Yakutat City and Borough,AK,0.0,2020-12-31
754614,2020-12-13,48345,Motley County,TX,0.0,2020-12-31
754615,2020-12-13,48087,Collingsworth County,TX,0.0,2020-12-31
754616,2020-12-13,5141,Van Buren County,AR,0.0,2020-12-31


In [32]:
# Filter rows for the last date of the month
vaccines = vaccines[vaccines['Date'] == vaccines['End Date']]
vaccines

Unnamed: 0,Date,FIPS,Recip_County,Recip_State,Series_Complete_Pop_Pct,End Date
98460,2021-06-30,31135,Perkins County,NE,29.9,2021-06-30
98461,2021-06-30,32009,Esmeralda County,NV,31.5,2021-06-30
98462,2021-06-30,51101,King William County,VA,27.2,2021-06-30
98463,2021-06-30,37135,Orange County,NC,58.7,2021-06-30
98464,2021-06-30,40063,Hughes County,OK,30.6,2021-06-30
...,...,...,...,...,...,...
695779,2020-12-31,30065,Musselshell County,MT,0.0,2020-12-31
695780,2020-12-31,21179,Nelson County,KY,0.0,2020-12-31
695781,2020-12-31,18101,Martin County,IN,0.0,2020-12-31
695782,2020-12-31,30025,Fallon County,MT,0.0,2020-12-31


In [33]:
vaccines['Date'].unique()

array(['2021-06-30T00:00:00.000000000', '2021-05-31T00:00:00.000000000',
       '2021-04-30T00:00:00.000000000', '2021-03-31T00:00:00.000000000',
       '2021-02-28T00:00:00.000000000', '2021-01-31T00:00:00.000000000',
       '2020-12-31T00:00:00.000000000'], dtype='datetime64[ns]')

In [34]:
# Parse months as columns 
vaccines = vaccines.set_index(['FIPS','Recip_County','Recip_State','Date'])['Series_Complete_Pop_Pct'].unstack()
vaccines

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
FIPS,Recip_County,Recip_State,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01001,Autauga County,AL,0.0,0.8,3.4,9.1,17.2,21.9,24.7
01003,Baldwin County,AL,0.0,1.3,7.3,14.5,22.1,26.4,30.1
01005,Barbour County,AL,0.0,0.5,4.2,10.5,16.3,20.0,24.7
01007,Bibb County,AL,0.0,0.8,5.9,9.6,14.5,17.4,21.0
01009,Blount County,AL,0.0,0.6,4.3,7.5,12.8,16.3,18.4
...,...,...,...,...,...,...,...,...,...
UNK,Unknown County,VT,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNK,Unknown County,WA,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNK,Unknown County,WI,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNK,Unknown County,WV,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Calcuate the difference between each consecutive month to get deaths per month
vaccines = vaccines.diff(axis=1)
vaccines

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30
FIPS,Recip_County,Recip_State,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01001,Autauga County,AL,,0.8,2.6,5.7,8.1,4.7,2.8
01003,Baldwin County,AL,,1.3,6.0,7.2,7.6,4.3,3.7
01005,Barbour County,AL,,0.5,3.7,6.3,5.8,3.7,4.7
01007,Bibb County,AL,,0.8,5.1,3.7,4.9,2.9,3.6
01009,Blount County,AL,,0.6,3.7,3.2,5.3,3.5,2.1
...,...,...,...,...,...,...,...,...,...
UNK,Unknown County,VT,,0.0,0.0,0.0,0.0,0.0,0.0
UNK,Unknown County,WA,,0.0,0.0,0.0,0.0,0.0,0.0
UNK,Unknown County,WI,,0.0,0.0,0.0,0.0,0.0,0.0
UNK,Unknown County,WV,,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Convert the output to csv - keep the cumulative vaccines
change_dir('output')
vaccines.to_csv("vaccines.csv")