In [1]:
import os
import pandas as pd
import numpy as np
import time
import censusdata

from sodapy import Socrata
from urllib.request import Request, urlopen

In [2]:
#where the data lives: the Socrata domain queried by every cell below
doh_domain = 'data.pa.gov'

#dataset identifiers on that domain, one per table this notebook pulls
zip_cases_ident = 'tsf6-pnaf'          #cases by ZIP Code
county_cases_ident = 'j72v-r42c'       #cases by county
county_vax_ident = 'bicw-3gwi'         #daily vaccinations by county
county_cume_vax_ident = 'gcnb-epac'    #cumulative vaccinations by county
zip_cume_vax_ident = 'd63n-ygar'       #cumulative vaccinations by ZIP Code

#the app token comes from the environment so it is never stored in the notebook;
#it is None when the variable is not set
socrata_token = os.getenv("socrata_token")

In [3]:
#current local date formatted as YYYYMMDD; every output file name starts with it
datestr = time.strftime("%Y%m%d", time.localtime())

In [4]:
#single Socrata client shared by all of the query cells below
client = Socrata(doh_domain, app_token=socrata_token)



In [5]:
#retrieving ZIP Code cases
#the rows are explicitly ordered by postcode: the API does not promise any row
#order by itself, and later cells combine these rows with ZIP-sorted Census
#figures, so the order (and which rows survive the limit) must be deterministic
zip_cases_results = client.get(zip_cases_ident,
                               where="postcode between '16001' and '16066'",
                               select="postcode, positive, negative",
                               order="postcode ASC",
                               limit=42)
#putting that in a dataframe
zip_cases_df = pd.DataFrame.from_dict(zip_cases_results)
#dropping rows with any missing value; the original row labels are kept on
#purpose (no reset_index) so surviving rows keep their position labels
zip_cases_df = zip_cases_df.dropna()
#setting file name
zip_cases_name = (datestr + "_zip_cases.csv")
#writing new file
zip_cases_df.to_csv(zip_cases_name, index=False)

In [6]:
#pulling the Butler County case records, newest date first
county_cases_results = client.get(
    county_cases_ident,
    select="date, county, cases, cases_rate, cases_avg_new, cases_avg_new_rate, cases_cume, cases_cume_rate",
    where="county = 'Butler'",
    order="date DESC",
)
#one dataframe row per record returned
county_cases_df = pd.DataFrame(county_cases_results)
#dated output file name
cases_name = f"{datestr}_county_cases.csv"
#saving without the dataframe's row labels
county_cases_df.to_csv(cases_name, index=False)

In [7]:
#pulling the Butler County daily vaccination records, newest date first
county_daily_vax = client.get(
    county_vax_ident,
    order="date DESC",
    where="county = 'Butler'",
)
#one dataframe row per record returned
county_vax_df = pd.DataFrame(county_daily_vax)
#dated output file name
new_vax_name = f"{datestr}_county_daily_vax.csv"
#saving without the dataframe's row labels
county_vax_df.to_csv(new_vax_name, index=False)

In [8]:
#pulling the Butler County cumulative vaccination records
county_cume_vax = client.get(county_cume_vax_ident, where="county = 'Butler'")
#one dataframe row per record returned
county_cume_vax_df = pd.DataFrame(county_cume_vax)
#dated output file name
cume_vax_name = f"{datestr}_county_cume_vax.csv"
#saving without the dataframe's row labels
county_cume_vax_df.to_csv(cume_vax_name, index=False)

In [9]:
#pulling cumulative vaccination records for ZIP Codes 16001 through 16066,
#sorted by ZIP Code
zip_cume_vax = client.get(
    zip_cume_vax_ident,
    order="patient_zip_code ASC",
    where="patient_zip_code between '16001' and '16066'",
)
#one dataframe row per record returned
zip_cume_vax_df = pd.DataFrame(zip_cume_vax)
#dated output file name
zip_vax_name = f"{datestr}_zip_cume_vax.csv"
#saving without the dataframe's row labels
zip_cume_vax_df.to_csv(zip_vax_name, index=False)

In [10]:
#first and last ZIP Code of the range to look up (both ends included)
r1 = 16001
r2 = 16066
#every integer ZIP Code in that range
ziplist = [*range(r1, r2 + 1)]
#the same ZIP Codes as one comma-separated string
zips = ','.join(map(str, ziplist))

In [11]:
#geography for the request: the ZIP Code tabulation areas listed in `zips`, within state 42
butler_zcta_geo = censusdata.censusgeo([('state', '42'), ('zip code tabulation area', zips)])
#downloading variable B01001_001E (population) from the 2019 ACS 5-year data
bczips = censusdata.download('acs5', 2019, butler_zcta_geo, ['B01001_001E'])

In [12]:
#reducing each index label to its ZIP Code so the table can be sorted by ZIP:
#stringify the label, keep its first 11 characters, then the last 5 of those
bczips.index = bczips.index.astype('str').str[:11].str[-5:]
#moving the ZIP Code out of the index into a column named 'index',
#and storing the population counts as integers
bczips = bczips.reset_index().astype({'B01001_001E': 'int'})

In [13]:
#adding n/a values to zip codes for which the Census doesn't have numbers
missing_zips = (16003, 16018, 16039, 16058)
#one placeholder row per missing ZIP Code: the ZIP itself plus an empty population
blankZips = [pd.Series([zip_code, None], index=bczips.columns)
             for zip_code in missing_zips]
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; building a
#frame from the placeholder rows and concatenating is the supported equivalent
bczips = pd.concat([bczips, pd.DataFrame(blankZips)], ignore_index=True)
#making the ZIP Code the index again, as integers, so it can be sorted numerically
bczips = bczips.set_index('index')
bczips.index = bczips.index.astype('int')

In [14]:
#awful way of sorting that crap yet again
bczips = bczips.sort_index()
bczips = bczips.reset_index(drop = True)
#converting rows to int
zip_cases_df['positive'] = zip_cases_df['positive'].astype('int')

In [15]:
#putting it all together
zip_cases_df['Population'] = bczips['B01001_001E']
#dividing this crap to get incidence
zip_cases_df['Cumulative Incidence'] = (zip_cases_df['positive'] / zip_cases_df['Population'] * 100000)
zip_cases_df['Cumulative Incidence'] = zip_cases_df['Cumulative Incidence'].round(2)
zip_cases_df.to_csv(zip_cases_name, index=False)