In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

import pylab as plt
%matplotlib inline

import seaborn as sns

## SES data

In [2]:
# County-level poverty / income estimates, indexed by a 5-character FIPS string.
ses_columns = ['POVALL_2018', 'PCTPOVALL_2018', 'MEDHHINC_2018']
pe = pd.read_csv('../data/PovertyEstimates.csv', thousands=',')
pe = pe.rename(columns={'FIPStxt': 'fips'})
# Left-pad with zeros so the codes line up with the other tables' 'fips' index.
pe['fips'] = pe['fips'].astype(str).str.zfill(5)
pe = pe.set_index('fips')[ses_columns]
pe.shape

(3193, 3)

## Rural data

In [3]:
# Only the first and eighth columns of the lookup are read; they are renamed to
# 'fips' and 'perc_rural_pop' below.
rural_renames = {
    '2015 GEOID': 'fips',
    '2010 Census \nPercent Rural': 'perc_rural_pop',
}
df_rural = pd.read_csv('../data/County_Rural_Lookup.csv', thousands=',', usecols=[0, 7])
df_rural = df_rural.rename(columns=rural_renames)
# Urban share is the complement of the rural share on a 0-100 scale.
df_rural = df_rural.assign(perc_urban_pop=lambda d: 100 - d['perc_rural_pop'])
# Rows with any missing value are discarded after indexing by FIPS.
df_rural = df_rural.set_index('fips').dropna()
print(df_rural.shape)

(3142, 2)


## Demographic data

In [4]:
# Census county characteristics file. STATE / COUNTY are read as strings so that
# any leading zeros survive and can be concatenated into a FIPS code.
dm_raw = pd.read_csv(
    '../data/cc-est2019-alldata.csv',
    encoding='ISO-8859-1',
    dtype={'STATE': str, 'COUNTY': str},
)
# Keep a single estimate year. NOTE(review): YEAR == 12 is presumably the most
# recent (2019) estimate -- confirm against the Census file layout.
# .copy() makes the filtered frame independent of the full table, so the column
# assignments below (and in the next cell) do not raise SettingWithCopyWarning.
dm_raw = dm_raw[dm_raw.YEAR == 12].copy()
dm_raw['fips'] = dm_raw.STATE + dm_raw.COUNTY
dm_raw = dm_raw.set_index('fips')
dm_raw.head()

Unnamed: 0_level_0,SUMLEV,STATE,COUNTY,STNAME,CTYNAME,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,...,HWAC_MALE,HWAC_FEMALE,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,50,1,1,Alabama,Autauga County,12,0,55869,27092,28777,...,778,687,89,93,40,27,15,19,16,11
1001,50,1,1,Alabama,Autauga County,12,1,3277,1713,1564,...,76,53,10,6,6,5,3,4,3,3
1001,50,1,1,Alabama,Autauga County,12,2,3465,1787,1678,...,83,59,2,10,8,2,2,0,1,1
1001,50,1,1,Alabama,Autauga County,12,3,3851,1977,1874,...,84,67,11,12,2,2,1,2,2,1
1001,50,1,1,Alabama,Autauga County,12,4,3659,1854,1805,...,55,68,7,6,4,5,0,4,3,0


In [5]:
# Per-row head counts derived from the male/female columns.
# NOTE(review): 'minority' is everyone who is not "white alone" (WA_*); Hispanic
# residents counted under WA_* are therefore not in it -- confirm this is intended.
white_alone = dm_raw['WA_MALE'] + dm_raw['WA_FEMALE']
dm_raw['minority'] = dm_raw['TOT_POP'] - white_alone
dm_raw['black'] = dm_raw['BA_MALE'] + dm_raw['BA_FEMALE']
dm_raw['hispanic'] = dm_raw['H_MALE'] + dm_raw['H_FEMALE']

# AGEGRP == 0 appears to be the all-ages total row for each county (see the
# head() output of the previous cell) -- one row per county is kept.
is_all_ages = dm_raw['AGEGRP'] == 0
dm_all = dm_raw.loc[is_all_ages, ['minority', 'black', 'hispanic', 'TOT_POP']]

In [6]:
# Population in age groups 14 and above, summed per county and exposed as '65yrs'.
# NOTE(review): assumes AGEGRP 14 is the first 65+ bracket -- confirm against the
# Census file layout.
is_senior_group = dm_raw['AGEGRP'] >= 14
dm_old = (
    dm_raw.loc[is_senior_group, ['TOT_POP']]
    .groupby(level='fips')
    .sum()
    .rename(columns={'TOT_POP': '65yrs'})
)

In [7]:
# Attach the all-ages counts to the 65+ totals; both frames are indexed by 'fips'.
dm = dm_old.join(dm_all, how='left')

In [8]:
# Add a 'perc_<group>' column for each head count, as a share of total population.
# NOTE(review): these are fractions in [0, 1], whereas 'perc_rural_pop' is on a
# 0-100 scale (it is subtracted from 100) -- keep that in mind when modelling.
# The unused `columns = dm.columns` assignment was removed: nothing read it before
# the Covid section rebinds `columns`.
for c in ['65yrs', 'minority', 'black', 'hispanic']:
    dm[f'perc_{c}'] = dm[c] / dm['TOT_POP']

### Sanity check for demographic dataframe (dm)
passed

In [9]:
# Column totals over all counties, used to eyeball national shares.
summation = dm.sum(axis=0)
total_population = summation['TOT_POP']

national_share = {
    group: summation[group] / total_population
    for group in ('black', 'hispanic', 'minority')
}
for group, share in national_share.items():
    print(f'{group} percentage = {100 * share:.1f}%')

# Keep the individual names available for later cells.
perc_black = national_share['black']
perc_hispanic = national_share['hispanic']
perc_minority = national_share['minority']

print(f"population over 65yrs = {summation['65yrs']}")

black percentage = 13.4%
hispanic percentage = 18.5%
minority percentage = 23.7%
population over 65yrs = 54058263.0


## Covid data

In [13]:
# Per-county death time series. Rows with a missing value in ANY column are
# dropped here, not only rows without a FIPS code.
df_covid_raw = pd.read_csv('../data/time_series_covid19_death_US_2020-11-02.csv').dropna()
# int(x) strips any float formatting (e.g. 1001.0) before zero-padding to 5 chars.
df_covid_raw['FIPS'] = df_covid_raw['FIPS'].apply(lambda x: str(int(x)).zfill(5))
df_covid_raw = df_covid_raw.rename(columns={'FIPS': 'fips'}).set_index('fips')
# Remove identifier / geography columns that are not part of the time series.
df_covid_raw = df_covid_raw.drop([
    'UID', 'iso2', 'iso3', 'code3',
    'Admin2', 'Province_State', 'Country_Region',
    'Lat', 'Long_', 'Combined_Key'], axis=1)

# Zero baseline for the day before the series. Plain assignment would append it
# AFTER the last date, leaving the columns out of chronological order and letting
# positional stepping through the columns pick it up as if it were the newest
# date. Insert it in front of the first 'M/D/YY' column instead.
date_like = pd.to_datetime(df_covid_raw.columns, format='%m/%d/%y', errors='coerce').notna()
first_date_pos = int(date_like.argmax()) if date_like.any() else df_covid_raw.shape[1]
df_covid_raw.insert(first_date_pos, '1/21/20', 0)

### Get step sub-dataset

In [14]:
start = '4/4/20'  # label of the first sampled column, as written in the raw header
step = 7          # keep every 7th column from `start` onwards

columns = df_covid_raw.columns.values
start_matches = np.flatnonzero(columns == start)
if start_matches.size == 0:
    # Fail with a clear message instead of an IndexError on an empty match.
    raise KeyError(f'start date {start!r} is not a column of df_covid_raw')
start_idx = start_matches[0]

selected = columns[start_idx::step]


def foo(col):
    """Turn an 'M/D/YY' column label into 'M-D' (the year is dropped)."""
    return '-'.join(col.split('/')[:2])


selected_ = [foo(col) for col in selected]
df_covid_step = df_covid_raw[selected].rename(columns=dict(zip(selected, selected_)))

# Record the sampled step labels, space-separated, for use outside this notebook.
with open('steps.dat', 'w') as handle:
    handle.write(' '.join(selected_))

### Get period prevalence (first column is the cumulative count at the start date; later columns are per-step increases)

In [15]:
# The first step keeps its value as-is; every later step is replaced by the
# difference from the step before it.
baseline_col = foo(selected[0])
later_cols = df_covid_step.columns[1:]
step_increments = df_covid_step.diff(axis=1).loc[:, later_cols]
df_covid = df_covid_step.loc[:, [baseline_col]].join(step_increments)
# df_covid['covid'] = df_covid_raw['10/29/20']

## Risk data

In [16]:
# County risk table; 'county' is read as a string and becomes the 'fips' index.
rf = pd.read_csv('../data/county_pop_risk_covid.csv', dtype={'county': str})
rf = rf.rename(columns={'county': 'fips'}).set_index('fips')
# Impute missing values with the column mean. numeric_only=True keeps this
# working on pandas >= 2 if the file has any non-numeric column (such columns
# are simply left unfilled); reassigning avoids inplace mutation.
rf = rf.fillna(rf.mean(numeric_only=True))
print(rf.shape)

(3094, 15)


## Combine demographic, SES, rural, risk, and covid dataframes

In [17]:
risk_cols = ['risk', 'risk_flu']
column_renames = {
    'POVALL_2018': 'poverty',
    'PCTPOVALL_2018': 'perc_poverty',
    'MEDHHINC_2018': 'income',
    'TOT_POP': 'population',
}

# Inner-join every table on the shared 'fips' index, in this order; a county
# missing from any one table is dropped from the result.
df = dm
for other in (pe, rf[risk_cols], df_rural, df_covid):
    df = df.join(other, how='inner')
df = df.rename(columns=column_renames)
print(df.columns)

Index(['65yrs', 'minority', 'black', 'hispanic', 'population', 'perc_65yrs',
       'perc_minority', 'perc_black', 'perc_hispanic', 'poverty',
       'perc_poverty', 'income', 'risk', 'risk_flu', 'perc_rural_pop',
       'perc_urban_pop', '4-4', '4-11', '4-18', '4-25', '5-2', '5-9', '5-16',
       '5-23', '5-30', '6-6', '6-13', '6-20', '6-27', '7-4', '7-11', '7-18',
       '7-25', '8-1', '8-8', '8-15', '8-22', '8-29', '9-5', '9-12', '9-19',
       '9-26', '10-3', '10-10', '10-17', '10-24', '10-31'],
      dtype='object')


In [18]:
print(df.shape)
# Alternative output name, kept from the original for the confirmed-case series:
# df.to_csv(f'combined_data_confirmed_step-{step}.csv', float_format='%.6f')
output_path = f'combined_data_death_step-{step}.csv'
# Floats are written with six decimal places.
df.to_csv(output_path, float_format='%.6f')

(3094, 47)
