In [1]:
import pandas as pd

## SES (socioeconomic status) data

In [2]:
# Load the poverty-estimates table and key it by a zero-padded, 5-character county code.
pe_raw = pd.read_csv('PovertyEstimates.csv', thousands=',')
pe = pe_raw.rename(columns={'FIPStxt': 'fips'})
# Each code is stringified and left-padded with zeros to width 5.
pe['fips'] = [str(code).zfill(5) for code in pe['fips']]
# Only these three 2018 columns are carried forward.
ses_cols = ['POVALL_2018', 'PCTPOVALL_2018', 'MEDHHINC_2018']
pe = pe.set_index('fips')[ses_cols]
pe.shape

(3193, 3)

## Rural data

In [3]:
# Load the county rural lookup: only the first and eighth columns of the file are read.
# NOTE(review): the dtype of the '2015 GEOID' key is left to pandas' inference here;
# confirm it comes back as text so that it matches the string fips keys of the other tables.
rural_raw = pd.read_csv('County_Rural_Lookup.csv', thousands=',', usecols=[0, 7])
df_rural = (
    rural_raw
    .rename(columns={'2015 GEOID': 'fips', '2010 Census \nPercent Rural': 'perc_rural_pop'})
    # Urban share is the complement of the rural share on a 0-100 scale.
    .assign(perc_urban_pop=lambda t: 100 - t['perc_rural_pop'])
    .set_index('fips')
    # Rows with a missing value in either percentage column are discarded.
    .dropna()
)
print(df_rural.shape)

(3142, 2)


## Demographic data

In [4]:
# Load the county characteristics estimates. STATE and COUNTY are read as strings so
# their text is kept verbatim and can be concatenated into the county key.
dm_raw = (
    pd.read_csv('cc-est2019-alldata.csv', encoding='ISO-8859-1',
                dtype={'STATE': str, 'COUNTY': str})
    # Keep a single estimate year (YEAR code 12 -- presumably the latest estimate in
    # this file; confirm against the dataset's file layout).
    .loc[lambda t: t.YEAR == 12]
    # assign() builds a new frame, so the key column is never written onto a
    # boolean-filtered slice (the pattern that raises SettingWithCopyWarning).
    .assign(fips=lambda t: t.STATE + t.COUNTY)
    .set_index('fips')
)

In [5]:
# Derive head counts per group by adding the male and female columns.
# NOTE(review): 'minority' is everyone not counted in the WA_* columns -- confirm
# that this is the intended definition (e.g. how Hispanic residents should be treated).
dm_raw = dm_raw.assign(
    minority=dm_raw.TOT_POP - (dm_raw.WA_MALE + dm_raw.WA_FEMALE),
    black=dm_raw.BA_MALE + dm_raw.BA_FEMALE,
    hispanic=dm_raw.H_MALE + dm_raw.H_FEMALE,
)
# Rows with AGEGRP == 0 supply the per-county record (presumably the all-ages total; confirm).
all_ages = dm_raw.AGEGRP == 0
dm_all = dm_raw.loc[all_ages, ['minority', 'black', 'hispanic', 'TOT_POP']]

In [6]:
# Per-county sum of TOT_POP over the rows with AGEGRP >= 14, stored as '65yrs'
# (presumably those codes are the 65-and-over age brackets; confirm against the file layout).
older_rows = dm_raw.AGEGRP >= 14
dm_old = (
    dm_raw.loc[older_rows, 'TOT_POP']
    .groupby(level='fips')
    .sum()
    .rename('65yrs')
    .to_frame()
)

In [7]:
# Attach the per-county group counts to the older-age totals.
# This is a left join on the fips index, so every county in dm_old is kept.
dm = dm_old.join(dm_all, how='left')

In [8]:
# Snapshot of the column index taken before the share columns are added.
columns = dm.columns
# Add one 'perc_<name>' column per count column: the count divided by TOT_POP.
# Note these are fractions in [0, 1] (no multiplication by 100), unlike
# perc_rural_pop / perc_urban_pop, which are on a 0-100 scale.
count_cols = ['65yrs', 'minority', 'black', 'hispanic']
fractions = dm[count_cols].div(dm['TOT_POP'], axis=0)
for count_col in count_cols:
    dm['perc_' + count_col] = fractions[count_col]

### Sanity check for the demographic dataframe (`dm`)
Status: passed.

In [9]:
# Column-wise totals across every county in dm, then each group's share of the
# summed population, printed as a quick plausibility check.
summation = dm.sum(axis=0)
total_pop = summation['TOT_POP']
perc_black, perc_hispanic, perc_minority = (
    summation[group] / total_pop for group in ('black', 'hispanic', 'minority')
)
print(f'black percentage = {100 * perc_black:.1f}%')
print(f'hispanic percentage = {100 * perc_hispanic:.1f}%')
print(f'minority percentage = {100 * perc_minority:.1f}%')
# Raw head count (not a share) for the older-age column.
print(f"population over 65yrs = {summation['65yrs']}")

black percentage = 13.4%
hispanic percentage = 18.5%
minority percentage = 23.7%
population over 65yrs = 54058263.0


## Risk data

In [10]:
# Load the county risk table; 'county' is read as a string so the code is kept
# verbatim, then it becomes the 'fips' index.
rf = pd.read_csv('county_pop_risk_covid.csv', dtype={'county': str})
rf = rf.rename(columns={'county': 'fips'}).set_index('fips')
# Impute missing values with the mean of their own column.
# numeric_only=True limits the means to numeric columns: pandas >= 2.0 raises a
# TypeError on a non-numeric column otherwise, while older versions skipped such
# columns silently -- so this keeps the old result and stays runnable on new pandas.
# The frame is rebound instead of mutated in place so the cell is safe to re-run.
rf = rf.fillna(rf.mean(numeric_only=True))
print(rf.shape)

(3094, 15)


## Combine demographic, SES, rural, risk

In [11]:
# Risk columns carried over from the risk table.
risk_cols = ['risk_flu']
# Friendlier names for the SES columns and the population total.
output_names = {
    'POVALL_2018': 'poverty',
    'PCTPOVALL_2018': 'perc_poverty',
    'MEDHHINC_2018': 'income',
    'TOT_POP': 'population',
}
# Inner joins on the fips index: only counties present in all four tables survive.
df = (
    dm.join(pe, how='inner')
    .join(rf[risk_cols], how='inner')
    .join(df_rural, how='inner')
    .rename(columns=output_names)
)
print(df.columns)

Index(['65yrs', 'minority', 'black', 'hispanic', 'population', 'perc_65yrs',
       'perc_minority', 'perc_black', 'perc_hispanic', 'poverty',
       'perc_poverty', 'income', 'risk_flu', 'perc_rural_pop',
       'perc_urban_pop'],
      dtype='object')


In [13]:
# Report the size of the combined table, then write it out with floats
# formatted to six decimal places.
output_csv = 'data_non-covid.csv'
print(df.shape)
df.to_csv(output_csv, float_format='%.6f')

(3094, 15)
