In [740]:
import pandas as pd

## Prepare Median Income Data

In [741]:
income = pd.read_csv('./Data/median_income.csv')

In [742]:
income.head()

Unnamed: 0,county_name,median_income_2012_2016_16_dollars,median_income_2020_16_dollars
0,Anderson,42146,44871
1,Andrews,70121,69369
2,Angelina,44185,47917
3,Aransas,44851,49153
4,Archer,62407,59010


## Prepare unemployment data

In [743]:
unemp = pd.read_csv('./Data/unemployment_rate_by_county.csv')

In [744]:
unemp.head()

Unnamed: 0,county_id,county_name,2012,2016,2017,2018,2019,2020
0,1,Anderson,0.036,0.043,0.037,0.031,0.029,0.056
1,2,Andrews,0.031,0.047,0.031,0.024,0.023,0.082
2,3,Angelina,0.051,0.059,0.051,0.044,0.04,0.076
3,4,Aransas,0.068,0.056,0.069,0.06,0.043,0.085
4,5,Archer,0.034,0.043,0.034,0.031,0.028,0.053


In [745]:
unemp.drop(columns = ['2017', '2018', '2019', 'county_id'], inplace = True)

In [746]:
unemp['2012'] = unemp['2012'].apply(lambda x: round(x,6))
unemp['2016'] = unemp['2012'].apply(lambda x: round(x,6))
unemp['2020'] = unemp['2012'].apply(lambda x: round(x,6))

In [747]:
#we'll use the statewide unemployment data to create a column for the difference between a counties unemployment rate and the statewide rate
state_unemp = pd.read_csv('./Data/StatewideEnemployment.csv', index_col = 'DATE')

unemp_2012 = state_unemp.loc['2013-01-01']/100
unemp_2016 = state_unemp.loc['2017-01-01']/100
unemp_2020 = state_unemp.loc['2021-01-01']/100

unemp['2012_var_state'] = unemp['2012'].apply(lambda x: round(x-unemp_2012,6))
unemp['2016_var_state'] = unemp['2016'].apply(lambda x: round(x-unemp_2016,6))
unemp['2020_var_state'] = unemp['2020'].apply(lambda x: round(x-unemp_2020,6))

In [748]:
unemp.head()

Unnamed: 0,county_name,2012,2016,2020,2012_var_state,2016_var_state,2020_var_state
0,Anderson,0.036,0.036,0.036,-0.029,-0.012,-0.032
1,Andrews,0.031,0.031,0.031,-0.034,-0.017,-0.037
2,Angelina,0.051,0.051,0.051,-0.014,0.003,-0.017
3,Aransas,0.068,0.068,0.068,0.003,0.02,0.0
4,Archer,0.034,0.034,0.034,-0.031,-0.014,-0.034


merge the two dataframes together

In [749]:
df = unemp.merge(right = income, how = 'left', on = 'county_name')

In [750]:
df.head()

Unnamed: 0,county_name,2012,2016,2020,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars
0,Anderson,0.036,0.036,0.036,-0.029,-0.012,-0.032,42146,44871
1,Andrews,0.031,0.031,0.031,-0.034,-0.017,-0.037,70121,69369
2,Angelina,0.051,0.051,0.051,-0.014,0.003,-0.017,44185,47917
3,Aransas,0.068,0.068,0.068,0.003,0.02,0.0,44851,49153
4,Archer,0.034,0.034,0.034,-0.031,-0.014,-0.034,62407,59010


In [751]:
#check we still have 254 counties
df.shape

(254, 9)

## Prepare demographic data

In [752]:
demo_12 = pd.read_csv('./Data/2012_demo.csv')
demo_16 = pd.read_csv('./Data/2016_demo.csv')
demo_19 = pd.read_csv('./Data/2019_demo.csv')

In [753]:
def clean_demo_df(df, yr):
    df = df[df['Age'] == 'All Ages']
    df = df[df['County'] != 'STATE OF TEXAS']
    df.reset_index(inplace = True)
    df.drop(columns = ['Age', 'FIPS', 'index', 'County'], inplace = True)
    cols = df.columns
    mydict = {}
    for col in cols:
        mydict[col] = col + yr
    cols = df.columns.drop('Total')
    #totals = df['Total']
    for col in cols:
        df[col] = df[col]/df['Total']
        #df[col] = df[col].apply(lambda x: 100* (x/df['Total']))
    for col in cols:
        df[col] = df[col].apply(lambda x: round(x, 4))
    df = df.rename(columns=mydict)
    
    return df


demo_12 = clean_demo_df(demo_12, '12')
demo_16 = clean_demo_df(demo_16, '16')
demo_19 = clean_demo_df(demo_19, '19')


In [754]:
demo_12.shape

(254, 15)

In [755]:
df = df.merge(right = demo_12, how = 'left', left_index = True, right_index = True)
df = df.merge(right = demo_16, how = 'left', left_index = True, right_index = True)
df = df.merge(right = demo_19, how = 'left', left_index = True, right_index = True)

In [756]:
df.head()

Unnamed: 0,county_name,2012,2016,2020,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars,Total12,...,NH_Black_Female19,NH_Asian_Total19,NH_Asian_Male19,NH_Asian_Female19,NH_Other_Total19,NH_Other_Male19,NH_Other_Female19,Hispanic_Total19,Hispanic_Male19,Hispanic_Female19
0,Anderson,0.036,0.036,0.036,-0.029,-0.012,-0.032,42146,44871,58964,...,0.0548,0.0053,0.0025,0.0028,0.0233,0.0114,0.0119,0.1814,0.1208,0.0606
1,Andrews,0.031,0.031,0.031,-0.034,-0.017,-0.037,70121,69369,16039,...,0.0059,0.0044,0.0023,0.0021,0.0118,0.0059,0.006,0.6212,0.3217,0.2995
2,Angelina,0.051,0.051,0.051,-0.014,0.003,-0.017,44185,47917,88850,...,0.0791,0.0101,0.0047,0.0054,0.0164,0.0081,0.0083,0.2261,0.1153,0.1108
3,Aransas,0.068,0.068,0.068,0.003,0.02,0.0,44851,49153,23825,...,0.0045,0.0172,0.0084,0.0088,0.0193,0.0094,0.0099,0.2968,0.1564,0.1403
4,Archer,0.034,0.034,0.034,-0.031,-0.014,-0.034,62407,59010,8996,...,0.0026,0.0027,0.001,0.0017,0.0286,0.0115,0.0171,0.1046,0.0598,0.0448


In [757]:
df.shape

(254, 57)

In [758]:
df.to_csv('Jacks_df.csv')