In [84]:
import pandas as pd

## Prepare Median Income Data

In [85]:
# Load the median-income table from the project's Data directory.
income = pd.read_csv('./Data/median_income.csv')

In [86]:
# Preview the first five rows of the median-income table.
income.head()

Unnamed: 0,county_name,median_income_2012_2016_16_dollars,median_income_2020_16_dollars
0,Anderson,42146,44871
1,Andrews,70121,69369
2,Angelina,44185,47917
3,Aransas,44851,49153
4,Archer,62407,59010


## Prepare unemployment data

In [87]:
# Load the unemployment-rate table from the project's Data directory.
unemp = pd.read_csv('./Data/unemployment_rate_by_county.csv')

In [88]:
# Preview the first five rows of the raw unemployment table.
unemp.head()

Unnamed: 0,county_id,county_name,2012,2016,2017,2018,2019,2020
0,1,Anderson,0.036,0.043,0.037,0.031,0.029,0.056
1,2,Andrews,0.031,0.047,0.031,0.024,0.023,0.082
2,3,Angelina,0.051,0.059,0.051,0.044,0.04,0.076
3,4,Aransas,0.068,0.056,0.069,0.06,0.043,0.085
4,5,Archer,0.034,0.043,0.034,0.031,0.028,0.053


In [89]:
# Drop the year columns and the id column that are not used further on.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError for columns
# that were already removed.
unemp = unemp.drop(columns=['2017', '2018', '2019', 'county_id'], errors='ignore')

In [90]:
# Rename the yearly rate columns to '<year>_unemp' and round them to 6 decimals.
# rename() ignores labels that are absent, so re-running this cell is a no-op
# rather than a KeyError, and the column order is the same as adding the new
# columns and dropping the old ones.
unemp_col_names = {'2012': '2012_unemp', '2016': '2016_unemp', '2020': '2020_unemp'}

unemp = (
    unemp
    .rename(columns=unemp_col_names)
    # A dict restricts rounding to the listed columns; county_name is untouched.
    .round({new_name: 6 for new_name in unemp_col_names.values()})
)

In [91]:
# We'll use the statewide unemployment data to create a column for the difference
# between a county's unemployment rate and the statewide rate.
state_unemp = pd.read_csv('./Data/StatewideEnemployment.csv', index_col = 'DATE')

# Statewide values are divided by 100 (presumably stored in percent, while the
# county rates are fractions -- confirm against the CSV).
# NOTE(review): each year is paired with the 1 January reading of the *following*
# year -- confirm this pairing is intended.
unemp_2012 = state_unemp.loc['2013-01-01']/100
unemp_2016 = state_unemp.loc['2017-01-01']/100
unemp_2020 = state_unemp.loc['2021-01-01']/100

for year, state_row in (('2012', unemp_2012), ('2016', unemp_2016), ('2020', unemp_2020)):
    # .loc on a single date label yields a row Series; .item() extracts its one
    # value as a scalar and raises if the row does not hold exactly one value,
    # instead of silently building a DataFrame inside a per-row lambda.
    state_rate = state_row.item()
    unemp[year + '_var_state'] = (unemp[year + '_unemp'] - state_rate).round(6)

In [92]:
# Preview the unemployment table after renaming and adding the *_var_state columns.
unemp.head()

Unnamed: 0,county_name,2012_unemp,2016_unemp,2020_unemp,2012_var_state,2016_var_state,2020_var_state
0,Anderson,0.036,0.043,0.056,-0.029,-0.005,-0.012
1,Andrews,0.031,0.047,0.082,-0.034,-0.001,0.014
2,Angelina,0.051,0.059,0.076,-0.014,0.011,0.008
3,Aransas,0.068,0.056,0.085,0.003,0.008,0.017
4,Archer,0.034,0.043,0.053,-0.031,-0.005,-0.015


Merge the unemployment and median-income dataframes on `county_name`.

In [93]:
# Left join: keep every row of the unemployment table and attach the income
# columns wherever county_name matches.
df = pd.merge(unemp, income, how='left', on='county_name')

In [94]:
# Preview the merged unemployment + income table.
df.head()

Unnamed: 0,county_name,2012_unemp,2016_unemp,2020_unemp,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars
0,Anderson,0.036,0.043,0.056,-0.029,-0.005,-0.012,42146,44871
1,Andrews,0.031,0.047,0.082,-0.034,-0.001,0.014,70121,69369
2,Angelina,0.051,0.059,0.076,-0.014,0.011,0.008,44185,47917
3,Aransas,0.068,0.056,0.085,0.003,0.008,0.017,44851,49153
4,Archer,0.034,0.043,0.053,-0.031,-0.005,-0.015,62407,59010


In [95]:
# Check we still have 254 counties: a left join can add rows when the right-hand
# table repeats a county_name, so enforce the invariant instead of eyeballing it.
assert len(df) == 254, 'expected 254 counties after the merge, got ' + str(len(df))
df.shape

(254, 9)

## Prepare demographic data

In [96]:
# Read the three demographic CSV files in one pass.
demo_12, demo_16, demo_19 = map(
    pd.read_csv,
    (
        './Data/2012_demo.csv',
        './Data/2016_demo.csv',
        './Data/2019_demo.csv',
    ),
)

In [97]:
def clean_demo_df(df, yr):
    """Reduce a raw demographic table to per-row shares of 'Total'.

    Keeps only the rows whose 'Age' is 'All Ages', removes the
    'STATE OF TEXAS' row, drops the 'Age', 'FIPS' and 'County' columns, and
    replaces every remaining column except 'Total' by its fraction of
    'Total', rounded to 4 decimal places. Every output column, including
    'Total', gets ``yr`` appended to its name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least 'Age', 'County', 'FIPS' and 'Total' columns.
        The remaining columns are presumably numeric counts -- TODO confirm.
    yr : str
        Suffix appended to each output column name (e.g. '12').

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input frame is not modified.
    """
    keep_rows = (df['Age'] == 'All Ages') & (df['County'] != 'STATE OF TEXAS')

    # Build a fresh frame rather than mutating a filtered slice in place; this
    # avoids the chained-assignment warning and the need to insert and then
    # drop a temporary 'index' column.
    cleaned = (
        df.loc[keep_rows]
        .drop(columns=['Age', 'FIPS', 'County'])
        .reset_index(drop=True)
    )

    # NOTE: a 'Total' of 0 is not guarded against; the division result is left
    # as pandas produces it.
    for col in cleaned.columns.drop('Total'):
        cleaned[col] = (cleaned[col] / cleaned['Total']).round(4)

    return cleaned.rename(columns=lambda name: name + yr)


#demo_12 = clean_demo_df(demo_12, '12')
#demo_16 = clean_demo_df(demo_16, '16')
#demo_19 = clean_demo_df(demo_19, '19')


In [98]:
# Shape of the raw 2012 demographic table (the clean_demo_df calls above are commented out).
demo_12.shape

(22185, 18)

In [99]:
#df = df.merge(right = demo_12, how = 'left', left_index = True, right_index = True)
#df = df.merge(right = demo_16, how = 'left', left_index = True, right_index = True)
#df = df.merge(right = demo_19, how = 'left', left_index = True, right_index = True)

In [100]:
# The demographic merges above are commented out, so df still holds only the
# unemployment and income columns.
df.head()

Unnamed: 0,county_name,2012_unemp,2016_unemp,2020_unemp,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars
0,Anderson,0.036,0.043,0.056,-0.029,-0.005,-0.012,42146,44871
1,Andrews,0.031,0.047,0.082,-0.034,-0.001,0.014,70121,69369
2,Angelina,0.051,0.059,0.076,-0.014,0.011,0.008,44185,47917
3,Aransas,0.068,0.056,0.085,0.003,0.008,0.017,44851,49153
4,Archer,0.034,0.043,0.053,-0.031,-0.005,-0.015,62407,59010


In [101]:
# Re-check the row and column counts before exporting.
df.shape

(254, 9)

In [102]:
# Write the combined table to disk. With the default index=True the row index is
# written as an unnamed first column of the CSV.
df.to_csv('Jacks_df.csv')