## Combining dataframes from group members into one dataframe

In [97]:
import pandas as pd

In [98]:
# Load Jack's county-level frame, using the CSV's 'Unnamed: 0' column as the row index.
jack = pd.read_csv('Jacks_df.csv', index_col = 'Unnamed: 0')

In [99]:
# Build the join key in the "<lowercase name> county" form (the county labels in
# the voting data look like 'bosque county'). The vectorised .str accessor
# handles the whole column at once and leaves a missing name as NaN instead of
# raising inside a per-element lambda.
jack['county'] = jack['county_name'].str.lower() + ' county'
# NOTE(review): `index` is not referenced again in the visible notebook; it is
# kept only so any cell that relies on it keeps working.
index = jack['county']

In [100]:
# The original mixed-case name column is redundant once 'county' exists.
jack = jack.drop(columns='county_name')

In [101]:
# Key the frame by the normalised county name so later merges can align on the index.
jack = jack.set_index('county')

In [102]:
# Check the (rows, columns) dimensions after re-indexing by county.
jack.shape

(254, 8)

In [103]:
# Load registration/turnout figures, using the CSV's 'Unnamed: 0' column as the row index.
voting_nums = pd.read_csv('./data/voting_numbers.csv', index_col = 'Unnamed: 0')

In [104]:
# Preview the first rows of the raw voting figures.
voting_nums.head()

Unnamed: 0,county,2012_reg_voters,2012_voted_num,2012_voted_perc,2012_early_vote_num,2012_early_vote_perc,2016_reg_voters,2016_voted_num,2016_voted_perc,2016_early_vote_num,2016_early_vote_perc,2020_reg_voters,2020_voted_num,2020_voted_perc,2020_early_vote_num,2020_early_vote_perc
0,bosque county,11977,7343,61.31%,2563,21.40%,12002,7823,65.18%,3898,32.48%,12724,9094,71.47%,6535,71.86%
1,brown county,22565,13944,61.79%,8713,38.61%,23424,14018,59.84%,9986,42.63%,23954,15940,66.54%,12544,78.70%
2,jimhogg county,3804,1667,43.82%,590,15.51%,3942,2119,53.75%,1331,33.76%,3800,2034,53.53%,1444,70.99%
3,austin county,18005,11644,64.67%,6046,33.58%,19263,12273,63.71%,7351,38.16%,20293,14368,70.80%,11550,80.39%
4,franklin county,6663,4256,63.88%,2350,35.27%,6798,4377,64.39%,2746,40.39%,7061,5000,70.81%,3938,78.76%


In [105]:
# Remove fully duplicated rows before the county column becomes the index.
voting_nums = voting_nums.drop_duplicates()

In [106]:
# Bare expression in the middle of a cell: it is evaluated, but Jupyter only
# echoes a cell's last expression, so nothing is displayed for it.
voting_nums.shape
# Key the frame by county name so it can be merged on the index.
voting_nums = voting_nums.set_index('county')

In [107]:
# Inspect column dtypes before cleaning; the output below shows every column is still `object`.
voting_nums.dtypes

2012_reg_voters         object
2012_voted_num          object
2012_voted_perc         object
2012_early_vote_num     object
2012_early_vote_perc    object
2016_reg_voters         object
2016_voted_num          object
2016_voted_perc         object
2016_early_vote_num     object
2016_early_vote_perc    object
2020_reg_voters         object
2020_voted_num          object
2020_voted_perc         object
2020_early_vote_num     object
2020_early_vote_perc    object
dtype: object

In [108]:
# Preview the frame now that county is the index.
voting_nums.head()

Unnamed: 0_level_0,2012_reg_voters,2012_voted_num,2012_voted_perc,2012_early_vote_num,2012_early_vote_perc,2016_reg_voters,2016_voted_num,2016_voted_perc,2016_early_vote_num,2016_early_vote_perc,2020_reg_voters,2020_voted_num,2020_voted_perc,2020_early_vote_num,2020_early_vote_perc
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
bosque county,11977,7343,61.31%,2563,21.40%,12002,7823,65.18%,3898,32.48%,12724,9094,71.47%,6535,71.86%
brown county,22565,13944,61.79%,8713,38.61%,23424,14018,59.84%,9986,42.63%,23954,15940,66.54%,12544,78.70%
jimhogg county,3804,1667,43.82%,590,15.51%,3942,2119,53.75%,1331,33.76%,3800,2034,53.53%,1444,70.99%
austin county,18005,11644,64.67%,6046,33.58%,19263,12273,63.71%,7351,38.16%,20293,14368,70.80%,11550,80.39%
franklin county,6663,4256,63.88%,2350,35.27%,6798,4377,64.39%,2746,40.39%,7061,5000,70.81%,3938,78.76%


In [109]:
def percent_to_float(percent):
    """
    Converts string percentage to decimal proportion.

    percent - string holding a number, normally ending in "%" (e.g. "61.31%").
              Surrounding whitespace and thousands separators (",") are
              ignored, and the trailing "%" is optional, so "50" means 50%.

    Returns the proportion as a float (e.g. "61.31%" -> 0.6131).
    Raises TypeError if percent is not a string and ValueError if the
    remaining text is not a valid number.
    """
    if not isinstance(percent, str):
        raise TypeError(f"percent must be a string, got {type(percent).__name__}")

    text = percent.strip()
    # Only drop the "%" when it is really there: slicing off the last
    # character unconditionally would silently turn "50" into 5.
    if text.endswith('%'):
        text = text[:-1]
    return float(text.replace(',', '')) / 100

In [110]:
# Percentage columns of voting_nums: overall and early turnout for each election year.
perc_list = [f'{year}_{measure}_perc'
             for year in (2012, 2016, 2020)
             for measure in ('voted', 'early_vote')]

In [111]:
# Turn each "NN.NN%" text column into a decimal proportion.
for perc_col in perc_list:
    as_fraction = voting_nums[perc_col].map(percent_to_float)
    voting_nums[perc_col] = as_fraction

In [112]:
# Strip thousands separators from every column that is still text, then convert
# each column to a numeric dtype with one vectorised call per column.
for col in voting_nums.columns:
    # Testing "not numeric" rather than dtype == 'object' also catches pandas'
    # dedicated string dtypes; .str.replace leaves missing values untouched
    # instead of raising the way a per-element lambda does.
    if not pd.api.types.is_numeric_dtype(voting_nums[col]):
        voting_nums[col] = voting_nums[col].str.replace(",", "", regex=False)
    voting_nums[col] = pd.to_numeric(voting_nums[col])

In [113]:
# Inspect jack's column dtypes before joining it with the voting data.
jack.dtypes

2012_unemp                            float64
2016_unemp                            float64
2020_unemp                            float64
2012_var_state                        float64
2016_var_state                        float64
2020_var_state                        float64
median_income_2012_2016_16_dollars      int64
median_income_2020_16_dollars           int64
dtype: object

In [114]:
# Left-join the voting figures onto jack by county (both frames are indexed by
# county name), keeping every row of jack.
df = jack.merge(right=voting_nums, how='left', left_index=True, right_index=True)

# A left join silently fills counties that are missing from voting_nums with
# NaN (which also turns the integer count columns into floats), so report any
# county names that failed to match instead of hiding them.
unmatched_counties = jack.index.difference(voting_nums.index)
if len(unmatched_counties) > 0:
    print(f"{len(unmatched_counties)} counties have no row in voting_nums: "
          f"{list(unmatched_counties)}")

In [115]:
# Preview the merged frame.
df.head()

Unnamed: 0_level_0,2012_unemp,2016_unemp,2020_unemp,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars,2012_reg_voters,2012_voted_num,...,2016_reg_voters,2016_voted_num,2016_voted_perc,2016_early_vote_num,2016_early_vote_perc,2020_reg_voters,2020_voted_num,2020_voted_perc,2020_early_vote_num,2020_early_vote_perc
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
anderson county,0.036,0.043,0.056,-0.029,-0.005,-0.012,42146,44871,26494.0,16168.0,...,28111.0,16887.0,0.6007,10792.0,0.3839,29274.0,19155.0,0.6543,14459.0,0.7548
andrews county,0.031,0.047,0.082,-0.034,-0.001,0.014,70121,69369,8271.0,4478.0,...,9130.0,4941.0,0.5412,3429.0,0.3756,10272.0,5856.0,0.5701,4776.0,0.8156
angelina county,0.051,0.059,0.076,-0.014,0.011,0.008,44185,47917,49317.0,28460.0,...,51610.0,29896.0,0.5793,20428.0,0.3958,53166.0,34628.0,0.6513,28676.0,0.8281
aransas county,0.068,0.056,0.085,0.003,0.008,0.017,44851,49153,15883.0,9646.0,...,17263.0,10486.0,0.6074,8421.0,0.4878,18306.0,12241.0,0.6687,10983.0,0.8772
archer county,0.034,0.043,0.053,-0.031,-0.005,-0.015,62407,59010,6322.0,4163.0,...,6314.0,4277.0,0.6774,2284.0,0.3617,6538.0,4796.0,0.7336,3623.0,0.7554


In [116]:
# Check the dimensions after the left merge with voting_nums.
df.shape

(254, 23)

In [117]:
# Load the per-party vote data; no index_col is given, so pandas assigns a default integer index.
voting_party_data = pd.read_csv('./data/voting_party_data.csv')

In [118]:
# Check the dimensions of the party data as loaded.
voting_party_data.shape

(254, 33)

In [119]:
# Preview the first five rows (head() already limits the result to five).
voting_party_data.head()

Unnamed: 0,County,2020_rep_vote_count,2020_rep_vote_perc,2020_dem_vote_count,2020_dem_vote_perc,2020_lib_vote_count,2020_lib_vote_perc,2020_grn_vote_count,2020_grn_vote_perc,2020_oth_vote_count,...,2012_rep_vote_perc,2012_rep_vote_count,2012_dem_vote_perc,2012_dem_vote_count,2012_oth_vote_perc,2012_oth_vote_count,2012_vote_total,2012_CWPP,2016_CWPP,2020_CWPP
0,anderson county,15110,78.59%,3955,20.57%,134,0.70%,22,0.11%,6,...,75.64%,12262,23.52%,3813,0.85%,137,16212,0,0,0
1,andrews county,4943,84.31%,850,14.50%,60,1.02%,10,0.17%,0,...,81.19%,3639,17.74%,795,1.07%,48,4482,0,0,0
2,angelina county,25076,72.53%,9143,26.44%,274,0.79%,75,0.22%,6,...,71.47%,20303,27.58%,7834,0.95%,269,28406,0,0,0
3,aransas county,9239,75.17%,2916,23.73%,103,0.84%,31,0.25%,1,...,70.79%,6830,28.03%,2704,1.18%,114,9648,0,1,1
4,archer county,4300,89.66%,446,9.30%,45,0.94%,4,0.08%,1,...,86.46%,3600,12.61%,525,0.94%,39,4164,0,0,1


In [120]:
# Key the party data by its 'County' column so it can be merged on the index.
voting_party_data = voting_party_data.set_index('County')

In [121]:
# Collect every column whose name contains "perc"; these hold "NN.NN%" text.
perc_list = list(filter(lambda name: "perc" in name, voting_party_data.columns))

In [122]:
# Turn each "NN.NN%" text column into a decimal proportion.
for perc_col in perc_list:
    as_fraction = voting_party_data[perc_col].map(percent_to_float)
    voting_party_data[perc_col] = as_fraction

In [123]:
# Strip thousands separators from every column that is still text, then convert
# each column to a numeric dtype with one vectorised call per column.
for col in voting_party_data.columns:
    # Testing "not numeric" rather than dtype == 'object' also catches pandas'
    # dedicated string dtypes; .str.replace leaves missing values untouched
    # instead of raising the way a per-element lambda does.
    if not pd.api.types.is_numeric_dtype(voting_party_data[col]):
        voting_party_data[col] = voting_party_data[col].str.replace(",", "", regex=False)
    voting_party_data[col] = pd.to_numeric(voting_party_data[col])

In [124]:
# Confirm that every column is numeric after the conversion.
voting_party_data.dtypes

2020_rep_vote_count       int64
2020_rep_vote_perc      float64
2020_dem_vote_count       int64
2020_dem_vote_perc      float64
2020_lib_vote_count       int64
2020_lib_vote_perc      float64
2020_grn_vote_count       int64
2020_grn_vote_perc      float64
2020_oth_vote_count       int64
2020_oth_vote_perc      float64
2020_votes_total          int64
2016_rep_vote_counte      int64
2016_rep_vote_perc      float64
2016_dem_vote_count       int64
2016_dem_vote_perc      float64
2016_lib_vote_count       int64
2016_lib_vote_perc      float64
2016_grn_vote_count       int64
2016_grn_vote_perc      float64
2016_oth_vote_count       int64
2016_oth_vote_perc      float64
2016_votes_total          int64
2012_rep_vote_perc      float64
2012_rep_vote_count       int64
2012_dem_vote_perc      float64
2012_dem_vote_count       int64
2012_oth_vote_perc      float64
2012_oth_vote_count       int64
2012_vote_total           int64
2012_CWPP                 int64
2016_CWPP                 int64
2020_CWPP                 int64
dtype: object

In [125]:
# Use the lowercase index name 'county', matching the name df's index already carries.
voting_party_data.index.name = 'county'

In [126]:
# Left-join the party vote data onto the combined frame, aligning on the county index.
df = pd.merge(
    df,
    voting_party_data,
    how='left',
    left_index=True,
    right_index=True,
)


In [127]:
# Check the dimensions after adding the party vote columns.
df.shape

(254, 55)

In [128]:
# Preview the combined frame with the party vote columns appended.
df.head()

Unnamed: 0_level_0,2012_unemp,2016_unemp,2020_unemp,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars,2012_reg_voters,2012_voted_num,...,2012_rep_vote_perc,2012_rep_vote_count,2012_dem_vote_perc,2012_dem_vote_count,2012_oth_vote_perc,2012_oth_vote_count,2012_vote_total,2012_CWPP,2016_CWPP,2020_CWPP
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
anderson county,0.036,0.043,0.056,-0.029,-0.005,-0.012,42146,44871,26494.0,16168.0,...,0.7564,12262,0.2352,3813,0.0085,137,16212,0,0,0
andrews county,0.031,0.047,0.082,-0.034,-0.001,0.014,70121,69369,8271.0,4478.0,...,0.8119,3639,0.1774,795,0.0107,48,4482,0,0,0
angelina county,0.051,0.059,0.076,-0.014,0.011,0.008,44185,47917,49317.0,28460.0,...,0.7147,20303,0.2758,7834,0.0095,269,28406,0,0,0
aransas county,0.068,0.056,0.085,0.003,0.008,0.017,44851,49153,15883.0,9646.0,...,0.7079,6830,0.2803,2704,0.0118,114,9648,0,1,1
archer county,0.034,0.043,0.053,-0.031,-0.005,-0.015,62407,59010,6322.0,4163.0,...,0.8646,3600,0.1261,525,0.0094,39,4164,0,0,1


In [129]:
# Load the asr_12_16_20 table; its columns are population counts named by
# group, year and age band (e.g. 'total_2012_18 to 27').
asr = pd.read_csv('./data/asr_12_16_20.csv')

In [130]:
# Preview the first rows of the asr table.
asr.head()

Unnamed: 0,county,total_2012_18 to 27,total_2012_28 to 37,total_2012_38 to 47,total_2012_48 to 57,total_2012_58 to 67,total_2012_68 to 77,total_2012_78 to 87,total_2012_88 to 97,total_2016_18 to 27,...,hispanic_female_2016_78 to 87,hispanic_female_2016_88 to 97,hispanic_female_2020_18 to 27,hispanic_female_2020_28 to 37,hispanic_female_2020_38 to 47,hispanic_female_2020_48 to 57,hispanic_female_2020_58 to 67,hispanic_female_2020_68 to 77,hispanic_female_2020_78 to 87,hispanic_female_2020_88 to 97
0,anderson county,7387.0,9358.0,9419.0,8915.0,6285.0,3832.0,2313.0,,7062.0,...,36.0,,633.0,466.0,460.0,360.0,207.0,153.0,40.0,20.0
1,andrews county,2209.0,2050.0,1909.0,2251.0,1473.0,884.0,621.0,,2450.0,...,99.0,,1054.0,1131.0,809.0,541.0,362.0,213.0,92.0,30.0
2,angelina county,11619.0,10862.0,11243.0,11881.0,9576.0,6062.0,4098.0,,12110.0,...,108.0,,1654.0,1335.0,1337.0,971.0,666.0,342.0,123.0,30.0
3,aransas county,2353.0,2019.0,2457.0,3557.0,3908.0,3248.0,1753.0,,2458.0,...,130.0,,621.0,596.0,454.0,410.0,362.0,243.0,121.0,31.0
4,archer county,1022.0,832.0,1156.0,1587.0,1111.0,776.0,462.0,,1213.0,...,11.0,,67.0,49.0,42.0,43.0,25.0,7.0,12.0,5.0


In [132]:
# Key the asr table by county so it can be merged on the index.
asr = asr.set_index('county')

In [133]:
# Left-join the asr columns onto the combined frame, aligning on the county index.
df = pd.merge(
    df,
    asr,
    how='left',
    left_index=True,
    right_index=True,
)



In [134]:
# Preview the fully combined frame.
df.head()

Unnamed: 0_level_0,2012_unemp,2016_unemp,2020_unemp,2012_var_state,2016_var_state,2020_var_state,median_income_2012_2016_16_dollars,median_income_2020_16_dollars,2012_reg_voters,2012_voted_num,...,hispanic_female_2016_78 to 87,hispanic_female_2016_88 to 97,hispanic_female_2020_18 to 27,hispanic_female_2020_28 to 37,hispanic_female_2020_38 to 47,hispanic_female_2020_48 to 57,hispanic_female_2020_58 to 67,hispanic_female_2020_68 to 77,hispanic_female_2020_78 to 87,hispanic_female_2020_88 to 97
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
anderson county,0.036,0.043,0.056,-0.029,-0.005,-0.012,42146,44871,26494.0,16168.0,...,36.0,,633.0,466.0,460.0,360.0,207.0,153.0,40.0,20.0
andrews county,0.031,0.047,0.082,-0.034,-0.001,0.014,70121,69369,8271.0,4478.0,...,99.0,,1054.0,1131.0,809.0,541.0,362.0,213.0,92.0,30.0
angelina county,0.051,0.059,0.076,-0.014,0.011,0.008,44185,47917,49317.0,28460.0,...,108.0,,1654.0,1335.0,1337.0,971.0,666.0,342.0,123.0,30.0
aransas county,0.068,0.056,0.085,0.003,0.008,0.017,44851,49153,15883.0,9646.0,...,130.0,,621.0,596.0,454.0,410.0,362.0,243.0,121.0,31.0
archer county,0.034,0.043,0.053,-0.031,-0.005,-0.015,62407,59010,6322.0,4163.0,...,11.0,,67.0,49.0,42.0,43.0,25.0,7.0,12.0,5.0


In [135]:
# Check the final dimensions before saving.
df.shape

(254, 415)

In [136]:
# Save the combined frame to combined_df.csv in the working directory
# (to_csv writes the county index as the first column by default).
df.to_csv('combined_df.csv')