In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd

## Cleaning the racial data

In [2]:
# Read csv file to dataframe
race =  pd.read_csv('data/race_county_data/original_georgia_race_data.csv', skiprows = 1)

pd.options.display.width=None
pd.set_option('max_row', None)
pd.set_option('display.max_rows', race.shape[0] + 1)
pd.set_option('display.expand_frame_repr', False)

In [3]:
print(race.columns)
print(race.shape)

Index(['id', 'Geographic Area Name', ' !!Total:',
       ' !!Total:!!Hispanic or Latino', ' !!Total:!!Not Hispanic or Latino:',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone',
       ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone',
       ' !!Total:!!Not Hispanic or Latino:!!Population of two or more races:',
       ' !!Total:!!Not Hispanic or Latino:!!Population of two or more races:!!Population of two races:',
       ' !!Total:!!Not Hi

There's a weird space at the start of each column name so we can remove that. We will combine the columns with the data on the number of people that identify their race as 'American Indian and Alaska Native alone, Native Hawaiian' and 'Other Pacific Islander alone' and 'Some Other Race alone' into an 'Others' column. We wil also combine the data on the number of people that identify as more than one race as 'Mixed'.

In [4]:
race.columns = race.columns.str.lstrip()

race.columns = race.columns.map(str)

race.rename(columns={'!!Total:!!Not Hispanic or Latino:!!Population of two or more races:':'Mixed'}, inplace = True)

race['Others'] = race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone'] + race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone'] + race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone']



Then we create a new datafram with only the columns we want: 'Total', 'White', 'Black', 'Asian' and the previously created columns of 'Others' and 'Mixed. We will also rename the columns accordingly.

In [5]:
# Create a new dataframe with the desired columns
race = race[['Geographic Area Name', 'id', '!!Total:', '!!Total:!!Hispanic or Latino', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone','!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone', 'Mixed', 'Others' ]].copy()

# Rename the columns
race.columns = ['Area Name', 'id', 'Total', 'Hispanic', 'White', 'Black', 'Asian', 'Mixed', 'Others']

In [6]:
print(race.head())

                  Area Name              id  Total  Hispanic  White  Black  Asian  Mixed  Others
0   Appling County, Georgia  0500000US13001  18444      1825  12674   3339    123    417      66
1  Atkinson County, Georgia  0500000US13003   8286      2048   4801   1208     12    167      50
2     Bacon County, Georgia  0500000US13005  11140       875   8103   1747     40    335      40
3     Baker County, Georgia  0500000US13007   2876       143   1514   1128     18     70       3
4   Baldwin County, Georgia  0500000US13009  43799      1139  22432  18318    599   1027     284


In [7]:
print(race.head(20))


                   Area Name              id   Total  Hispanic  White  Black  Asian  Mixed  Others
0    Appling County, Georgia  0500000US13001   18444      1825  12674   3339    123    417      66
1   Atkinson County, Georgia  0500000US13003    8286      2048   4801   1208     12    167      50
2      Bacon County, Georgia  0500000US13005   11140       875   8103   1747     40    335      40
3      Baker County, Georgia  0500000US13007    2876       143   1514   1128     18     70       3
4    Baldwin County, Georgia  0500000US13009   43799      1139  22432  18318    599   1027     284
5      Banks County, Georgia  0500000US13011   18035      1164  15578    394    189    620      90
6     Barrow County, Georgia  0500000US13013   83505     10560  55582  10141   3233   3383     606
7     Bartow County, Georgia  0500000US13015  108901     10751  80159  11309   1169   4753     760
8   Ben Hill County, Georgia  0500000US13017   17194      1054   9219   6222    116    478     105
9    Berri

Creating rows with density. 

In [9]:
for ind, row in race.iterrows():
  race.loc[ind,"Population Density:Hispanic"]= row ['Hispanic']/row['Total']
  race.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

We get the error that we have a division by 0.  159 counties

In [None]:
race.loc[race['Total']==0]

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
208,"Fort Stewart CCD, Bryan County, Georgia",0600000US1302991236,0,0,0,0,0,0,0,,,,,,


In [12]:
race.tail(20)

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
139,"Treutlen County, Georgia",0500000US13283,6406,170,4065,1999,7,139,26,0.026538,0.634561,0.312051,0.001093,0.021698,0.004059
140,"Troup County, Georgia",0500000US13285,69426,2956,38099,24157,1608,2204,402,0.042578,0.548771,0.347953,0.023161,0.031746,0.00579
141,"Turner County, Georgia",0500000US13287,9006,372,4700,3644,49,209,32,0.041306,0.521874,0.404619,0.005441,0.023207,0.003553
142,"Twiggs County, Georgia",0500000US13289,8022,124,4487,3099,37,241,34,0.015457,0.559337,0.386313,0.004612,0.030042,0.004238
143,"Union County, Georgia",0500000US13291,24632,816,22646,126,100,807,137,0.033128,0.919373,0.005115,0.00406,0.032762,0.005562
144,"Upson County, Georgia",0500000US13293,27700,633,18009,7851,151,886,170,0.022852,0.650144,0.28343,0.005451,0.031986,0.006137
145,"Walker County, Georgia",0500000US13295,67654,1685,59654,2840,293,2804,378,0.024906,0.881751,0.041978,0.004331,0.041446,0.005587
146,"Walton County, Georgia",0500000US13297,96673,5228,68499,17136,1409,3617,784,0.054079,0.708564,0.177257,0.014575,0.037415,0.00811
147,"Ware County, Georgia",0500000US13299,36251,1612,22275,10703,333,1121,207,0.044468,0.614466,0.295247,0.009186,0.030923,0.00571
148,"Warren County, Georgia",0500000US13301,5215,53,1974,3047,15,97,29,0.010163,0.378523,0.584276,0.002876,0.0186,0.005561


Downloading the dataframes into new files.

In [13]:
race.to_csv('data/race_county_data/cleaned_georgia_race_county.csv')


In [None]:
print(race.shape)

(745, 9)


# Preparing the polling site data

Had to use this: https://stackoverflow.com/questions/45690830/reading-in-csv-file-to-pandas-fails

In [32]:
# Read csv file to dataframe
polling_site =  pd.read_csv('data/polling_site_data/original_polling_site_data_2.csv', encoding="utf-16", sep='\t')


In [33]:
print(polling_site)

       County                      Polling Place First Check-In Last Check-In  Active Registered Voters  Total Election Day Check-Ins  Total EV  Voters/Hour
0     APPLING                    BAXLEY CITY GYM       07:16:00      19:07:00                      1160                           163       275        13.76
1     APPLING  BIG OAKS CH. OF GOD (SOCIAL HALL)       07:05:00      18:50:00                      1108                           237       247        20.17
2     APPLING       EXTENSION EDUCATION BLDG/ 4H       07:04:00      18:56:00                      1573                           195       433        16.43
3     APPLING      BAX CH OF GOD/FELLOWSHIP HALL       07:03:00      18:56:00                      1462                           373       453        31.39
4     APPLING   LIONS CLUB BLDG/ AT FAIR GROUNDS       07:03:00      18:53:00                      2226                           313       442        26.45
...       ...                                ...          

In [48]:
polling_site['County'].value_counts()

DEKALB           168
FULTON           167
GWINNETT         156
COBB             143
CHATHAM           90
CLAYTON           58
COLUMBIA          46
RICHMOND          42
CHEROKEE          39
HENRY             37
FAYETTE           35
BIBB              31
HALL              31
CARROLL           28
DOUGHERTY         28
FLOYD             25
MUSCOGEE          25
COWETA            25
CLARKE            24
DOUGLAS           23
WHITFIELD         23
NEWTON            22
WALTON            21
FORSYTH           20
GLYNN             20
PAULDING          19
COLQUITT          19
SPALDING          18
THOMAS            17
EFFINGHAM         17
ROCKDALE          16
DODGE             16
BURKE             16
BULLOCH           16
HOUSTON           16
BARTOW            16
LAURENS           16
WORTH             15
BALDWIN           14
MERIWETHER        14
TROUP             14
MONROE            14
GILMER            13
LIBERTY           13
HARALSON          12
TIFT              12
FANNIN            12
GORDON       

In [58]:
counties = polling_site['County'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties)



            County  Number of polling sites
0           DEKALB                      168
1           FULTON                      167
2         GWINNETT                      156
3             COBB                      143
4          CHATHAM                       90
5          CLAYTON                       58
6         COLUMBIA                       46
7         RICHMOND                       42
8         CHEROKEE                       39
9            HENRY                       37
10         FAYETTE                       35
11            BIBB                       31
12            HALL                       31
13         CARROLL                       28
14       DOUGHERTY                       28
15           FLOYD                       25
16        MUSCOGEE                       25
17          COWETA                       25
18          CLARKE                       24
19         DOUGLAS                       23
20       WHITFIELD                       23
21          NEWTON              

In [61]:
counties.columns = counties.columns.map(str)
counties = counties.astype({'County': str, 'Number of polling sites': int})

counties['County'] = counties['County'].str.capitalize()

print(counties)

            County  Number of polling sites
0           Dekalb                      168
1           Fulton                      167
2         Gwinnett                      156
3             Cobb                      143
4          Chatham                       90
5          Clayton                       58
6         Columbia                       46
7         Richmond                       42
8         Cherokee                       39
9            Henry                       37
10         Fayette                       35
11            Bibb                       31
12            Hall                       31
13         Carroll                       28
14       Dougherty                       28
15           Floyd                       25
16        Muscogee                       25
17          Coweta                       25
18          Clarke                       24
19         Douglas                       23
20       Whitfield                       23
21          Newton              

In [62]:
counties.to_csv('data/polling_site_data/polling_sites_in_counties.csv')

## Viewing the shapefile and checking if merging works

In [None]:
shapefile = gpd.read_file("data/georgia_shapefiles/2018Precincts.shp")

shapefile.head()

Unnamed: 0,loc_prec,locality,prec_shp,prec_elec,G18DATG,G18DCmAg,G18DCmIns,G18DCmLab,G18DGOV,G18DLTG,...,NHblack,hispanic,totVAP,WVAP,BVAP,HVAP,CD,HD,SD,geometry
0,"Fulton,08P",Fulton,08P,08P,732.0,699.0,693.0,709.0,768.0,736.0,...,480.0,65.0,1606.0,801.0,437.0,61.0,5,55,38,"POLYGON ((-84.39979 33.79360, -84.40059 33.792..."
1,"Fulton,Ss09B",Fulton,SS09B,SS09B,1022.0,912.0,996.0,958.0,1123.0,1053.0,...,102.0,140.0,2898.0,2611.0,69.0,84.0,11,52,6,"POLYGON ((-84.38921 33.87892, -84.38906 33.878..."
2,"Fulton,03A",Fulton,03A,03A,641.0,627.0,614.0,639.0,677.0,605.0,...,2206.0,36.0,1779.0,28.0,1700.0,21.0,5,56,39,"POLYGON ((-84.43052 33.75951, -84.43048 33.759..."
3,"Fulton,07J",Fulton,07J,07J,1037.0,951.0,988.0,983.0,1129.0,1067.0,...,408.0,185.0,2915.0,2245.0,382.0,147.0,5,55,39,"POLYGON ((-84.38250 33.81341, -84.38246 33.813..."
4,"Fulton,09E",Fulton,09E,09E,1249.0,1243.0,1229.0,1254.0,1340.0,1247.0,...,2824.0,60.0,2182.0,119.0,1960.0,46.0,5,53,38,"POLYGON ((-84.43785 33.77598, -84.44003 33.775..."


In [None]:
print(race[race['Area Name'].str.contains('Fulton')])
print(race[race['Area Name'].str.contains('Columbia')])

                                           Area Name                   id    Total  Hispanic   White   Black  Asian  Mixed  Others
59                            Fulton County, Georgia       0500000US13121  1066710     86302  404793  448803  80632  37797    8383
379              Atlanta CCD, Fulton County, Georgia  0600000US1312190144   625717     45997  231272  291119  30735  21959    4635
380         College Park CCD, Fulton County, Georgia  0600000US1312190732    12679       771    1708    9643    105    349     103
381           East Point CCD, Fulton County, Georgia  0600000US1312191020    37414      3947    3322   28569    229   1060     287
382  Fairburn-Union City CCD, Fulton County, Georgia  0600000US1312191128    99509      5759    4323   85637    642   2299     849
383             Palmetto CCD, Fulton County, Georgia  0600000US1312192298     8420       784    3196    4054     52    281      53
384   Roswell-Alpharetta CCD, Fulton County, Georgia  0600000US1312192604   282971 

The names of the area are different. Will remove ',', 'County' and ',Georgia from the race data. Then will create a new column joining the columns 'prec_shp' and 'locality'.

In [None]:
def remove_string(string):
    race['Area Name'] = race['Area Name'].str.replace(str(string), '')
    race['Area Name'] = race['Area Name'].str.strip()

remove_string(',')
remove_string('County')
remove_string('Georgia')

In [None]:
shapefile['Area Name'] = shapefile['prec_shp'].str.upper() + ' ' + shapefile['locality'].str.capitalize()

shapefile['Area Name'].head(5)

0      08P Fulton
1    SS09B Fulton
2      03A Fulton
3      07J Fulton
4      09E Fulton
Name: Area Name, dtype: object

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])

                          Area Name                   id    Total  Hispanic   White   Black  Asian  Mixed  Others
59                           Fulton       0500000US13121  1066710     86302  404793  448803  80632  37797    8383
379              Atlanta CCD Fulton  0600000US1312190144   625717     45997  231272  291119  30735  21959    4635
380         College Park CCD Fulton  0600000US1312190732    12679       771    1708    9643    105    349     103
381           East Point CCD Fulton  0600000US1312191020    37414      3947    3322   28569    229   1060     287
382  Fairburn-Union City CCD Fulton  0600000US1312191128    99509      5759    4323   85637    642   2299     849
383             Palmetto CCD Fulton  0600000US1312192298     8420       784    3196    4054     52    281      53
384   Roswell-Alpharetta CCD Fulton  0600000US1312192604   282971     29044  160972   29781  48869  11849    2456


In [None]:
shapefile[shapefile['Area Name'].str.contains('Appling')]

Unnamed: 0,loc_prec,locality,prec_shp,prec_elec,G18DATG,G18DCmAg,G18DCmIns,G18DCmLab,G18DGOV,G18DLTG,...,hispanic,totVAP,WVAP,BVAP,HVAP,CD,HD,SD,geometry,Area Name
512,"Appling,3A1",Appling,3a1,3A1,23.0,16.0,19.0,21.0,16.0,19.0,...,28.0,852.0,823.0,7.0,16.0,12,156,19,"POLYGON ((-82.08724 31.80198, -82.08725 31.799...",3A1 Appling
513,"Appling,5A",Appling,5a,5A,88.0,81.0,77.0,81.0,77.0,79.0,...,345.0,1354.0,1058.0,70.0,203.0,12,156,19,"POLYGON ((-82.45394 31.73601, -82.45390 31.736...",5A Appling
514,"Appling,2",Appling,2,2,711.0,694.0,684.0,699.0,761.0,664.0,...,367.0,2613.0,959.0,1448.0,196.0,12,156,19,"POLYGON ((-82.26727 31.77289, -82.26726 31.772...",2 Appling
515,"Appling,4B",Appling,4b,4B,19.0,16.0,14.0,13.0,13.0,18.0,...,65.0,985.0,895.0,47.0,35.0,12,178,19,"POLYGON ((-82.15020 31.46973, -82.15024 31.469...",4B Appling
516,"Appling,1B",Appling,1b,1B,93.0,80.0,83.0,89.0,78.0,78.0,...,69.0,1481.0,1361.0,67.0,38.0,12,156,19,"POLYGON ((-82.41077 31.86865, -82.41198 31.868...",1B Appling
517,"Appling,3C",Appling,3c,3C,205.0,191.0,193.0,194.0,203.0,195.0,...,158.0,1938.0,1426.0,375.0,103.0,12,178,19,"POLYGON ((-82.31675 31.72263, -82.31777 31.718...",3C Appling
518,"Appling,5B",Appling,5b,5B,102.0,93.0,91.0,94.0,90.0,88.0,...,117.0,1320.0,1134.0,107.0,63.0,12,156,19,"POLYGON ((-82.40153 31.69728, -82.40157 31.697...",5B Appling
519,"Appling,4D",Appling,4d,4D,95.0,77.0,84.0,84.0,80.0,82.0,...,310.0,1717.0,1392.0,61.0,215.0,12,178,19,"POLYGON ((-82.31244 31.59184, -82.31369 31.592...",4D Appling
520,"Appling,1C",Appling,1c,1C,48.0,41.0,42.0,40.0,40.0,43.0,...,245.0,1282.0,920.0,197.0,147.0,12,156,19,"POLYGON ((-82.36286 31.78434, -82.36296 31.783...",1C Appling


In [None]:
merged = pd.merge(race, shapefile, left_on=race['Area Name'].str.casefold(), right_on=shapefile['Area Name'].str.casefold(), how="left")

merged.head(5)

Unnamed: 0,key_0,Area Name_x,id,Total,Hispanic,White,Black,Asian,Mixed,Others,...,hispanic,totVAP,WVAP,BVAP,HVAP,CD,HD,SD,geometry,Area Name_y
0,appling,Appling,0500000US13001,18444,1825,12674,3339,123,417,66,...,,,,,,,,,,
1,atkinson,Atkinson,0500000US13003,8286,2048,4801,1208,12,167,50,...,,,,,,,,,,
2,bacon,Bacon,0500000US13005,11140,875,8103,1747,40,335,40,...,,,,,,,,,,
3,baker,Baker,0500000US13007,2876,143,1514,1128,18,70,3,...,,,,,,,,,,
4,baldwin,Baldwin,0500000US13009,43799,1139,22432,18318,599,1027,284,...,,,,,,,,,,


In [None]:
merged_isna = merged[merged.isna().any(axis=1)]

exogenous random, not related to the research

In [None]:
print(merged_isna)

                                          key_0                                 Area Name_x                   id    Total  Hispanic   White   Black   Asian  Mixed  Others  ... hispanic totVAP WVAP BVAP  HVAP   CD   HD   SD  geometry  Area Name_y
0                                       appling                                     Appling       0500000US13001    18444      1825   12674    3339     123    417      66  ...      NaN    NaN  NaN  NaN   NaN  NaN  NaN  NaN      None          NaN
1                                      atkinson                                    Atkinson       0500000US13003     8286      2048    4801    1208      12    167      50  ...      NaN    NaN  NaN  NaN   NaN  NaN  NaN  NaN      None          NaN
2                                         bacon                                       Bacon       0500000US13005    11140       875    8103    1747      40    335      40  ...      NaN    NaN  NaN  NaN   NaN  NaN  NaN  NaN      None          NaN
3               