In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd

## Cleaning the racial data by county level

In [None]:
# Read csv file to dataframe
race_county =  pd.read_csv('data/race_county_data/original_georgia_race_data.csv', skiprows = 1)

pd.options.display.width=None
pd.set_option('max_row', None)
pd.set_option('display.max_rows', race_county.shape[0] + 1)
pd.set_option('display.expand_frame_repr', False)

race_county

In [None]:
print(race_county.columns)
print(race_county.shape)

There's a weird space at the start of each column name so we can remove that. We will combine the columns with the data on the number of people that identify their race as 'American Indian and Alaska Native alone, Native Hawaiian' and 'Other Pacific Islander alone' and 'Some Other Race alone' into an 'Others' column. We wil also combine the data on the number of people that identify as more than one race as 'Mixed'.

In [None]:
race_county.columns = race_county.columns.str.lstrip()

race_county.columns = race_county.columns.map(str)

race_county.rename(columns={'!!Total:!!Not Hispanic or Latino:!!Population of two or more races:':'Mixed'}, inplace = True)

race_county['Others'] = race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone'] + race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone'] + race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone']



Then we create a new datafram with only the columns we want: 'Total', 'White', 'Black', 'Asian' and the previously created columns of 'Others' and 'Mixed. We will also rename the columns accordingly.

In [None]:
# Create a new dataframe with the desired columns
race_county = race_county[['Geographic Area Name', 'id', '!!Total:', '!!Total:!!Hispanic or Latino', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone','!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone', 'Mixed', 'Others' ]].copy()

# Rename the columns
race_county.columns = ['Area Name', 'id', 'Total', 'Hispanic', 'White', 'Black', 'Asian', 'Mixed', 'Others']

In [None]:
print(race_county.head())

In [None]:
race_county.head(20)


Creating rows with density. 

In [None]:
for ind, row in race_county.iterrows():
  race_county.loc[ind,"Population Density:Hispanic"]= row ['Hispanic']/row['Total']
  race_county.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_county.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_county.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_county.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_county.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race_county.tail(20)

Downloading the dataframes into new files.

In [None]:
race_county.to_csv('data/race_county_data/cleaned_georgia_race_county.csv')


# Cleaning the racial data by precinct level

In [57]:
race_precinct =  pd.read_csv('data/race_precinct_data/cleaned_georgia_race_precinct.csv', index_col=0)

race_precinct

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others
1,"2, Appling County, Georgia",7000000US13001000002,3563,403,1215,1838,10,81,16
2,"1B, Appling County, Georgia",7000000US1300100001B,1834,76,1575,109,3,63,8
3,"1C, Appling County, Georgia",7000000US1300100001C,1538,116,1242,150,5,25,0
4,"3C, Appling County, Georgia",7000000US1300100003C,2515,263,1528,608,39,69,8
5,"4B, Appling County, Georgia",7000000US1300100004B,1321,62,1147,59,5,43,5
...,...,...,...,...,...,...,...,...,...
2694,"MINTON, Worth County, Georgia",7000000US13321000011,731,10,668,34,2,13,4
2695,"BRIDGEBORO, Worth County, Georgia",7000000US13321000012,1802,14,1554,160,9,47,18
2696,"COUNTY LINE, Worth County, Georgia",7000000US13321000014,1253,20,1122,94,2,13,2
2697,"ISABELLA, Worth County, Georgia",7000000US13321000015,1830,46,1589,125,11,45,14


we see that there is an error 'dividing by 0'

In [58]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

ZeroDivisionError: division by zero

so we see which areas have a population total of 0

In [59]:
race_precinct.loc[race_precinct['Total']==0]

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density: Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
142,"FORT STEWART, Bryan County, Georgia",7000000US13029FTSTEW,0,0,0,0,0,0,0,,,,,,
334,"FORT PULASKI MON, Chatham County, Georgia",7000000US130510XFTPU,0,0,0,0,0,0,0,,,,,,
1316,"08F2, Fulton County, Georgia",7000000US131210008F2,0,0,0,0,0,0,0,,,,,,
1325,"12E2, Fulton County, Georgia",7000000US131210012E2,0,0,0,0,0,0,0,,,,,,
1399,"AP01E, Fulton County, Georgia",7000000US131210AP01E,0,0,0,0,0,0,0,,,,,,
1412,"AP12D, Fulton County, Georgia",7000000US131210AP12D,0,0,0,0,0,0,0,,,,,,
1418,"CP04A, Fulton County, Georgia",7000000US131210CP04A,0,0,0,0,0,0,0,,,,,,
1422,"CP053, Fulton County, Georgia",7000000US131210CP053,0,0,0,0,0,0,0,,,,,,
1489,"SC07B, Fulton County, Georgia",7000000US131210SC07B,0,0,0,0,0,0,0,,,,,,
1491,"SC08A, Fulton County, Georgia",7000000US131210SC08A,0,0,0,0,0,0,0,,,,,,


removing the rows with 0

In [None]:
# race_precinct.drop(race_precinct.loc[race_precinct['Total'] == 0].index, inplace=True)

replacing the rows with 0 with NaN

In [60]:
race_precinct.replace(0, np.nan, inplace=True)

In [61]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

replacing NaN values with 0

In [62]:
race_precinct.replace(np.nan, 0, inplace=True)

In [64]:
race_precinct.loc[race_precinct['Total']==0]

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density: Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
142,"FORT STEWART, Bryan County, Georgia",7000000US13029FTSTEW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
334,"FORT PULASKI MON, Chatham County, Georgia",7000000US130510XFTPU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1316,"08F2, Fulton County, Georgia",7000000US131210008F2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1325,"12E2, Fulton County, Georgia",7000000US131210012E2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1399,"AP01E, Fulton County, Georgia",7000000US131210AP01E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1412,"AP12D, Fulton County, Georgia",7000000US131210AP12D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1418,"CP04A, Fulton County, Georgia",7000000US131210CP04A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1422,"CP053, Fulton County, Georgia",7000000US131210CP053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1489,"SC07B, Fulton County, Georgia",7000000US131210SC07B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1491,"SC08A, Fulton County, Georgia",7000000US131210SC08A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
race_precinct

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density: Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
1,"2, Appling County, Georgia",7000000US13001000002,3563.0,403.0,1215.0,1838.0,10.0,81.0,16.0,0.113107,0.341005,0.515857,0.002807,0.022734,0.004491
2,"1B, Appling County, Georgia",7000000US1300100001B,1834.0,76.0,1575.0,109.0,3.0,63.0,8.0,0.041439,0.858779,0.059433,0.001636,0.034351,0.004362
3,"1C, Appling County, Georgia",7000000US1300100001C,1538.0,116.0,1242.0,150.0,5.0,25.0,0.0,0.075423,0.807542,0.097529,0.003251,0.016255,0.000000
4,"3C, Appling County, Georgia",7000000US1300100003C,2515.0,263.0,1528.0,608.0,39.0,69.0,8.0,0.104573,0.607555,0.241750,0.015507,0.027435,0.003181
5,"4B, Appling County, Georgia",7000000US1300100004B,1321.0,62.0,1147.0,59.0,5.0,43.0,5.0,0.046934,0.868282,0.044663,0.003785,0.032551,0.003785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2694,"MINTON, Worth County, Georgia",7000000US13321000011,731.0,10.0,668.0,34.0,2.0,13.0,4.0,0.013680,0.913817,0.046512,0.002736,0.017784,0.005472
2695,"BRIDGEBORO, Worth County, Georgia",7000000US13321000012,1802.0,14.0,1554.0,160.0,9.0,47.0,18.0,0.007769,0.862375,0.088790,0.004994,0.026082,0.009989
2696,"COUNTY LINE, Worth County, Georgia",7000000US13321000014,1253.0,20.0,1122.0,94.0,2.0,13.0,2.0,0.015962,0.895451,0.075020,0.001596,0.010375,0.001596
2697,"ISABELLA, Worth County, Georgia",7000000US13321000015,1830.0,46.0,1589.0,125.0,11.0,45.0,14.0,0.025137,0.868306,0.068306,0.006011,0.024590,0.007650


In [65]:
race_precinct.to_csv('data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv')

# Preparing the polling site data (old data)

Had to use this: https://stackoverflow.com/questions/45690830/reading-in-csv-file-to-pandas-fails

In [None]:
# Read csv file to dataframe
polling_site =  pd.read_csv('data/polling_site_data/original_polling_site_data_2.csv', encoding="utf-16", sep='\t')


In [None]:
polling_site.shape

In [None]:
polling_site['County'].value_counts()

In [None]:
counties = polling_site['County'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties)



In [None]:
counties.columns = counties.columns.map(str)
counties = counties.astype({'County': str, 'Number of polling sites': int})

counties['County'] = counties['County'].str.capitalize()

print(counties)

Now calculating the density

In [None]:
counties['Total Population'] = pd.Series(race_county['Total'])
counties.head()

In [None]:
for ind, row in counties.iterrows():
  counties.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total Population']

In [None]:
counties.head()

In [None]:
counties.to_csv('data/polling_site_data/polling_sites_in_counties.csv')

# Preparing the polling site data (new data)

In [9]:
# Read csv file to dataframe
polling_site_new =  pd.read_csv('data/polling_site_data_new/original.csv')

polling_site_new

Unnamed: 0,election_date,state,county_name,jurisdiction,jurisdiction_type,precinct_id,precinct_name,polling_place_id,location_type,name,address,notes,source,source_date,source_notes
0,2020-11-03,GA,Appling,Appling,county,2,,1012,election_day,LIONS CLUB BLDG/ AT FAIR GROUNDS,"245 INDUSTRIAL DR, BAXLEY, GA 31513",OTHER,ORR,2020-10-19,
1,2020-11-03,GA,Appling,Appling,county,1C,,1018,election_day,1ST ASSEMBLY OF GOD CHURCH,"3397 HATCH PKY N, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
2,2020-11-03,GA,Appling,Appling,county,1B,,1001,election_day,ALTAMAHA FIRE STATION,"392 ALTAMAHA SCHOOL RD, BAXLEY, GA 31513",County Building,ORR,2020-10-19,
3,2020-11-03,GA,Appling,Appling,county,4D,,1006,election_day,BAX CH OF GOD/FELLOWSHIP HALL,"353 BLACKSHEAR HWY, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
4,2020-11-03,GA,Appling,Appling,county,5A,,1002,election_day,BAXLEY CITY GYM,"252 W. PARKER ST., BAXLEY, GA 31513",County Building,ORR,2020-10-19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,2020-11-03,GA,Worth,Worth,county,5,,159014,election_day,SHINGLER FIRE STATION,"126 SHINGLER LITTLE RIVER RD, POULAN, GA 31781",,ORR,2020-10-19,
2674,2020-11-03,GA,Worth,Worth,county,4,,159022,election_day,SUMNER MUNICIPAL COMPLEX,"702 WALNUT ST, SUMNER, GA 31789",,ORR,2020-10-19,
2675,2020-11-03,GA,Worth,Worth,county,14,,159016,election_day,VOLUNTEER FIRE DEPARTMENT,"6104 WILLOW/BIRCH RD, ALBANY, GA 31705",,ORR,2020-10-19,
2676,2020-11-03,GA,Worth,Worth,county,9,,159017,election_day,WARWICK COMM CENTER,"111 DOGWOOD ST SW, WARWICK, GA 31796",,ORR,2020-10-19,


In [10]:
polling_site_new['county_name'].value_counts()

Fulton      394
Dekalb      193
Gwinnett    156
Cobb        145
Chatham      92
           ... 
Warren        1
Stephens      1
Bleckley      1
Lumpkin       1
Lanier        1
Name: county_name, Length: 159, dtype: int64

In [11]:
counties_new = polling_site_new['county_name'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties_new)


       County  Number of polling sites
0      Fulton                      394
1      Dekalb                      193
2    Gwinnett                      156
3        Cobb                      145
4     Chatham                       92
..        ...                      ...
154    Warren                        1
155  Stephens                        1
156  Bleckley                        1
157   Lumpkin                        1
158    Lanier                        1

[159 rows x 2 columns]


In [12]:
counties_new.columns = counties_new.columns.map(str)
counties_new = counties_new.astype({'County': str, 'Number of polling sites': int})

print(counties_new)

       County  Number of polling sites
0      Fulton                      394
1      Dekalb                      193
2    Gwinnett                      156
3        Cobb                      145
4     Chatham                       92
..        ...                      ...
154    Warren                        1
155  Stephens                        1
156  Bleckley                        1
157   Lumpkin                        1
158    Lanier                        1

[159 rows x 2 columns]


In [17]:
race_county =  pd.read_csv('data/race_county_data/cleaned_georgia_race_county.csv', index_col = 0)

race_county['Area Name'] = race_county['Area Name'].str.split(' ').str[0]

race_county.head()

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
0,Appling,0500000US13001,18444,1825,12674,3339,123,417,66,0.098948,0.687161,0.181034,0.006669,0.022609,0.003578
1,Atkinson,0500000US13003,8286,2048,4801,1208,12,167,50,0.247164,0.579411,0.145788,0.001448,0.020154,0.006034
2,Bacon,0500000US13005,11140,875,8103,1747,40,335,40,0.078546,0.727379,0.156822,0.003591,0.030072,0.003591
3,Baker,0500000US13007,2876,143,1514,1128,18,70,3,0.049722,0.526426,0.392211,0.006259,0.024339,0.001043
4,Baldwin,0500000US13009,43799,1139,22432,18318,599,1027,284,0.026005,0.512158,0.418229,0.013676,0.023448,0.006484


In [18]:
total_population = race_county[['Area Name', 'Total']].copy()
total_population.rename(columns={'Area Name': 'County'}, inplace=True)


In [19]:
total_population

Unnamed: 0,County,Total
0,Appling,18444
1,Atkinson,8286
2,Bacon,11140
3,Baker,2876
4,Baldwin,43799
...,...,...
154,Whitfield,102864
155,Wilcox,8766
156,Wilkes,9565
157,Wilkinson,8877


In [20]:
counties_new = counties_new.merge(total_population, on='County', how='inner')
counties_new.head()

Unnamed: 0,County,Number of polling sites,Total
0,Fulton,394,1066710
1,Gwinnett,156,957062
2,Cobb,145,766149
3,Chatham,92,295291
4,Richmond,68,206607


In [21]:
for ind, row in counties_new.iterrows():
  counties_new.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total']

In [22]:
counties_new

Unnamed: 0,County,Number of polling sites,Total,Polling Site Density
0,Fulton,394,1066710,0.000369
1,Gwinnett,156,957062,0.000163
2,Cobb,145,766149,0.000189
3,Chatham,92,295291,0.000312
4,Richmond,68,206607,0.000329
...,...,...,...,...
149,Warren,1,5215,0.000192
150,Stephens,1,26784,0.000037
151,Bleckley,1,12583,0.000079
152,Lumpkin,1,33488,0.000030


In [None]:
counties_new.to_csv('data/polling_site_data_new/clean.csv')

## Finding out which precincts have polling sites

In [23]:
polling_site_precinct =  pd.read_csv('data/polling_site_data_new/original.csv')
race_county =  pd.read_csv('data/race_county_data/cleaned_georgia_race_county.csv', index_col = 0)

polling_site_precinct.columns = polling_site_precinct.columns.map(str)

In [24]:
polling_site_precinct

Unnamed: 0,election_date,state,county_name,jurisdiction,jurisdiction_type,precinct_id,precinct_name,polling_place_id,location_type,name,address,notes,source,source_date,source_notes
0,2020-11-03,GA,Appling,Appling,county,2,,1012,election_day,LIONS CLUB BLDG/ AT FAIR GROUNDS,"245 INDUSTRIAL DR, BAXLEY, GA 31513",OTHER,ORR,2020-10-19,
1,2020-11-03,GA,Appling,Appling,county,1C,,1018,election_day,1ST ASSEMBLY OF GOD CHURCH,"3397 HATCH PKY N, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
2,2020-11-03,GA,Appling,Appling,county,1B,,1001,election_day,ALTAMAHA FIRE STATION,"392 ALTAMAHA SCHOOL RD, BAXLEY, GA 31513",County Building,ORR,2020-10-19,
3,2020-11-03,GA,Appling,Appling,county,4D,,1006,election_day,BAX CH OF GOD/FELLOWSHIP HALL,"353 BLACKSHEAR HWY, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
4,2020-11-03,GA,Appling,Appling,county,5A,,1002,election_day,BAXLEY CITY GYM,"252 W. PARKER ST., BAXLEY, GA 31513",County Building,ORR,2020-10-19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,2020-11-03,GA,Worth,Worth,county,5,,159014,election_day,SHINGLER FIRE STATION,"126 SHINGLER LITTLE RIVER RD, POULAN, GA 31781",,ORR,2020-10-19,
2674,2020-11-03,GA,Worth,Worth,county,4,,159022,election_day,SUMNER MUNICIPAL COMPLEX,"702 WALNUT ST, SUMNER, GA 31789",,ORR,2020-10-19,
2675,2020-11-03,GA,Worth,Worth,county,14,,159016,election_day,VOLUNTEER FIRE DEPARTMENT,"6104 WILLOW/BIRCH RD, ALBANY, GA 31705",,ORR,2020-10-19,
2676,2020-11-03,GA,Worth,Worth,county,9,,159017,election_day,WARWICK COMM CENTER,"111 DOGWOOD ST SW, WARWICK, GA 31796",,ORR,2020-10-19,


In [25]:
race_county

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
0,"Appling County, Georgia",0500000US13001,18444,1825,12674,3339,123,417,66,0.098948,0.687161,0.181034,0.006669,0.022609,0.003578
1,"Atkinson County, Georgia",0500000US13003,8286,2048,4801,1208,12,167,50,0.247164,0.579411,0.145788,0.001448,0.020154,0.006034
2,"Bacon County, Georgia",0500000US13005,11140,875,8103,1747,40,335,40,0.078546,0.727379,0.156822,0.003591,0.030072,0.003591
3,"Baker County, Georgia",0500000US13007,2876,143,1514,1128,18,70,3,0.049722,0.526426,0.392211,0.006259,0.024339,0.001043
4,"Baldwin County, Georgia",0500000US13009,43799,1139,22432,18318,599,1027,284,0.026005,0.512158,0.418229,0.013676,0.023448,0.006484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,"Whitfield County, Georgia",0500000US13313,102864,36916,57875,3553,1394,2621,505,0.358882,0.562636,0.034541,0.013552,0.025480,0.004909
155,"Wilcox County, Georgia",0500000US13315,8766,272,5185,3096,49,149,15,0.031029,0.591490,0.353183,0.005590,0.016997,0.001711
156,"Wilkes County, Georgia",0500000US13317,9565,399,4952,3838,59,277,40,0.041715,0.517721,0.401255,0.006168,0.028960,0.004182
157,"Wilkinson County, Georgia",0500000US13319,8877,239,5110,3163,22,297,46,0.026924,0.575645,0.356314,0.002478,0.033457,0.005182


In [26]:
race_county.columns = race_county.columns.map(str)

In [27]:
race_county

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
0,"Appling County, Georgia",0500000US13001,18444,1825,12674,3339,123,417,66,0.098948,0.687161,0.181034,0.006669,0.022609,0.003578
1,"Atkinson County, Georgia",0500000US13003,8286,2048,4801,1208,12,167,50,0.247164,0.579411,0.145788,0.001448,0.020154,0.006034
2,"Bacon County, Georgia",0500000US13005,11140,875,8103,1747,40,335,40,0.078546,0.727379,0.156822,0.003591,0.030072,0.003591
3,"Baker County, Georgia",0500000US13007,2876,143,1514,1128,18,70,3,0.049722,0.526426,0.392211,0.006259,0.024339,0.001043
4,"Baldwin County, Georgia",0500000US13009,43799,1139,22432,18318,599,1027,284,0.026005,0.512158,0.418229,0.013676,0.023448,0.006484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,"Whitfield County, Georgia",0500000US13313,102864,36916,57875,3553,1394,2621,505,0.358882,0.562636,0.034541,0.013552,0.025480,0.004909
155,"Wilcox County, Georgia",0500000US13315,8766,272,5185,3096,49,149,15,0.031029,0.591490,0.353183,0.005590,0.016997,0.001711
156,"Wilkes County, Georgia",0500000US13317,9565,399,4952,3838,59,277,40,0.041715,0.517721,0.401255,0.006168,0.028960,0.004182
157,"Wilkinson County, Georgia",0500000US13319,8877,239,5110,3163,22,297,46,0.026924,0.575645,0.356314,0.002478,0.033457,0.005182


In [28]:
race_copy = race_county[['id', 'Area Name']]
race_copy.head()

Unnamed: 0,id,Area Name
0,0500000US13001,"Appling County, Georgia"
1,0500000US13003,"Atkinson County, Georgia"
2,0500000US13005,"Bacon County, Georgia"
3,0500000US13007,"Baker County, Georgia"
4,0500000US13009,"Baldwin County, Georgia"


In [29]:
race_copy['id'] = race_copy['id'].str[7:]

In [30]:
race_copy['Area Name'] = race_copy['Area Name'].str.replace(' County, Georgia','')


In [31]:
race_copy.head()

Unnamed: 0,id,Area Name
0,US13001,Appling
1,US13003,Atkinson
2,US13005,Bacon
3,US13007,Baker
4,US13009,Baldwin


In [32]:
merged = pd.merge(polling_site_precinct, race_copy, how="inner", on=None, left_on=['county_name'], right_on=['Area Name'])


In [33]:
polling_site_precinct = merged[['id', 'Area Name', 'precinct_id']]

In [34]:
polling_site_precinct.head(20)

Unnamed: 0,id,Area Name,precinct_id
0,US13001,Appling,2
1,US13001,Appling,1C
2,US13001,Appling,1B
3,US13001,Appling,4D
4,US13001,Appling,5A
5,US13001,Appling,5B
6,US13001,Appling,3C
7,US13001,Appling,3A1
8,US13001,Appling,4B
9,US13003,Atkinson,0001


In [35]:
polling_site_precinct["precinct_id"]= polling_site_precinct["precinct_id"].astype(str)

In [36]:
polling_site_precinct.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2470 entries, 0 to 2469
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           2470 non-null   object
 1   Area Name    2470 non-null   object
 2   precinct_id  2470 non-null   object
dtypes: object(3)
memory usage: 77.2+ KB


In [37]:

polling_site_precinct['precinct_id'] = polling_site_precinct['precinct_id'].str.zfill(6)

In [38]:
polling_site_precinct.head(20)

Unnamed: 0,id,Area Name,precinct_id
0,US13001,Appling,000002
1,US13001,Appling,00001C
2,US13001,Appling,00001B
3,US13001,Appling,00004D
4,US13001,Appling,00005A
5,US13001,Appling,00005B
6,US13001,Appling,00003C
7,US13001,Appling,0003A1
8,US13001,Appling,00004B
9,US13003,Atkinson,000001


In [39]:
polling_site_precinct['Geographic Id'] = polling_site_precinct['id'] + polling_site_precinct['precinct_id']

In [40]:
polling_site_precinct['Geographic Id'] = '7000000' + polling_site_precinct['Geographic Id'].astype(str)

In [41]:
polling_site_precinct = polling_site_precinct[['Area Name', 'Geographic Id']]

In [42]:
polling_site_precinct.rename(columns = {'Area Name': 'County', 'Geographic Id': 'Id'}, inplace = True)

In [43]:
polling_site_precinct

Unnamed: 0,County,Id
0,Appling,7000000US13001000002
1,Appling,7000000US1300100001C
2,Appling,7000000US1300100001B
3,Appling,7000000US1300100004D
4,Appling,7000000US1300100005A
...,...,...
2465,Worth,7000000US13321000005
2466,Worth,7000000US13321000004
2467,Worth,7000000US13321000014
2468,Worth,7000000US13321000009


In [44]:
polling_site_precinct.to_csv('data/polling_site_data_new/clean_precincts.csv')

Merging the dataframe with polling sites with the dataframe with race to give a dataframe with a column with 0's and 1's

In [45]:
polling_site_precinct_binary =  pd.read_csv('data/polling_site_data_new/clean_precincts.csv', index_col = 0)

In [46]:
polling_site_precinct_binary.insert(2, "Polling Site", 1)

In [47]:
polling_site_precinct_binary.head()

Unnamed: 0,County,Id,Polling Site
0,Appling,7000000US13001000002,1
1,Appling,7000000US1300100001C,1
2,Appling,7000000US1300100001B,1
3,Appling,7000000US1300100004D,1
4,Appling,7000000US1300100005A,1


In [66]:
race_precinct = pd.read_csv('data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv', index_col = 0)

In [68]:
print(race_precinct.shape)
print(polling_site_precinct_binary.shape)

(2698, 15)
(2470, 3)


In [69]:
merged = race_precinct.merge(polling_site_precinct_binary, how='left', left_on='id', right_on='Id')

In [70]:
merged_isna = merged[merged.isna().any(axis=1)]
merged_isna

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density: Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others,County,Id,Polling Site
51,"06, Barrow County, Georgia",7000000US13013000006,3656.0,334.0,2430.0,493.0,175.0,179.0,45.0,0.091357,0.664661,0.134847,0.047867,0.048961,0.012309,,,
52,"07, Barrow County, Georgia",7000000US13013000007,4524.0,413.0,3084.0,700.0,114.0,196.0,17.0,0.091291,0.681698,0.154730,0.025199,0.043324,0.003758,,,
54,"09, Barrow County, Georgia",7000000US13013000009,5165.0,698.0,3620.0,494.0,139.0,194.0,20.0,0.135140,0.700871,0.095644,0.026912,0.037561,0.003872,,,
55,"10, Barrow County, Georgia",7000000US13013000010,5978.0,951.0,2796.0,1720.0,172.0,268.0,71.0,0.159083,0.467715,0.287722,0.028772,0.044831,0.011877,,,
56,"11, Barrow County, Georgia",7000000US13013000011,7348.0,1377.0,4001.0,1310.0,256.0,332.0,72.0,0.187398,0.544502,0.178280,0.034839,0.045182,0.009799,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2599,"3015 - RICKETSON Voting District, Warren Count...",7000000US13301000005,429.0,8.0,341.0,64.0,0.0,12.0,4.0,0.018648,0.794872,0.149184,0.000000,0.027972,0.009324,,,
2600,"3016 - PAN HANDLE Voting District, Warren Coun...",7000000US13301000006,113.0,3.0,95.0,10.0,0.0,5.0,0.0,0.026549,0.840708,0.088496,0.000000,0.044248,0.000000,,,
2621,"307G3 - GOOSEBERRY 3 Voting District, Webster ...",7000000US133070000G3,239.0,4.0,188.0,40.0,0.0,6.0,1.0,0.016736,0.786611,0.167364,0.000000,0.025105,0.004184,,,
2623,"307P3 - PRESTON 3 Voting District, Webster Cou...",7000000US133070000P3,297.0,8.0,198.0,78.0,0.0,12.0,1.0,0.026936,0.666667,0.262626,0.000000,0.040404,0.003367,,,


In [71]:
merged.fillna(0, inplace = True)

In [72]:
merged.isna().sum()

Area Name                       0
id                              0
Total                           0
Hispanic                        0
White                           0
Black                           0
Asian                           0
Mixed                           0
Others                          0
Population Density: Hispanic    0
Population Density: White       0
Population Density: Black       0
Population Density: Asian       0
Population Density: Mixed       0
Population Density: Others      0
County                          0
Id                              0
Polling Site                    0
dtype: int64

In [73]:
merged.drop(['Id', 'County'], axis=1, inplace = True)
merged.head()

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density: Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others,Polling Site
0,"2, Appling County, Georgia",7000000US13001000002,3563.0,403.0,1215.0,1838.0,10.0,81.0,16.0,0.113107,0.341005,0.515857,0.002807,0.022734,0.004491,1.0
1,"1B, Appling County, Georgia",7000000US1300100001B,1834.0,76.0,1575.0,109.0,3.0,63.0,8.0,0.041439,0.858779,0.059433,0.001636,0.034351,0.004362,1.0
2,"1C, Appling County, Georgia",7000000US1300100001C,1538.0,116.0,1242.0,150.0,5.0,25.0,0.0,0.075423,0.807542,0.097529,0.003251,0.016255,0.0,1.0
3,"3C, Appling County, Georgia",7000000US1300100003C,2515.0,263.0,1528.0,608.0,39.0,69.0,8.0,0.104573,0.607555,0.24175,0.015507,0.027435,0.003181,1.0
4,"4B, Appling County, Georgia",7000000US1300100004B,1321.0,62.0,1147.0,59.0,5.0,43.0,5.0,0.046934,0.868282,0.044663,0.003785,0.032551,0.003785,1.0


In [74]:
merged['Polling Site'] = merged['Polling Site'].astype(int)

In [75]:
merged.to_csv('data/polling_site_data_new/clean_precincts_with_polling_site.csv')

# (old test) Viewing the shapefile and checking if merging works for precinct

In [None]:
precinct = pd.read_csv("data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv")

precinct.head(10)

In [None]:
shapefile = gpd.read_file("data/test/cb_2020_13_vtd_500k.shp")

shapefile.head(10)

In [None]:
merged = pd.merge(precinct, shapefile, left_on=precinct['id'], right_on=shapefile['AFFGEOID20'], how="left")

In [None]:
merged

In [None]:

# set the value column that will be visualised
variable = 'Hispanic'
# set the range for the choropleth values
vmin, vmax = 0, 100
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(30, 10))
# remove the axis
ax.axis('off')
# add a title and annotation
ax.set_title('White Choropleth Map', fontdict={'fontsize': '25', 'fontweight' : '3'})
#ax.annotate('Source: Wikipedia - https://en.wikipedia.org/wiki/Provinces_of_Indonesia', xy=(0.6, .05), xycoords='figure fraction', fontsize=12, color='#555555')
# Create colorbar legend
sm = plt.cm.ScalarMappable(cmap='Blues', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# empty array for the data range
sm.set_array([]) # or alternatively sm._A = []. Not sure why this step is necessary, but many recommends it
# add the colorbar to the figure
fig.colorbar(sm)
# create map
merged.plot(column=variable, cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8')


In [None]:
print(race[race['Area Name'].str.contains('Fulton')])
print(race[race['Area Name'].str.contains('Columbia')])

The names of the area are different. Will remove ',', 'County' and ',Georgia from the race data. Then will create a new column joining the columns 'prec_shp' and 'locality'.

In [None]:
def remove_string(string):
    race['Area Name'] = race['Area Name'].str.replace(str(string), '')
    race['Area Name'] = race['Area Name'].str.strip()

remove_string(',')
remove_string('County')
remove_string('Georgia')

In [None]:
shapefile['Area Name'] = shapefile['prec_shp'].str.upper() + ' ' + shapefile['locality'].str.capitalize()

shapefile['Area Name'].head(5)

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])

In [None]:
shapefile[shapefile['Area Name'].str.contains('Appling')]

In [None]:
merged = pd.merge(race, shapefile, left_on=race['Area Name'].str.casefold(), right_on=shapefile['Area Name'].str.casefold(), how="left")

merged.head(5)

In [None]:
merged_isna = merged[merged.isna().any(axis=1)]

exogenous random, not related to the research

In [None]:
print(merged_isna)

In [None]:
i = 0
j = 0
polling_site_precinct["county_id"] = ""

for i in range (0, len(polling_site_precinct.index)):
    for j in range (0, len(race_copy.index)):
        if polling_site_precinct.loc[i]['county_name'] == race_copy[j]['Geographic Area Name']:
            polling_site_precinct[i]['county_id'] = race_copy[i]['id']


In [None]:
for i in range (0, len(polling_site_precinct.index)):
    polling_site_precinct.at[int(i), 'precinct_id'] = str(polling_site_precinct.at[int(i), 'precinct_id']).rjust(6,"0")