In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd

## Cleaning the racial data

In [None]:
# Read csv file to dataframe
race =  pd.read_csv('data/race_county_data/original_georgia_race_data.csv', skiprows = 1)

pd.options.display.width=None
pd.set_option('max_row', None)
pd.set_option('display.max_rows', race.shape[0] + 1)
pd.set_option('display.expand_frame_repr', False)

In [None]:
print(race.columns)
print(race.shape)

There's a weird space at the start of each column name so we can remove that. We will combine the columns with the data on the number of people that identify their race as 'American Indian and Alaska Native alone, Native Hawaiian' and 'Other Pacific Islander alone' and 'Some Other Race alone' into an 'Others' column. We wil also combine the data on the number of people that identify as more than one race as 'Mixed'.

In [None]:
race.columns = race.columns.str.lstrip()

race.columns = race.columns.map(str)

race.rename(columns={'!!Total:!!Not Hispanic or Latino:!!Population of two or more races:':'Mixed'}, inplace = True)

race['Others'] = race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone'] + race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone'] + race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone']



Then we create a new datafram with only the columns we want: 'Total', 'White', 'Black', 'Asian' and the previously created columns of 'Others' and 'Mixed. We will also rename the columns accordingly.

In [None]:
# Create a new dataframe with the desired columns
race = race[['Geographic Area Name', 'id', '!!Total:', '!!Total:!!Hispanic or Latino', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone','!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone', 'Mixed', 'Others' ]].copy()

# Rename the columns
race.columns = ['Area Name', 'id', 'Total', 'Hispanic', 'White', 'Black', 'Asian', 'Mixed', 'Others']

In [79]:
print(race.head())

                  Area Name              id  Total  Hispanic  White  Black  Asian  Mixed  Others  Population Density:Hispanic  Population Density: White  Population Density: Black  Population Density: Asian  Population Density: Mixed  Population Density: Others
0   Appling County, Georgia  0500000US13001  18444      1825  12674   3339    123    417      66                     0.098948                   0.687161                   0.181034                   0.006669                   0.022609                    0.003578
1  Atkinson County, Georgia  0500000US13003   8286      2048   4801   1208     12    167      50                     0.247164                   0.579411                   0.145788                   0.001448                   0.020154                    0.006034
2     Bacon County, Georgia  0500000US13005  11140       875   8103   1747     40    335      40                     0.078546                   0.727379                   0.156822                   0.003591        

In [80]:
race.head(20)


Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
0,"Appling County, Georgia",0500000US13001,18444,1825,12674,3339,123,417,66,0.098948,0.687161,0.181034,0.006669,0.022609,0.003578
1,"Atkinson County, Georgia",0500000US13003,8286,2048,4801,1208,12,167,50,0.247164,0.579411,0.145788,0.001448,0.020154,0.006034
2,"Bacon County, Georgia",0500000US13005,11140,875,8103,1747,40,335,40,0.078546,0.727379,0.156822,0.003591,0.030072,0.003591
3,"Baker County, Georgia",0500000US13007,2876,143,1514,1128,18,70,3,0.049722,0.526426,0.392211,0.006259,0.024339,0.001043
4,"Baldwin County, Georgia",0500000US13009,43799,1139,22432,18318,599,1027,284,0.026005,0.512158,0.418229,0.013676,0.023448,0.006484
5,"Banks County, Georgia",0500000US13011,18035,1164,15578,394,189,620,90,0.064541,0.863765,0.021846,0.01048,0.034378,0.00499
6,"Barrow County, Georgia",0500000US13013,83505,10560,55582,10141,3233,3383,606,0.126459,0.665613,0.121442,0.038716,0.040513,0.007257
7,"Bartow County, Georgia",0500000US13015,108901,10751,80159,11309,1169,4753,760,0.098723,0.736072,0.103847,0.010735,0.043645,0.006979
8,"Ben Hill County, Georgia",0500000US13017,17194,1054,9219,6222,116,478,105,0.0613,0.536175,0.36187,0.006747,0.0278,0.006107
9,"Berrien County, Georgia",0500000US13019,18160,1045,14396,1934,80,611,94,0.057544,0.792731,0.106498,0.004405,0.033645,0.005176


Creating rows with density. 

In [None]:
for ind, row in race.iterrows():
  race.loc[ind,"Population Density:Hispanic"]= row ['Hispanic']/row['Total']
  race.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race.tail(20)

Downloading the dataframes into new files.

In [None]:
race.to_csv('data/race_county_data/cleaned_georgia_race_county.csv')


In [None]:
print(race.shape)

## Doing the same but for precincts

In [None]:
race_precinct =  pd.read_csv('data/race_precinct_data/cleaned_georgia_race_precinct.csv', index_col=0)

race_precinct

In [None]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

so we see which areas have a population total of 0

In [None]:
race_precinct.loc[race_precinct['Total']==0]

In [None]:
race_precinct.drop(race_precinct.loc[race_precinct['Total'] == 0].index, inplace=True)

In [None]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race_precinct

In [None]:
race_precinct.to_csv('data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv')

# Preparing the polling site data

Had to use this: https://stackoverflow.com/questions/45690830/reading-in-csv-file-to-pandas-fails

In [None]:
# Read csv file to dataframe
polling_site =  pd.read_csv('data/polling_site_data/original_polling_site_data_2.csv', encoding="utf-16", sep='\t')


In [None]:
polling_site.shape

In [None]:
polling_site['County'].value_counts()

In [None]:
counties = polling_site['County'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties)



In [None]:
counties.columns = counties.columns.map(str)
counties = counties.astype({'County': str, 'Number of polling sites': int})

counties['County'] = counties['County'].str.capitalize()

print(counties)

Now calculating the density

In [None]:
counties['Total Population'] = pd.Series(race['Total'])
counties.head()

In [None]:
for ind, row in counties.iterrows():
  counties.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total Population']

In [None]:
counties.head()

In [None]:
counties.to_csv('data/polling_site_data/polling_sites_in_counties.csv')

## Doing the same but for the new file

In [103]:
# Read csv file to dataframe
polling_site_new =  pd.read_csv('data/polling_site_data_new/original.csv')

polling_site_new

Unnamed: 0,election_date,state,county_name,jurisdiction,jurisdiction_type,precinct_id,precinct_name,polling_place_id,location_type,name,address,notes,source,source_date,source_notes
0,2020-11-03,GA,Appling,Appling,county,2,,1012,election_day,LIONS CLUB BLDG/ AT FAIR GROUNDS,"245 INDUSTRIAL DR, BAXLEY, GA 31513",OTHER,ORR,2020-10-19,
1,2020-11-03,GA,Appling,Appling,county,1C,,1018,election_day,1ST ASSEMBLY OF GOD CHURCH,"3397 HATCH PKY N, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
2,2020-11-03,GA,Appling,Appling,county,1B,,1001,election_day,ALTAMAHA FIRE STATION,"392 ALTAMAHA SCHOOL RD, BAXLEY, GA 31513",County Building,ORR,2020-10-19,
3,2020-11-03,GA,Appling,Appling,county,4D,,1006,election_day,BAX CH OF GOD/FELLOWSHIP HALL,"353 BLACKSHEAR HWY, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
4,2020-11-03,GA,Appling,Appling,county,5A,,1002,election_day,BAXLEY CITY GYM,"252 W. PARKER ST., BAXLEY, GA 31513",County Building,ORR,2020-10-19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,2020-11-03,GA,Worth,Worth,county,5,,159014,election_day,SHINGLER FIRE STATION,"126 SHINGLER LITTLE RIVER RD, POULAN, GA 31781",,ORR,2020-10-19,
2674,2020-11-03,GA,Worth,Worth,county,4,,159022,election_day,SUMNER MUNICIPAL COMPLEX,"702 WALNUT ST, SUMNER, GA 31789",,ORR,2020-10-19,
2675,2020-11-03,GA,Worth,Worth,county,14,,159016,election_day,VOLUNTEER FIRE DEPARTMENT,"6104 WILLOW/BIRCH RD, ALBANY, GA 31705",,ORR,2020-10-19,
2676,2020-11-03,GA,Worth,Worth,county,9,,159017,election_day,WARWICK COMM CENTER,"111 DOGWOOD ST SW, WARWICK, GA 31796",,ORR,2020-10-19,


In [104]:
polling_site_new['county_name'].value_counts()

Fulton           394
Dekalb           193
Gwinnett         156
Cobb             145
Chatham           92
Richmond          68
Clayton           65
Columbia          47
Cherokee          42
Henry             37
Fayette           36
Hall              31
Bibb              31
Carroll           28
Dougherty         28
Coweta            26
Floyd             25
Douglas           25
Muscogee          25
Clarke            24
Whitfield         23
Newton            22
Walton            21
Thomas            20
Glynn             20
Spalding          20
Forsyth           20
Paulding          19
Colquitt          19
Effingham         17
Houston           17
Bartow            17
Rockdale          16
Burke             16
Laurens           16
Dodge             16
Bulloch           16
Worth             15
Baldwin           14
Troup             14
Camden            14
Monroe            14
Meriwether        14
Banks             13
Lowndes           13
Gilmer            13
Chattooga         13
Liberty      

In [105]:
counties_new = polling_site_new['county_name'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties_new)


            County  Number of polling sites
0           Fulton                      394
1           Dekalb                      193
2         Gwinnett                      156
3             Cobb                      145
4          Chatham                       92
5         Richmond                       68
6          Clayton                       65
7         Columbia                       47
8         Cherokee                       42
9            Henry                       37
10         Fayette                       36
11            Hall                       31
12            Bibb                       31
13         Carroll                       28
14       Dougherty                       28
15          Coweta                       26
16           Floyd                       25
17         Douglas                       25
18        Muscogee                       25
19          Clarke                       24
20       Whitfield                       23
21          Newton              

In [93]:
counties_new.columns = counties_new.columns.map(str)
counties_new = counties_new.astype({'County': str, 'Number of polling sites': int})

print(counties_new)

            County  Number of polling sites
0           Fulton                      394
1           Dekalb                      193
2         Gwinnett                      156
3             Cobb                      145
4          Chatham                       92
5         Richmond                       68
6          Clayton                       65
7         Columbia                       47
8         Cherokee                       42
9            Henry                       37
10         Fayette                       36
11            Hall                       31
12            Bibb                       31
13         Carroll                       28
14       Dougherty                       28
15          Coweta                       26
16           Floyd                       25
17         Douglas                       25
18        Muscogee                       25
19          Clarke                       24
20       Whitfield                       23
21          Newton              

In [106]:
race['Area Name'] = race['Area Name'].str.split(' ').str[0]

race.head()

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others,Population Density:Hispanic,Population Density: White,Population Density: Black,Population Density: Asian,Population Density: Mixed,Population Density: Others
0,Appling,0500000US13001,18444,1825,12674,3339,123,417,66,0.098948,0.687161,0.181034,0.006669,0.022609,0.003578
1,Atkinson,0500000US13003,8286,2048,4801,1208,12,167,50,0.247164,0.579411,0.145788,0.001448,0.020154,0.006034
2,Bacon,0500000US13005,11140,875,8103,1747,40,335,40,0.078546,0.727379,0.156822,0.003591,0.030072,0.003591
3,Baker,0500000US13007,2876,143,1514,1128,18,70,3,0.049722,0.526426,0.392211,0.006259,0.024339,0.001043
4,Baldwin,0500000US13009,43799,1139,22432,18318,599,1027,284,0.026005,0.512158,0.418229,0.013676,0.023448,0.006484


In [107]:
total_population = race[['Area Name', 'Total']].copy()
total_population.rename(columns={'Area Name': 'County'}, inplace=True)


In [108]:
total_population

Unnamed: 0,County,Total
0,Appling,18444
1,Atkinson,8286
2,Bacon,11140
3,Baker,2876
4,Baldwin,43799
5,Banks,18035
6,Barrow,83505
7,Bartow,108901
8,Ben,17194
9,Berrien,18160


In [109]:
counties_new = counties_new.merge(total_population, on='County', how='inner')
counties_new.head()

Unnamed: 0,County,Number of polling sites,Total
0,Fulton,394,1066710
1,Gwinnett,156,957062
2,Cobb,145,766149
3,Chatham,92,295291
4,Richmond,68,206607


In [110]:
for ind, row in counties_new.iterrows():
  counties_new.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total']

In [111]:
counties_new

Unnamed: 0,County,Number of polling sites,Total,Polling Site Density
0,Fulton,394,1066710,0.000369
1,Gwinnett,156,957062,0.000163
2,Cobb,145,766149,0.000189
3,Chatham,92,295291,0.000312
4,Richmond,68,206607,0.000329
5,Clayton,65,297595,0.000218
6,Columbia,47,156010,0.000301
7,Cherokee,42,266620,0.000158
8,Henry,37,240712,0.000154
9,Fayette,36,119194,0.000302


In [112]:
counties_new.to_csv('data/polling_site_data_new/clean.csv')

## Viewing the shapefile and checking if merging works for precinct

In [None]:
precinct = pd.read_csv("data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv")

precinct.head(10)

In [None]:
shapefile = gpd.read_file("data/test/cb_2020_13_vtd_500k.shp")

shapefile.head(10)

In [None]:
merged = pd.merge(precinct, shapefile, left_on=precinct['id'], right_on=shapefile['AFFGEOID20'], how="left")

In [None]:
merged

In [None]:

# set the value column that will be visualised
variable = 'Hispanic'
# set the range for the choropleth values
vmin, vmax = 0, 100
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(30, 10))
# remove the axis
ax.axis('off')
# add a title and annotation
ax.set_title('White Choropleth Map', fontdict={'fontsize': '25', 'fontweight' : '3'})
#ax.annotate('Source: Wikipedia - https://en.wikipedia.org/wiki/Provinces_of_Indonesia', xy=(0.6, .05), xycoords='figure fraction', fontsize=12, color='#555555')
# Create colorbar legend
sm = plt.cm.ScalarMappable(cmap='Blues', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# empty array for the data range
sm.set_array([]) # or alternatively sm._A = []. Not sure why this step is necessary, but many recommends it
# add the colorbar to the figure
fig.colorbar(sm)
# create map
merged.plot(column=variable, cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8')


In [None]:
print(race[race['Area Name'].str.contains('Fulton')])
print(race[race['Area Name'].str.contains('Columbia')])

The names of the area are different. Will remove ',', 'County' and ',Georgia from the race data. Then will create a new column joining the columns 'prec_shp' and 'locality'.

In [None]:
def remove_string(string):
    race['Area Name'] = race['Area Name'].str.replace(str(string), '')
    race['Area Name'] = race['Area Name'].str.strip()

remove_string(',')
remove_string('County')
remove_string('Georgia')

In [None]:
shapefile['Area Name'] = shapefile['prec_shp'].str.upper() + ' ' + shapefile['locality'].str.capitalize()

shapefile['Area Name'].head(5)

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])

In [None]:
shapefile[shapefile['Area Name'].str.contains('Appling')]

In [None]:
merged = pd.merge(race, shapefile, left_on=race['Area Name'].str.casefold(), right_on=shapefile['Area Name'].str.casefold(), how="left")

merged.head(5)

In [None]:
merged_isna = merged[merged.isna().any(axis=1)]

exogenous random, not related to the research

In [None]:
print(merged_isna)