In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd

## Cleaning the racial data by county level

In [None]:
# Read csv file to dataframe
race_county =  pd.read_csv('data/race_county_data/original_georgia_race_data.csv', skiprows = 1)

pd.options.display.width=None
pd.set_option('max_row', None)
pd.set_option('display.max_rows', race_county.shape[0] + 1)
pd.set_option('display.expand_frame_repr', False)

race_county

In [None]:
print(race_county.columns)
print(race_county.shape)

There's a weird space at the start of each column name so we can remove that. We will combine the columns with the data on the number of people that identify their race as 'American Indian and Alaska Native alone, Native Hawaiian' and 'Other Pacific Islander alone' and 'Some Other Race alone' into an 'Others' column. We wil also combine the data on the number of people that identify as more than one race as 'Mixed'.

In [8]:
race_county.columns = race_county.columns.str.lstrip()

race_county.columns = race_county.columns.map(str)

race_county.rename(columns={'!!Total:!!Not Hispanic or Latino:!!Population of two or more races:':'Mixed'}, inplace = True)

race_county['Others'] = race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone'] + race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone'] + race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone']



Then we create a new datafram with only the columns we want: 'Total', 'White', 'Black', 'Asian' and the previously created columns of 'Others' and 'Mixed. We will also rename the columns accordingly.

In [9]:
# Create a new dataframe with the desired columns
race_county = race_county[['Geographic Area Name', 'id', '!!Total:', '!!Total:!!Hispanic or Latino', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone','!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone', 'Mixed', 'Others' ]].copy()

# Rename the columns
race_county.columns = ['Area Name', 'id', 'Total', 'Hispanic', 'White', 'Black', 'Asian', 'Mixed', 'Others']

In [None]:
print(race_county.head())

In [None]:
race_county.head(20)


Creating rows with density. 

In [10]:
for ind, row in race_county.iterrows():
  race_county.loc[ind,"Population Density:Hispanic"]= row ['Hispanic']/row['Total']
  race_county.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_county.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_county.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_county.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_county.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race_county.tail(20)

Downloading the dataframes into new files.

In [None]:
race_county.to_csv('data/race_county_data/cleaned_georgia_race_county.csv')


# Cleaning the racial data by precinct level

In [22]:
race_precinct =  pd.read_csv('data/race_precinct_data/cleaned_georgia_race_precinct.csv', index_col=0)

race_precinct

Unnamed: 0,Area Name,id,Total,Hispanic,White,Black,Asian,Mixed,Others
1,"2, Appling County, Georgia",7000000US13001000002,3563,403,1215,1838,10,81,16
2,"1B, Appling County, Georgia",7000000US1300100001B,1834,76,1575,109,3,63,8
3,"1C, Appling County, Georgia",7000000US1300100001C,1538,116,1242,150,5,25,0
4,"3C, Appling County, Georgia",7000000US1300100003C,2515,263,1528,608,39,69,8
5,"4B, Appling County, Georgia",7000000US1300100004B,1321,62,1147,59,5,43,5
...,...,...,...,...,...,...,...,...,...
2694,"MINTON, Worth County, Georgia",7000000US13321000011,731,10,668,34,2,13,4
2695,"BRIDGEBORO, Worth County, Georgia",7000000US13321000012,1802,14,1554,160,9,47,18
2696,"COUNTY LINE, Worth County, Georgia",7000000US13321000014,1253,20,1122,94,2,13,2
2697,"ISABELLA, Worth County, Georgia",7000000US13321000015,1830,46,1589,125,11,45,14


In [None]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

so we see which areas have a population total of 0

In [None]:
race_precinct.loc[race_precinct['Total']==0]

In [None]:
race_precinct.drop(race_precinct.loc[race_precinct['Total'] == 0].index, inplace=True)

In [None]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race_precinct

In [None]:
race_precinct.to_csv('data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv')

# Preparing the polling site data (old data)

Had to use this: https://stackoverflow.com/questions/45690830/reading-in-csv-file-to-pandas-fails

In [11]:
# Read csv file to dataframe
polling_site =  pd.read_csv('data/polling_site_data/original_polling_site_data_2.csv', encoding="utf-16", sep='\t')


In [12]:
polling_site.shape

(2286, 8)

In [None]:
polling_site['County'].value_counts()

In [None]:
counties = polling_site['County'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties)



In [None]:
counties.columns = counties.columns.map(str)
counties = counties.astype({'County': str, 'Number of polling sites': int})

counties['County'] = counties['County'].str.capitalize()

print(counties)

Now calculating the density

In [None]:
counties['Total Population'] = pd.Series(race_county['Total'])
counties.head()

In [None]:
for ind, row in counties.iterrows():
  counties.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total Population']

In [None]:
counties.head()

In [None]:
counties.to_csv('data/polling_site_data/polling_sites_in_counties.csv')

# Preparing the polling site data (new data)

In [None]:
# Read csv file to dataframe
polling_site_new =  pd.read_csv('data/polling_site_data_new/original.csv')

polling_site_new

In [None]:
polling_site_new['county_name'].value_counts()

In [None]:
counties_new = polling_site_new['county_name'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties_new)


In [None]:
counties_new.columns = counties_new.columns.map(str)
counties_new = counties_new.astype({'County': str, 'Number of polling sites': int})

print(counties_new)

In [None]:
race_county['Area Name'] = race_county['Area Name'].str.split(' ').str[0]

race_county.head()

In [None]:
total_population = race_county[['Area Name', 'Total']].copy()
total_population.rename(columns={'Area Name': 'County'}, inplace=True)


In [None]:
total_population

In [None]:
counties_new = counties_new.merge(total_population, on='County', how='inner')
counties_new.head()

In [None]:
for ind, row in counties_new.iterrows():
  counties_new.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total']

In [None]:
counties_new

In [None]:
counties_new.to_csv('data/polling_site_data_new/clean.csv')

## Finding out which precincts have polling sites

In [73]:
polling_site_precinct =  pd.read_csv('data/polling_site_data_new/original.csv')

polling_site_precinct.columns = polling_site_precinct.columns.map(str)

In [74]:
polling_site_precinct

Unnamed: 0,election_date,state,county_name,jurisdiction,jurisdiction_type,precinct_id,precinct_name,polling_place_id,location_type,name,address,notes,source,source_date,source_notes
0,2020-11-03,GA,Appling,Appling,county,2,,1012,election_day,LIONS CLUB BLDG/ AT FAIR GROUNDS,"245 INDUSTRIAL DR, BAXLEY, GA 31513",OTHER,ORR,2020-10-19,
1,2020-11-03,GA,Appling,Appling,county,1C,,1018,election_day,1ST ASSEMBLY OF GOD CHURCH,"3397 HATCH PKY N, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
2,2020-11-03,GA,Appling,Appling,county,1B,,1001,election_day,ALTAMAHA FIRE STATION,"392 ALTAMAHA SCHOOL RD, BAXLEY, GA 31513",County Building,ORR,2020-10-19,
3,2020-11-03,GA,Appling,Appling,county,4D,,1006,election_day,BAX CH OF GOD/FELLOWSHIP HALL,"353 BLACKSHEAR HWY, BAXLEY, GA 31513",CHURCH,ORR,2020-10-19,
4,2020-11-03,GA,Appling,Appling,county,5A,,1002,election_day,BAXLEY CITY GYM,"252 W. PARKER ST., BAXLEY, GA 31513",County Building,ORR,2020-10-19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,2020-11-03,GA,Worth,Worth,county,5,,159014,election_day,SHINGLER FIRE STATION,"126 SHINGLER LITTLE RIVER RD, POULAN, GA 31781",,ORR,2020-10-19,
2674,2020-11-03,GA,Worth,Worth,county,4,,159022,election_day,SUMNER MUNICIPAL COMPLEX,"702 WALNUT ST, SUMNER, GA 31789",,ORR,2020-10-19,
2675,2020-11-03,GA,Worth,Worth,county,14,,159016,election_day,VOLUNTEER FIRE DEPARTMENT,"6104 WILLOW/BIRCH RD, ALBANY, GA 31705",,ORR,2020-10-19,
2676,2020-11-03,GA,Worth,Worth,county,9,,159017,election_day,WARWICK COMM CENTER,"111 DOGWOOD ST SW, WARWICK, GA 31796",,ORR,2020-10-19,


In [75]:
race_county.columns = race_county.columns.map(str)

In [None]:
race_county

In [77]:
race_copy = race_county[['id', 'Area Name']]
race_copy.head()

Unnamed: 0,id,Area Name
0,0500000US13001,"Appling County, Georgia"
1,0500000US13003,"Atkinson County, Georgia"
2,0500000US13005,"Bacon County, Georgia"
3,0500000US13007,"Baker County, Georgia"
4,0500000US13009,"Baldwin County, Georgia"


In [78]:
race_copy['id'] = race_copy['id'].str[7:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_copy['id'] = race_copy['id'].str[7:]


In [79]:
race_copy['Area Name'] = race_copy['Area Name'].str.replace(' County, Georgia','')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_copy['Area Name'] = race_copy['Area Name'].str.replace(' County, Georgia','')


In [80]:
race_copy.head()

Unnamed: 0,id,Area Name
0,US13001,Appling
1,US13003,Atkinson
2,US13005,Bacon
3,US13007,Baker
4,US13009,Baldwin


In [81]:
merged = pd.merge(polling_site_precinct, race_copy, how="inner", on=None, left_on=['county_name'], right_on=['Area Name'])


In [82]:
polling_site_precinct = merged[['id', 'Area Name', 'precinct_id']]

In [34]:
polling_site_precinct.head(20)

Unnamed: 0,id,Area Name,precinct_id
0,US13001,Appling,2
1,US13001,Appling,1C
2,US13001,Appling,1B
3,US13001,Appling,4D
4,US13001,Appling,5A
5,US13001,Appling,5B
6,US13001,Appling,3C
7,US13001,Appling,3A1
8,US13001,Appling,4B
9,US13003,Atkinson,0001


In [83]:
polling_site_precinct["precinct_id"]= polling_site_precinct["precinct_id"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct["precinct_id"]= polling_site_precinct["precinct_id"].astype(str)


In [55]:
polling_site_precinct.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2470 entries, 0 to 2469
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           2470 non-null   object
 1   Area Name    2470 non-null   object
 2   precinct_id  2470 non-null   object
dtypes: object(3)
memory usage: 141.7+ KB


In [84]:

polling_site_precinct['precinct_id'] = polling_site_precinct['precinct_id'].str.zfill(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct['precinct_id'] = polling_site_precinct['precinct_id'].str.zfill(6)


In [86]:
polling_site_precinct.head(20)

Unnamed: 0,id,Area Name,precinct_id
0,US13001,Appling,000002
1,US13001,Appling,00001C
2,US13001,Appling,00001B
3,US13001,Appling,00004D
4,US13001,Appling,00005A
5,US13001,Appling,00005B
6,US13001,Appling,00003C
7,US13001,Appling,0003A1
8,US13001,Appling,00004B
9,US13003,Atkinson,000001


In [87]:
polling_site_precinct['Geographic Id'] = polling_site_precinct['id'] + polling_site_precinct['precinct_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct['Geographic Id'] = polling_site_precinct['id'] + polling_site_precinct['precinct_id']


In [89]:
polling_site_precinct['Geographic Id'] = '70000000' + polling_site_precinct['Geographic Id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct['Geographic Id'] = '70000000' + polling_site_precinct['Geographic Id'].astype(str)


In [92]:
polling_site_precinct = polling_site_precinct[['Area Name', 'Geographic Id']]

In [94]:
polling_site_precinct.rename({'Area Name': 'County', 'Geographic Id': 'Id'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [95]:
polling_site_precinct

Unnamed: 0,Area Name,Geographic Id
0,Appling,70000000US13001000002
1,Appling,70000000US1300100001C
2,Appling,70000000US1300100001B
3,Appling,70000000US1300100004D
4,Appling,70000000US1300100005A
...,...,...
2465,Worth,70000000US13321000005
2466,Worth,70000000US13321000004
2467,Worth,70000000US13321000014
2468,Worth,70000000US13321000009


In [96]:
polling_site_precinct.to_csv('data/polling_site_data_new/clean_precincts.csv')

# (old test) Viewing the shapefile and checking if merging works for precinct

In [None]:
precinct = pd.read_csv("data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv")

precinct.head(10)

In [None]:
shapefile = gpd.read_file("data/test/cb_2020_13_vtd_500k.shp")

shapefile.head(10)

In [None]:
merged = pd.merge(precinct, shapefile, left_on=precinct['id'], right_on=shapefile['AFFGEOID20'], how="left")

In [None]:
merged

In [None]:

# set the value column that will be visualised
variable = 'Hispanic'
# set the range for the choropleth values
vmin, vmax = 0, 100
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(30, 10))
# remove the axis
ax.axis('off')
# add a title and annotation
ax.set_title('White Choropleth Map', fontdict={'fontsize': '25', 'fontweight' : '3'})
#ax.annotate('Source: Wikipedia - https://en.wikipedia.org/wiki/Provinces_of_Indonesia', xy=(0.6, .05), xycoords='figure fraction', fontsize=12, color='#555555')
# Create colorbar legend
sm = plt.cm.ScalarMappable(cmap='Blues', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# empty array for the data range
sm.set_array([]) # or alternatively sm._A = []. Not sure why this step is necessary, but many recommends it
# add the colorbar to the figure
fig.colorbar(sm)
# create map
merged.plot(column=variable, cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8')


In [None]:
print(race[race['Area Name'].str.contains('Fulton')])
print(race[race['Area Name'].str.contains('Columbia')])

The names of the area are different. Will remove ',', 'County' and ',Georgia from the race data. Then will create a new column joining the columns 'prec_shp' and 'locality'.

In [None]:
def remove_string(string):
    race['Area Name'] = race['Area Name'].str.replace(str(string), '')
    race['Area Name'] = race['Area Name'].str.strip()

remove_string(',')
remove_string('County')
remove_string('Georgia')

In [None]:
shapefile['Area Name'] = shapefile['prec_shp'].str.upper() + ' ' + shapefile['locality'].str.capitalize()

shapefile['Area Name'].head(5)

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])

In [None]:
shapefile[shapefile['Area Name'].str.contains('Appling')]

In [None]:
merged = pd.merge(race, shapefile, left_on=race['Area Name'].str.casefold(), right_on=shapefile['Area Name'].str.casefold(), how="left")

merged.head(5)

In [None]:
merged_isna = merged[merged.isna().any(axis=1)]

exogenous random, not related to the research

In [None]:
print(merged_isna)

In [None]:
i = 0
j = 0
polling_site_precinct["county_id"] = ""

for i in range (0, len(polling_site_precinct.index)):
    for j in range (0, len(race_copy.index)):
        if polling_site_precinct.loc[i]['county_name'] == race_copy[j]['Geographic Area Name']:
            polling_site_precinct[i]['county_id'] = race_copy[i]['id']


In [None]:
for i in range (0, len(polling_site_precinct.index)):
    polling_site_precinct.at[int(i), 'precinct_id'] = str(polling_site_precinct.at[int(i), 'precinct_id']).rjust(6,"0")