In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd

## Cleaning the racial data by county level

In [None]:
# Read csv file to dataframe
race_county =  pd.read_csv('data/race_county_data/original_georgia_race_data.csv', skiprows = 1)

pd.options.display.width=None
pd.set_option('max_row', None)
pd.set_option('display.max_rows', race_county.shape[0] + 1)
pd.set_option('display.expand_frame_repr', False)

race_county

In [None]:
print(race_county.columns)
print(race_county.shape)

There's a weird space at the start of each column name so we can remove that. We will combine the columns with the data on the number of people that identify their race as 'American Indian and Alaska Native alone, Native Hawaiian' and 'Other Pacific Islander alone' and 'Some Other Race alone' into an 'Others' column. We wil also combine the data on the number of people that identify as more than one race as 'Mixed'.

In [None]:
race_county.columns = race_county.columns.str.lstrip()

race_county.columns = race_county.columns.map(str)

race_county.rename(columns={'!!Total:!!Not Hispanic or Latino:!!Population of two or more races:':'Mixed'}, inplace = True)

race_county['Others'] = race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone'] + race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone'] + race_county['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone']



Then we create a new datafram with only the columns we want: 'Total', 'White', 'Black', 'Asian' and the previously created columns of 'Others' and 'Mixed. We will also rename the columns accordingly.

In [None]:
# Create a new dataframe with the desired columns
race_county = race_county[['Geographic Area Name', 'id', '!!Total:', '!!Total:!!Hispanic or Latino', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone','!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone', 'Mixed', 'Others' ]].copy()

# Rename the columns
race_county.columns = ['Area Name', 'id', 'Total', 'Hispanic', 'White', 'Black', 'Asian', 'Mixed', 'Others']

In [None]:
print(race_county.head())

In [None]:
race_county.head(20)


Creating rows with density. 

In [None]:
for ind, row in race_county.iterrows():
  race_county.loc[ind,"Population Density:Hispanic"]= row ['Hispanic']/row['Total']
  race_county.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_county.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_county.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_county.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_county.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race_county.tail(20)

Downloading the dataframes into new files.

In [None]:
race_county.to_csv('data/race_county_data/cleaned_georgia_race_county.csv')


# Cleaning the racial data by precinct level

In [None]:
race_precinct =  pd.read_csv('data/race_precinct_data/cleaned_georgia_race_precinct.csv', index_col=0)

race_precinct

In [None]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

so we see which areas have a population total of 0

In [None]:
race_precinct.loc[race_precinct['Total']==0]

In [None]:
race_precinct.drop(race_precinct.loc[race_precinct['Total'] == 0].index, inplace=True)

In [None]:
for ind, row in race_precinct.iterrows():
  race_precinct.loc[ind,"Population Density: Hispanic"]= row ['Hispanic']/row['Total']
  race_precinct.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race_precinct.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race_precinct.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race_precinct.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race_precinct.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

In [None]:
race_precinct

In [None]:
race_precinct.to_csv('data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv')

# Preparing the polling site data (old data)

Had to use this: https://stackoverflow.com/questions/45690830/reading-in-csv-file-to-pandas-fails

In [None]:
# Read csv file to dataframe
polling_site =  pd.read_csv('data/polling_site_data/original_polling_site_data_2.csv', encoding="utf-16", sep='\t')


In [None]:
polling_site.shape

In [None]:
polling_site['County'].value_counts()

In [None]:
counties = polling_site['County'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties)



In [None]:
counties.columns = counties.columns.map(str)
counties = counties.astype({'County': str, 'Number of polling sites': int})

counties['County'] = counties['County'].str.capitalize()

print(counties)

Now calculating the density

In [None]:
counties['Total Population'] = pd.Series(race_county['Total'])
counties.head()

In [None]:
for ind, row in counties.iterrows():
  counties.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total Population']

In [None]:
counties.head()

In [None]:
counties.to_csv('data/polling_site_data/polling_sites_in_counties.csv')

# Preparing the polling site data (new data)

In [None]:
# Read csv file to dataframe
polling_site_new =  pd.read_csv('data/polling_site_data_new/original.csv')

polling_site_new

In [None]:
polling_site_new['county_name'].value_counts()

In [None]:
counties_new = polling_site_new['county_name'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties_new)


In [None]:
counties_new.columns = counties_new.columns.map(str)
counties_new = counties_new.astype({'County': str, 'Number of polling sites': int})

print(counties_new)

In [None]:
race_county['Area Name'] = race_county['Area Name'].str.split(' ').str[0]

race_county.head()

In [None]:
total_population = race_county[['Area Name', 'Total']].copy()
total_population.rename(columns={'Area Name': 'County'}, inplace=True)


In [None]:
total_population

In [None]:
counties_new = counties_new.merge(total_population, on='County', how='inner')
counties_new.head()

In [None]:
for ind, row in counties_new.iterrows():
  counties_new.loc[ind,"Polling Site Density"]= row ['Number of polling sites']/row['Total']

In [None]:
counties_new

In [None]:
counties_new.to_csv('data/polling_site_data_new/clean.csv')

## Finding out which precincts have polling sites

In [None]:
polling_site_precinct =  pd.read_csv('data/polling_site_data_new/original.csv')
race_county =  pd.read_csv('data/race_county_data/cleaned_georgia_race_county.csv', index_col = 0)

polling_site_precinct.columns = polling_site_precinct.columns.map(str)

In [None]:
polling_site_precinct

In [None]:
race_county

In [None]:
race_county.columns = race_county.columns.map(str)

In [None]:
race_county

In [None]:
race_copy = race_county[['id', 'Area Name']]
race_copy.head()

In [None]:
race_copy['id'] = race_copy['id'].str[7:]

In [None]:
race_copy['Area Name'] = race_copy['Area Name'].str.replace(' County, Georgia','')


In [None]:
race_copy.head()

In [None]:
merged = pd.merge(polling_site_precinct, race_copy, how="inner", on=None, left_on=['county_name'], right_on=['Area Name'])


In [None]:
polling_site_precinct = merged[['id', 'Area Name', 'precinct_id']]

In [None]:
polling_site_precinct.head(20)

In [17]:
polling_site_precinct["precinct_id"]= polling_site_precinct["precinct_id"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct["precinct_id"]= polling_site_precinct["precinct_id"].astype(str)


In [None]:
polling_site_precinct.info(verbose=True)

In [18]:

polling_site_precinct['precinct_id'] = polling_site_precinct['precinct_id'].str.zfill(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct['precinct_id'] = polling_site_precinct['precinct_id'].str.zfill(6)


In [None]:
polling_site_precinct.head(20)

In [19]:
polling_site_precinct['Geographic Id'] = polling_site_precinct['id'] + polling_site_precinct['precinct_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct['Geographic Id'] = polling_site_precinct['id'] + polling_site_precinct['precinct_id']


In [20]:
polling_site_precinct['Geographic Id'] = '7000000' + polling_site_precinct['Geographic Id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polling_site_precinct['Geographic Id'] = '7000000' + polling_site_precinct['Geographic Id'].astype(str)


In [21]:
polling_site_precinct = polling_site_precinct[['Area Name', 'Geographic Id']]

In [22]:
polling_site_precinct.rename({'Area Name': 'County', 'Geographic Id': 'Id'}, inplace = True)

In [23]:
polling_site_precinct

Unnamed: 0,Area Name,Geographic Id
0,Appling,7000000US13001000002
1,Appling,7000000US1300100001C
2,Appling,7000000US1300100001B
3,Appling,7000000US1300100004D
4,Appling,7000000US1300100005A
...,...,...
2465,Worth,7000000US13321000005
2466,Worth,7000000US13321000004
2467,Worth,7000000US13321000014
2468,Worth,7000000US13321000009


In [24]:
polling_site_precinct.to_csv('data/polling_site_data_new/clean_precincts.csv')

# (old test) Viewing the shapefile and checking if merging works for precinct

In [None]:
precinct = pd.read_csv("data/race_precinct_data/cleaned_georgia_race_precinct_densities.csv")

precinct.head(10)

In [None]:
shapefile = gpd.read_file("data/test/cb_2020_13_vtd_500k.shp")

shapefile.head(10)

In [None]:
merged = pd.merge(precinct, shapefile, left_on=precinct['id'], right_on=shapefile['AFFGEOID20'], how="left")

In [None]:
merged

In [None]:

# set the value column that will be visualised
variable = 'Hispanic'
# set the range for the choropleth values
vmin, vmax = 0, 100
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(30, 10))
# remove the axis
ax.axis('off')
# add a title and annotation
ax.set_title('White Choropleth Map', fontdict={'fontsize': '25', 'fontweight' : '3'})
#ax.annotate('Source: Wikipedia - https://en.wikipedia.org/wiki/Provinces_of_Indonesia', xy=(0.6, .05), xycoords='figure fraction', fontsize=12, color='#555555')
# Create colorbar legend
sm = plt.cm.ScalarMappable(cmap='Blues', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# empty array for the data range
sm.set_array([]) # or alternatively sm._A = []. Not sure why this step is necessary, but many recommends it
# add the colorbar to the figure
fig.colorbar(sm)
# create map
merged.plot(column=variable, cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8')


In [None]:
print(race[race['Area Name'].str.contains('Fulton')])
print(race[race['Area Name'].str.contains('Columbia')])

The names of the area are different. Will remove ',', 'County' and ',Georgia from the race data. Then will create a new column joining the columns 'prec_shp' and 'locality'.

In [None]:
def remove_string(string):
    race['Area Name'] = race['Area Name'].str.replace(str(string), '')
    race['Area Name'] = race['Area Name'].str.strip()

remove_string(',')
remove_string('County')
remove_string('Georgia')

In [None]:
shapefile['Area Name'] = shapefile['prec_shp'].str.upper() + ' ' + shapefile['locality'].str.capitalize()

shapefile['Area Name'].head(5)

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])

In [None]:
shapefile[shapefile['Area Name'].str.contains('Appling')]

In [None]:
merged = pd.merge(race, shapefile, left_on=race['Area Name'].str.casefold(), right_on=shapefile['Area Name'].str.casefold(), how="left")

merged.head(5)

In [None]:
merged_isna = merged[merged.isna().any(axis=1)]

exogenous random, not related to the research

In [None]:
print(merged_isna)

In [None]:
i = 0
j = 0
polling_site_precinct["county_id"] = ""

for i in range (0, len(polling_site_precinct.index)):
    for j in range (0, len(race_copy.index)):
        if polling_site_precinct.loc[i]['county_name'] == race_copy[j]['Geographic Area Name']:
            polling_site_precinct[i]['county_id'] = race_copy[i]['id']


In [None]:
for i in range (0, len(polling_site_precinct.index)):
    polling_site_precinct.at[int(i), 'precinct_id'] = str(polling_site_precinct.at[int(i), 'precinct_id']).rjust(6,"0")