In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd

## Cleaning the racial data

In [None]:
# Read csv file to dataframe
race =  pd.read_csv('data/race_county_data/original_georgia_race_data.csv', skiprows = 1)

pd.options.display.width=None
pd.set_option('max_row', None)
pd.set_option('display.max_rows', race.shape[0] + 1)
pd.set_option('display.expand_frame_repr', False)

In [None]:
print(race.columns)
print(race.shape)

There's a weird space at the start of each column name so we can remove that. We will combine the columns with the data on the number of people that identify their race as 'American Indian and Alaska Native alone, Native Hawaiian' and 'Other Pacific Islander alone' and 'Some Other Race alone' into an 'Others' column. We wil also combine the data on the number of people that identify as more than one race as 'Mixed'.

In [None]:
race.columns = race.columns.str.lstrip()

race.columns = race.columns.map(str)

race.rename(columns={'!!Total:!!Not Hispanic or Latino:!!Population of two or more races:':'Mixed'}, inplace = True)

race['Others'] = race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alone'] + race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone'] + race['!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Some Other Race alone']



Then we create a new datafram with only the columns we want: 'Total', 'White', 'Black', 'Asian' and the previously created columns of 'Others' and 'Mixed. We will also rename the columns accordingly.

In [None]:
# Create a new dataframe with the desired columns
race = race[['Geographic Area Name', 'id', '!!Total:', '!!Total:!!Hispanic or Latino', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!White alone', '!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Black or African American alone','!!Total:!!Not Hispanic or Latino:!!Population of one race:!!Asian alone', 'Mixed', 'Others' ]].copy()

# Rename the columns
race.columns = ['Area Name', 'id', 'Total', 'Hispanic', 'White', 'Black', 'Asian', 'Mixed', 'Others']

In [None]:
print(race.head())

In [None]:
print(race.head(20))


Creating rows with density. 

In [None]:
for ind, row in race.iterrows():
  race.loc[ind,"Population Density:Hispanic"]= row ['Hispanic']/row['Total']
  race.loc[ind,"Population Density: White"] = row ['White']/row['Total']
  race.loc[ind,"Population Density: Black"]= row['Black']/row['Total']
  race.loc[ind,"Population Density: Asian"]= row ['Asian']/row['Total']
  race.loc[ind,"Population Density: Mixed"]=row['Mixed']/row["Total"]
  race.loc[ind,"Population Density: Others"]=row["Others"]/row["Total"]

We get the error that we have a division by 0.  159 counties

In [None]:
race.loc[race['Total']==0]

In [None]:
race.tail(20)

Downloading the dataframes into new files.

In [None]:
race.to_csv('data/race_county_data/cleaned_georgia_race_county.csv')


In [None]:
print(race.shape)

# Preparing the polling site data

Had to use this: https://stackoverflow.com/questions/45690830/reading-in-csv-file-to-pandas-fails

In [None]:
# Read csv file to dataframe
polling_site =  pd.read_csv('data/polling_site_data/original_polling_site_data_2.csv', encoding="utf-16", sep='\t')


In [None]:
print(polling_site)

In [None]:
polling_site['County'].value_counts()

In [None]:
counties = polling_site['County'].value_counts().rename_axis('County').reset_index(name='Number of polling sites')
print(counties)



In [None]:
counties.columns = counties.columns.map(str)
counties = counties.astype({'County': str, 'Number of polling sites': int})

counties['County'] = counties['County'].str.capitalize()

print(counties)

Now calculating the density

In [22]:
race['Total'] = pd.Series(polling_site['Total Population'])

KeyError: 'Total Population'

In [None]:
counties.to_csv('data/polling_site_data/polling_sites_in_counties.csv')

## Viewing the shapefile and checking if merging works

In [None]:
shapefile = gpd.read_file("data/georgia_shapefiles/2018Precincts.shp")

shapefile.head()

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])
print(race[race['Area Name'].str.contains('Columbia')])

The names of the area are different. Will remove ',', 'County' and ',Georgia from the race data. Then will create a new column joining the columns 'prec_shp' and 'locality'.

In [None]:
def remove_string(string):
    race['Area Name'] = race['Area Name'].str.replace(str(string), '')
    race['Area Name'] = race['Area Name'].str.strip()

remove_string(',')
remove_string('County')
remove_string('Georgia')

In [None]:
shapefile['Area Name'] = shapefile['prec_shp'].str.upper() + ' ' + shapefile['locality'].str.capitalize()

shapefile['Area Name'].head(5)

In [None]:
print(race[race['Area Name'].str.contains('Fulton')])

In [None]:
shapefile[shapefile['Area Name'].str.contains('Appling')]

In [None]:
merged = pd.merge(race, shapefile, left_on=race['Area Name'].str.casefold(), right_on=shapefile['Area Name'].str.casefold(), how="left")

merged.head(5)

In [None]:
merged_isna = merged[merged.isna().any(axis=1)]

exogenous random, not related to the research

In [None]:
print(merged_isna)