In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Scatter plots for data exploration

In [None]:
# Load the three cleaned inputs; in each file the first CSV column is used as the index.
race = pd.read_csv(
    'data/race_county_data/cleaned_georgia_race_county.csv',
    index_col=0,
)
polling_site = pd.read_csv(
    'data/polling_site_data/2020_clean_county_densities.csv',
    index_col=0,
)
polling_site_changes = pd.read_csv(
    'data/polling_site_data/polling_sites_in_years_changes.csv',
    index_col=0,
)

In [None]:
# Preview the first rows of the race data.
race.head()

In [None]:
# Preview the first rows of the 2020 polling-site data.
polling_site.head()

In [None]:
# Preview the first rows of the polling-site changes data.
polling_site_changes.head()

Since the polling site and polling site changes data do not include race, each of these dataframes has to be merged with the race data.

Checking the number of rows to ensure that all rows are preserved after the merger.

In [None]:
# Print each frame's (rows, columns), one per line, for comparison after the merge.
print(race.shape, polling_site.shape, polling_site_changes.shape, sep='\n')

Since the county names in the race data end with ' County, Georgia' while the polling site data has only the county name, this suffix is removed and all county names are converted to title case.

In [None]:
# Strip the ' County, Georgia' suffix so the race data's names match the bare
# county names in the polling-site tables. regex=False makes the suffix a
# literal string match.
race['Area Name'] = (
    race['Area Name']
    .str.replace(' County, Georgia', '', regex=False)
    .str.title()
)

# Title-case each table's own county column so the merge keys compare equal.
polling_site['County'] = polling_site['County'].str.title()
# Use this table's own column: assigning polling_site's values here would
# replace these county names through index alignment.
polling_site_changes['County'] = polling_site_changes['County'].str.title()

In [None]:
# Attach the race columns to each polling-site table, matching the race data's
# 'Area Name' to the polling tables' 'County'. The inner join keeps only
# counties present in both frames.
# NOTE: both names are rebound to the merged result, so this cell is meant to
# be run once per fresh load of the CSVs.
polling_site = race.merge(
    polling_site,
    how="inner",
    left_on=['Area Name'],
    right_on=['County'],
)
polling_site_changes = race.merge(
    polling_site_changes,
    how="inner",
    left_on=['Area Name'],
    right_on=['County'],
)

In [None]:
# Preview the merged polling-site frame (race columns joined on county name).
polling_site.head()

In [None]:
# Preview the merged polling-site changes frame (race columns joined on county name).
polling_site_changes.head()

In [None]:
# Print the merged frames' (rows, columns), one per line, to compare row counts
# with the pre-merge shapes.
print(polling_site.shape, polling_site_changes.shape, sep='\n')

It was found that the figure kept getting cropped when it was saved with `.savefig()`. Thus, the [documentation](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html) was consulted and the parameter `bbox_inches` was added so that the full figure is saved.

In [None]:
# Function to create scatter plots of polling site quantity
def scatterplot_quantity(race):
    """Scatter 2020 polling-site quantity against one group's population density.

    Reads the module-level ``polling_site`` frame and saves the figure under
    ``images/``.

    Parameters
    ----------
    race
        Suffix of the ``'Population Density: <race>'`` column plotted on the
        x-axis (converted with ``str``); also appended to the output file name.
    """
    ax = polling_site.plot.scatter(
        x='Population Density: ' + str(race),
        y='2020 Quantity',
        s=10,
        c='brown',
    )
    # Horizontal, right-aligned label so the multi-line text sits beside the axis.
    ax.set_ylabel('2020 Polling \nQuantity', rotation=0, ha="right")
    # The trailing underscore keeps the file name in the same '<prefix>_<race>'
    # form used by the density and changes helpers.
    ax.figure.savefig('images/scatter_plot_quantity_' + str(race), bbox_inches='tight')

# Function to create scatter plots of polling site density
def scatterplot_density(race):
    """Scatter polling-site density against one group's population density.

    Reads the module-level ``polling_site`` frame and saves the figure under
    ``images/``.

    Parameters
    ----------
    race
        Suffix of the ``'Population Density: <race>'`` column plotted on the
        x-axis (converted with ``str``); also appended to the output file name.
    """
    x_column = 'Population Density: ' + str(race)
    output_path = 'images/scatter_plot_density_' + str(race)

    ax = polling_site.plot.scatter(x=x_column, y='Polling Site Density', s=10, c='brown')
    # Horizontal, right-aligned label so the multi-line text sits beside the axis.
    ax.set_ylabel('Polling Site \nDensity', rotation=0, ha="right")
    ax.figure.savefig(output_path, bbox_inches='tight')

# Function to create scatter plots of changes in polling site by county
def scatterplot_changes(race, year):
    """Scatter the change in polling-site count against one group's population density.

    Reads the module-level ``polling_site_changes`` frame and saves the figure
    under ``images/``.

    Parameters
    ----------
    race
        Suffix of the ``'Population Density: <race>'`` column plotted on the
        x-axis (converted with ``str``); also appended to the output file name.
    year
        Suffix of the ``'Delta <year>'`` column plotted on the y-axis
        (converted with ``str``); also included in the output file name.
    """
    x_column = 'Population Density: ' + str(race)
    y_column = 'Delta ' + str(year)
    output_path = 'images/scatter_plot_changes_' + str(year) + '_' + str(race)

    ax = polling_site_changes.plot.scatter(x=x_column, y=y_column, s=10, c='brown')
    # Horizontal, right-aligned label so the multi-line text sits beside the axis.
    ax.set_ylabel('Changes in \nthe Number of \nPolling sites', rotation=0, ha="right")
    ax.figure.savefig(output_path, bbox_inches='tight')


In [None]:
# Groups and year spans to plot. Each group is passed to the helpers as the
# suffix of a 'Population Density: <group>' column, and each span as the suffix
# of a 'Delta <span>' column.
# Each group is listed once so no figure is generated and saved twice, and the
# list has its own name so the `race` DataFrame loaded earlier is not replaced.
race_groups = ['White', 'Black', 'Hispanic', 'Asian', 'Others', 'Mixed']
years = ['16-18', '18-20', '16-20']

for group in race_groups:
    scatterplot_quantity(group)
    scatterplot_density(group)
    # One changes plot per year span for this group.
    for span in years:
        scatterplot_changes(group, span)