In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import plotly.graph_objects as go

#This is a comment - python does not try to execute it

#This tells python to draw the graphs "inline" - in the notebook
%matplotlib inline  
import matplotlib.pyplot as plt
plt.style.use('ggplot')


The following two functions were based on the example on https://plotly.com/python/box-plots/. The code was modified based on this site: https://stackoverflow.com/questions/54368158/add-multiple-text-labels-from-dataframe-columns-in-plotly in order to add county/precinct names as a label.

In [None]:
# function for distribution

def distribution(data_set, t, name_of_the_html_file, label):

  colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)',
            'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']

  x_data = ['Hispanic','White','Black','Asian','Mixed','Others']

  fig = go.Figure()

  for xd, yd, cls in zip(x_data, data_set, colors):
          fig.add_trace(go.Box(
              y=yd,
              name=xd,
              boxpoints='all',
              jitter=0.5,
              whiskerwidth=0.2,
              fillcolor=cls,
              marker_size=2,
              line_width=2,
              boxmean=True,
              text = label)
          )

  fig.update_layout(
      title=t,
      yaxis=dict(
          autorange=True,
          showgrid=True,
          zeroline=True,
          dtick=0.1,
          gridcolor='rgb(255, 255, 255)',
          gridwidth=0.1,
          zerolinecolor='rgb(255, 255, 255)',
          zerolinewidth=1,
      ),
      margin=dict(
          l=40,
          r=30,
          b=80,
          t=100,
      ),
      paper_bgcolor='rgb(243, 243, 243)',
      plot_bgcolor='rgb(243, 243, 243)',
      showlegend=False
  )

  fig.write_html(name_of_the_html_file)
  fig.show()


In [None]:
# opening files and selecting the right columns:
race =  pd.read_csv('data/race_county_data/cleaned_georgia_race_county.csv')
racedenscounty = race[['Population Density: Hispanic','Population Density: White','Population Density: Black','Population Density: Asian','Population Density: Mixed','Population Density: Others']]

In [None]:
# racial density distribution county level
# Georgia State 2020 county level population density distribution by race
# list1 is a list for county level pop density
list1 = [racedenscounty['Population Density: Hispanic'],racedenscounty['Population Density: White'],racedenscounty['Population Density: Black'],racedenscounty['Population Density: Asian'], racedenscounty['Population Density: Mixed'], racedenscounty['Population Density: Others']]
name1 = 'Georgia State 2020 county level population density distribution by race'
distribution(list1, 'Georgia State 2020 county level population density distribution by race', 'html_files/county_race_density.html', race['Area Name'])


In [None]:
from enum import auto
# for non density distribution
def raw_distribution (x, t, name_of_the_html_file, title):
  N= 50

  y_data = x

  colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)',
            'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']

  x_data = ['Hispanic','White','Black','Asian','Mixed','Others']

  fig = go.Figure()

  for xd, yd, cls in zip(x_data, y_data, colors):
          fig.add_trace(go.Box(
              y=yd,
              name=xd,
              boxpoints='all',
              jitter=0.5,
              whiskerwidth=0.2,
              fillcolor=cls,
              marker_size=2,
              line_width=1,
              boxmean=True,
              text = title)
          )

  fig.update_layout(
      title=t,
      yaxis=dict(
          autorange=True,
          showgrid=True,
          zeroline=True,
          gridcolor='rgb(255, 255, 255)',
          zerolinecolor='rgb(255, 255, 255)',
          zerolinewidth=2,
      ),
      margin=dict(
          l=40,
          r=30,
          b=80,
          t=100,
      ),
      paper_bgcolor='rgb(243, 243, 243)',
      plot_bgcolor='rgb(243, 243, 243)',
      showlegend=False
  )

  fig.write_html(name_of_the_html_file)
  fig.show()

In [None]:
# Georgia state 2020 county level population distribution by race
# sample_data/html files/raw_county.html
# list2 is a list for raw data of county level population by race
list2 = [race['Hispanic'], race['White'], race['Black'], race['Asian'], race['Mixed'], race['Others']]
raw_distribution(list2, 'Georgia state 2020 county level population distribution by race', 'html_files/county_raw_distribution.html', race['Area Name'])

In [None]:
# precinct level data
precincts = pd.read_csv('data/polling_site_data/clean_precincts_with_polling_site.csv')

In [None]:
# raw precincts
# Georgia state 2020 precinct level population distribution by race
# sample_data/html files/raw_precinct.html
# list3 is a list for raw precinct level population data by race
list3 = [precincts['Hispanic'], precincts['White'], precincts['Black'], precincts['Asian'], precincts['Mixed'], precincts['Others']]
raw_distribution(list3, 'Georgia state 2020 precinct level population distribution by race', 'html_files/raw_precincts.html', precincts['Area Name'])

In [None]:
# general precinct racial density dist
# Georgia State 2020 precinct level population density distribution by race
# sample_data/html files/precinct_general_dist.html
# list4 is a list for precinct level population density by race
list4 = [precincts['Population Density: Hispanic'],precincts['Population Density: White'],precincts['Population Density: Black'],precincts['Population Density: Asian'], precincts['Population Density: Mixed'], precincts['Population Density: Others']]
distribution(list4, 'Georgia State 2020 precinct level population density distribution by race', 'html_files/precinct_general_dist.html', precincts['Area Name'])

# Identifying the outliers

Tukey's method function is based on [this website](https://towardsdatascience.com/detecting-and-treating-outliers-in-python-part-1-4ece5098b755)

In [49]:
#Tukey's method
def tukeys_method(variable):
    #Takes two parameters: dataframe & variable of interest as string
    q1 = variable.quantile(0.25)
    q3 = variable.quantile(0.75)
    iqr = q3-q1
    inner_fence = 1.5*iqr
    outer_fence = 3*iqr
    
    #inner fence lower and upper end
    inner_fence_le = q1-inner_fence
    inner_fence_ue = q3+inner_fence
    
    #outer fence lower and upper end
    outer_fence_le = q1-outer_fence
    outer_fence_ue = q3+outer_fence
    
    outliers_prob = []
    outliers_poss = []
    for index, x in enumerate(variable):
        if x <= outer_fence_le or x >= outer_fence_ue:
            outliers_prob.append(index)
    for index, x in enumerate(variable):
        if x <= inner_fence_le or x >= inner_fence_ue:
            outliers_poss.append(index)
    print(outliers_prob,outliers_poss, sep='\n')

In [50]:
# Tukey fences inner and outer performed
names = ['Outliers of county population ', 'Outliers of county level density', 'Outliers of precinct population', 'Outliers of precinct population density', 'Outliers of population density in precincts with polling sites', 'Outliers of population density in precincts without polling sites']
list_outliers = [list1, list2, list3, list4]

for lists, n in zip(list_outliers, ['Outliers of county population ', 'Outliers of county level density', 'Outliers of precinct population', 'Outliers of precinct population density']):
  print(n)
  for race_dist, r in zip(lists, ['Hispanic','White','Black','Asian','Mixed','Other']):
    print(r)
    tukeys_method(race_dist)      
  print('\n')


Outliers of county population 
Hispanic
[1, 49, 68, 154]
[1, 25, 34, 49, 66, 68, 127, 154]
White
[]
[30]
Black
[]
[]
Asian
[30, 32, 35, 43, 55, 57, 59, 66, 107]
[6, 24, 25, 28, 30, 32, 35, 43, 55, 57, 59, 66, 74, 75, 107, 127]
Mixed
[]
[25, 88, 90]
Other
[25]
[25, 32, 88, 90]


Outliers of county level density
Hispanic
[24, 27, 28, 30, 32, 43, 47, 57, 59, 66, 68, 74, 105, 154]
[6, 7, 21, 24, 27, 28, 30, 32, 35, 37, 43, 47, 55, 56, 57, 59, 66, 68, 74, 75, 105, 109, 120, 121, 154]
White
[24, 27, 32, 43, 57, 59, 66, 68, 109]
[7, 21, 24, 27, 28, 32, 35, 37, 43, 57, 59, 66, 68, 74, 75, 105, 109]
Black
[10, 24, 30, 32, 43, 46, 47, 59, 66, 74, 75, 91, 105, 106, 120, 121]
[10, 24, 28, 30, 32, 35, 37, 43, 46, 47, 55, 59, 66, 74, 75, 88, 91, 105, 106, 109, 120, 121]
Asian
[6, 10, 24, 27, 28, 30, 32, 35, 37, 43, 55, 57, 59, 66, 68, 74, 75, 105, 120]
[6, 10, 24, 27, 28, 30, 32, 35, 37, 43, 47, 55, 57, 59, 66, 68, 74, 75, 91, 105, 107, 109, 120]
Mixed
[24, 27, 32, 35, 43, 57, 59, 66, 74, 105, 120]


# Table of averages

A table of averages on the average racial density of all counties, all precincts and precincts with polling sites and precincts without polling sites.

In [None]:
# precincts with poling sites
withpolling = precincts[precincts['Polling Site 2020']==1]

# precincts without poling sites
withoutpolling = precincts[precincts['Polling Site 2020']==0]

In [None]:
# precincts with polling sites racial density distribution
# Georgia State 2020 population density distribution by race in precincts with polling sites
# list5 is a list population density by race in precincts with polling sites
list5 = [withpolling['Population Density: Hispanic'], withpolling['Population Density: White'], withpolling['Population Density: Black'], withpolling['Population Density: Asian'], withpolling['Population Density: Mixed'], withpolling['Population Density: Others']]

# precincts without polling sites population density distribution
# Georgia State 2020 population density distribution by race in precincts without polling sites
# list6 is a list of population densities by race in precincts without polling sites
list6 = [withoutpolling['Population Density: Hispanic'], withoutpolling['Population Density: White'], withoutpolling['Population Density: Black'], withoutpolling['Population Density: Asian'], withoutpolling['Population Density: Mixed'], withoutpolling['Population Density: Others']]

The following code was based on an example from https://plotly.com/python/table/#styled-table

In [None]:
import plotly.graph_objects as go

# list of all lists for averages by race
list_averages = [list4, list5, list6]

averages = []
for l in list_averages:
  av_list = []
  for i in l:
    av = sum(i)/len(i)
    av = round(av, 3)
    av_list.append(av)
  averages.append(av_list)
averages.insert(0, ['Hispanic','White','Black','Asian','Mixed','Other'])
# table
fig = go.Figure(data=[go.Table(
    header=dict(values=['Race', 'Average precinct population density', 
                        'Average population density in precincts with polling sites', 
                        'Average population density in precincts without polling sites'],
                line_color='darkslategray',
                fill_color='darksalmon',
                align='left'),
    cells=dict(values=averages,
               line_color='darkslategray',
               fill_color='white',
               align='left'))
])

fig.update_layout(width=1000, height=600)
fig.write_html('html_files/averages_table.html')
fig.show()