# Electoral College Data ETL

The purpose of this project is to analyze how the electoral college votes were distributed after the 2010 census, look at how the redistribution of population (by estimate) has shifted over time until today, and what that means for the number of voters per electoral college vote in each state. I will also analyze how we expect the electoral college to be redistributed after the 2020 census, given census bureau predictions.

An analysis will also be performed on the percentage likelihood of each state giving its electoral college votes to a particular party and their respective nominees, based purely on historical data. The aim is to demonstrate which states have the greatest power per vote, given both their current electoral votes allotted and the likelihood of that state assigning their votes to either party's candidate.

## Import

In [32]:
#import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
import json

from bs4 import BeautifulSoup
import re

import sqlite3
%matplotlib inline

## Support Functions

In [2]:
#define how to retrieve api keys

def get_keys(path):
    """
    Reads the JSON file found at the given path and returns its parsed contents
    (used in this notebook to load api keys that are stored outside of it)
    """
    with open(path) as key_file:
        parsed_keys = json.load(key_file)
    return parsed_keys

In [3]:
def reciprocal_geometric_mean(next_house_seat):
    """
    Returns 1 / sqrt(n * (n - 1)), the reciprocal of the geometric mean of n and n - 1,
    where n is the next house seat a state could potentially receive
    """
    seat_product = next_house_seat * (next_house_seat - 1)
    return 1 / np.sqrt(seat_product)

In [4]:
def priority_value(state_pop, next_house_seat):
    """
    Returns the priority value a state has for receiving another house seat: its population
    multiplied by reciprocal_geometric_mean(next_house_seat), rounded and returned as an int.
    The highest priority value state gets the next house seat available.
    """
    multiplier = reciprocal_geometric_mean(next_house_seat)
    weighted_pop = state_pop * multiplier
    return int(round(weighted_pop, 0))

In [5]:
def state_status_scrape(state_start):
    """
    From a beautiful soup webscrape, takes the tag just prior to the first state with NPVIC information
    and iterates through all following siblings to make and return a dictionary of all states and their
    NPVIC status

    state_start: tag whose following siblings each hold the text "State - status" (or just "State")
    returns:     dictionary of state name -> status text ('' when the entry has no dash / status)

    Iteration stops at the first sibling that has no contents or holds only a non-breaking space.
    """
    state_npvic_status = {}
    for p in state_start.find_next_siblings():
        #stop at the first empty (or nbsp-only) tag
        if p.contents == [] or p.contents == ['\xa0']:
            break

        soup_state_status = p.get_text().replace('\xa0', ' ')
        #split on the first dash and trim both halves, so the result does not rely on there being
        #exactly one space on each side of the dash; without a dash the status comes back as ''
        state, _, status = soup_state_status.partition('-')
        state_npvic_status[state.strip()] = status.strip()

    return state_npvic_status

In [6]:
def state_status_to_numerical(state_npvic_status):
    """
    Replaces every status phrase in the dictionary (in place) with a value for how close the state is
    to joining the NPVIC, then returns that same dictionary. The first matching phrase wins:
    'Enacted into law' (joined: passed legislature and signed by governor): 1.0
    'Passed' (passed by one or two branches of state legislature):          0.5
    'approved' (approved by house committee):                               0.25
    anything else (states with no movement):                                0

    """
    phrase_scores = (('Enacted into law', 1.0), ('Passed', 0.5), ('approved', 0.25))

    for name, description in list(state_npvic_status.items()):
        score = 0
        for phrase, value in phrase_scores:
            if phrase in description:
                score = value
                break
        state_npvic_status[name] = score

    return state_npvic_status

In [7]:
def sql_table_creation(connection, cursor, table_name, *argv):
    """
    Creates the table ``table_name`` in the sqlite database; if the table already exists, the existing
    table is dropped and created newly.

    connection: open database connection, used to commit the change
    cursor:     cursor on that connection, used to run the statements
    table_name: name of the table to create
    *argv:      one string per column, written as '<column name> <data type>' (e.g. 'Year integer')

    Raises ValueError when no column is given. Prints which table was created (and dropped).

    NOTE: the table name and column definitions are formatted straight into the SQL text (identifiers
    cannot be bound as query parameters), so only pass trusted values.
    """
    if not argv:
        raise ValueError('at least one column definition is required')

    var_string = ', '.join(argv)
    #only the column names (the first word of every definition) are echoed in the printed message
    var_list = ', '.join(arg.split(' ')[0] for arg in argv)
    create_statement = 'CREATE TABLE {} ({})'.format(table_name, var_string)

    try:
        #use the cursor/connection that were passed in rather than notebook-level globals
        cursor.execute(create_statement)
    except sqlite3.OperationalError:
        #creation failed, normally because the table already exists: drop it and build it again
        #(IF EXISTS lets any other creation error resurface from the second attempt)
        cursor.execute('DROP TABLE IF EXISTS {}'.format(table_name))
        print('{} table dropped'.format(table_name))
        cursor.execute(create_statement)
        connection.commit()
        print('{} table created ({})'.format(table_name, var_list))
    else:
        connection.commit()
        print('{} Table created ({})'.format(table_name, var_list))

## Data Collection

### API Request

Retrieve my personal API key for querying the Census Bureau.

In [8]:
#get key for census bureau api
#NOTE(review): this path is specific to one machine; point it at your own key file before running
key_path = "/Users/flatironschool/.secret/census_api.json"
keys = get_keys(key_path)

#the key file is expected to store the key under 'api_key'
api_key = keys['api_key']

After a ton of research into the Census Bureau API and the information available, I found the right link, the right variables, and the right order for the parameters to send for the information I needed.

In [9]:
#make and print request for census count and estimates from 2010-2019 for all states
year = '2019'

url = 'https://api.census.gov/data/{}/pep/population'.format(year)

variables = ['DATE_CODE', 'DATE_DESC', 'POP', 'NAME']

granularity = 'state:*'

params = {'get': ','.join(variables), 'for': granularity, 'key': api_key}

#a timeout keeps the notebook from hanging indefinitely if the API never answers
r = requests.get(url, params=params, timeout=30)

#show the request that was sent, with the private api key masked so it is never written to saved output
print(re.sub(r'([?&]key=)[^&]*', r'\1<REDACTED>', r.url))
print(r)
print(r.text[:1000])

https://api.census.gov/data/2019/pep/population?get=DATE_CODE%2CDATE_DESC%2CPOP%2CNAME&for=state%3A%2A&key=<REDACTED>
<Response [200]>
[["DATE_CODE","DATE_DESC","POP","NAME","state"],
["1","4/1/2010 Census population","5303925","Minnesota","27"],
["2","4/1/2010 population estimates base","5303927","Minnesota","27"],
["3","7/1/2010 population estimate","5310828","Minnesota","27"],
["4","7/1/2011 population estimate","5346143","Minnesota","27"],
["5","7/1/2012 population estimate","5376643","Minnesota","27"],
["6","7/1/2013 population estimate","5413479","Minnesota","27"],
["7","7/1/2014 population estimate","5451079","Minnesota","27"],
["8","7/1/2015 population estimate","5482032","Minnesota","27"],
["9","7/1/2016 population estimate","5522744","Minnesota","27"],
["10","7/1/2017 population estimate","5566230","Minnesota","27"],
["11","7/1/2018 population estimate","5606249","Minnesota","27"],
["12","7/1/2019 population estimate","5639632","Minnesota","27"],

In [91]:
#take the api response, turn it into a json, then parse that json into a pandas dataframe
#(the first row of the response holds the column names)
data = r.json()
pop_df = pd.DataFrame(data[1:], columns=data[0])

#cut down the date column to just the year: take the four digits that follow a slash, which works for
#any M/D/YYYY date (a fixed character slice only works while month and day are single digits)
pop_df['YEAR'] = pop_df.DATE_DESC.str.extract(r'/(\d{4})', expand=False)

#eliminate the census population estimate for the same year it was recorded and the July date of that same year
pop_df.drop(pop_df[(pop_df.DATE_CODE == '2') | (pop_df.DATE_CODE == '3')].index, inplace=True)

#drop unnecessary columns and reset the index
pop_df.drop(['state','DATE_CODE', 'DATE_DESC'], axis=1, inplace=True)
pop_df.reset_index(drop=True, inplace=True)

#set the population to an integer val, reorder and rename the columns
pop_df['POP'] = pop_df.POP.astype('int64')
pop_df = pop_df[['YEAR', 'NAME', 'POP']]
pop_df.columns = ['Year', 'State', 'Pop']

display(pop_df.head(15))
#info() prints its summary itself and returns None, so wrapping it in display() only adds a stray "None"
pop_df.info()

Unnamed: 0,Year,State,Pop
0,2010,Minnesota,5303925
1,2011,Minnesota,5346143
2,2012,Minnesota,5376643
3,2013,Minnesota,5413479
4,2014,Minnesota,5451079
5,2015,Minnesota,5482032
6,2016,Minnesota,5522744
7,2017,Minnesota,5566230
8,2018,Minnesota,5606249
9,2019,Minnesota,5639632


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 3 columns):
Year     520 non-null object
State    520 non-null object
Pop      520 non-null int64
dtypes: int64(1), object(2)
memory usage: 12.3+ KB


None

So we finally end up with the information needed: actual census data from 2010, and the estimates the Census Bureau has made about the population since the last official census. What these numbers don't include, though (and what is crucial when calculating who gets how many electoral votes), is the number of citizens of these states living overseas. Let's get that data next.

### Overseas Population Import

Sadly, I couldn't find this info through the API, and had to download the spreadsheet directly from the Census Bureau.

In [92]:
#read the overseas population spreadsheet (skipping its first 7 rows) and discard rows with missing values
overseas_df = pd.read_excel('Overseas Population 2010.xls', skiprows=7).dropna()
overseas_df.columns = ['State', 'Overseas_pop']

#tag every row with the census year (kept as a string) and store the counts as integers
overseas_df = overseas_df.assign(Year='2010',
                                 Overseas_pop=overseas_df['Overseas_pop'].astype('int64'))
overseas_df = overseas_df.reset_index(drop=True)

display(overseas_df.head())
display(overseas_df.info())

Unnamed: 0,State,Overseas_pop,Year
0,Alabama,23246,2010
1,Alaska,11292,2010
2,Arizona,20683,2010
3,Arkansas,10311,2010
4,California,88033,2010


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 3 columns):
State           52 non-null object
Overseas_pop    52 non-null int64
Year            52 non-null object
dtypes: int64(1), object(2)
memory usage: 1.3+ KB


None

This data is only for the year the 2010 census was completed, but where are the estimates? After some digging, I found a court filing where the process of calculating estimates for overseas population per state was discussed, but the Census Bureau said effectively that to survey overseas populations for estimation purposes was too costly and too time intensive (read: we don't have a big enough budget for that). While they would continue to measure the overseas population during the actual Census, there is no produced data for overseas estimates.

So, let's make it instead! As my real focus is on the electoral college, which is based on the total population (measured population in the US + Overseas population) I need something to predict what the 2020 electoral college will look like. Since 2010, a number of our troops have come home from the Middle East, but the movement of populations in and out of states shouldn't be affected by troops coming home or other reasons state residents are abroad. Soldiers and other ex-pats are just as likely to move to another state when coming home as the next individual. I therefore was comfortable extrapolating the overseas population for 2011 - 2019 such that its growth or decline matched the movement of the non-overseas population from year to year.

I calculate the percent change in the non-overseas population from year to year for each state, and adjust the overseas population by that same percentage.

In [94]:
#merge the overseas data with the main population estimates table and extrapolate overseas estimates based on
#percentage change in the population year over year

#merge the population estimates with overseas data on State and Year
#(a left merge keeps every population row; rows without a matching overseas record get NaN in Overseas_pop)
merged_df = pop_df.merge(overseas_df, how='left', on=['State','Year'])

#create our Percent change in population column
#NOTE(review): shift(1) compares each row with the row directly above it, without grouping by state, so the
#first row of a state is measured against the last row of the state before it (and the very first row is NaN);
#that value is only used below when the row's Overseas_pop is missing - confirm every state's first row has one
merged_df['Percent_change'] = (merged_df.Pop - merged_df.Pop.shift(1)) / merged_df.Pop.shift(1)

#iterate through the missing data in overseas population, calculating the next missing value from the previous known
#(from the 2010 census) or from the last calculated value based on percentage change in that state
#each filled value is previous_row_value * (1 + percent_change), rounded to a whole number, and it feeds the
#next row's calculation, so the rows must be processed in order
#NOTE(review): iloc[i] is positional while .at[i, ...] uses the index label - assumes the merged frame has a
#default 0..n-1 index; for i == 0, iloc[i-1] would read the last row of the frame
for i in range(len(merged_df)):
    if pd.isna(merged_df.iloc[i]['Overseas_pop']):
        merged_df.at[i,'Overseas_pop'] = round(merged_df.iloc[i]['Percent_change'] * \
                                               merged_df.iloc[i-1]['Overseas_pop'] + \
                                               merged_df.iloc[i-1]['Overseas_pop'], 0)
        
#the helper column is no longer needed; store the counts and the year as integers
merged_df.drop('Percent_change', axis=1, inplace=True)        
merged_df['Overseas_pop'] = merged_df.Overseas_pop.astype('int64')
merged_df['Year'] = merged_df.Year.astype('int64')

display(merged_df.head(25))
display(merged_df.info())

Unnamed: 0,Year,State,Pop,Overseas_pop
0,2010,Minnesota,5303925,10954
1,2011,Minnesota,5346143,11041
2,2012,Minnesota,5376643,11104
3,2013,Minnesota,5413479,11180
4,2014,Minnesota,5451079,11258
5,2015,Minnesota,5482032,11322
6,2016,Minnesota,5522744,11406
7,2017,Minnesota,5566230,11496
8,2018,Minnesota,5606249,11579
9,2019,Minnesota,5639632,11648


<class 'pandas.core.frame.DataFrame'>
Int64Index: 520 entries, 0 to 519
Data columns (total 4 columns):
Year            520 non-null int64
State           520 non-null object
Pop             520 non-null int64
Overseas_pop    520 non-null int64
dtypes: int64(3), object(1)
memory usage: 40.3+ KB


None

### Scraping National Popular Vote Interstate Compact State Status

The thing that could upend this whole electoral college institution is the NPVIC, which goes into effect once the states who have ratified it have enough electoral votes to add up to 270. Every state who has signed says they will give all their electoral votes to whomever wins the popular vote NATIONWIDE, not just who wins the popular vote in their particular state (it has some pretty solid footing for being legitimate by the constitution). I want to look at how close we actually are to implementing this, if it's even a reality. I found a website that updates as states move forward in their ratification of the NPVIC, and shows how far (or not) a state is in the process.

We do a little webscraping using BeautifulSoup to find the information we need.

In [100]:
#download the NPVIC state status page and parse its html with BeautifulSoup
#(note: this reuses the name url from the census request above)
url = "https://www.nationalpopularvote.com/state-status"
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')

In [101]:
#skips over the initial list (and links) of states at the top of the page by locating the <p> tag
#whose text matches "On the map below"; the state statuses are found relative to this tag
init_state_skip = soup.find("p", text=re.compile("On the map below"))

In [102]:
#brings us to the tag just before the states and their current NPVIC status
#(two siblings past the "On the map below" paragraph)
tag_before_states = init_state_skip.find_next_sibling().find_next_sibling()

#print the tag and the one after it to check that the next sibling is the first state
print(tag_before_states)
print(tag_before_states.find_next_sibling())

<p> </p>
<p><a class="menu__link" href="/state/ak">Alaska</a></p>


In [103]:
#scrape the status text of every state, then convert each status to its numerical score
#(the conversion rewrites the scraped dict in place, so both names refer to the same dict)
npvic_word_status = state_status_scrape(tag_before_states)
npvic_num_status = state_status_to_numerical(npvic_word_status)

#build the dataframe with the dict keys (states) as the index, then move them into a regular column
npvic_df = pd.DataFrame.from_dict(npvic_num_status, orient='index').reset_index()
npvic_df.columns = ['State', 'Status']

display(npvic_df.head())
display(npvic_df.info())

Unnamed: 0,State,Status
0,Alaska,0.0
1,Alabama,0.0
2,Arkansas,0.5
3,Arizona,0.5
4,California,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
State     51 non-null object
Status    51 non-null float64
dtypes: float64(1), object(1)
memory usage: 944.0+ bytes


None

### Election History Data

Now what really matters to most people is how our elections will be affected, and for candidates who are running a presidential campaign, what states are safe, and which they should be worried about. From the MIT Election Lab (https://electionlab.mit.edu/data) I found a dataset for all presidential elections, by state, from 1976 to 2016.

We do a little EDA on the data (details below) and reorganize the columns into something more readable.

Some things of note:
- In exploring the data, I found one candidate who ran under a combined "democrat/republican" party label (James Gritz, only in CO, and in 1992), but that entry was excluded from the purely democrat/republican divide I want to study here
- Any write in candidates were also not considered, as they are effectively outliers for this study
- This dataset was only for US Presidential elections, so the office column is superfluous
- In 2000, 2004, 2012, nominees Al Gore, John Kerry and Barack Obama had to run under the 'democratic-farmer-labor' party. That party name has been changed to 'democrat' for ease of analysis

In [107]:
vote_history = pd.read_csv('1976-2016-president.csv')

#changing the democratic-farmer-labor party label to just democrat wherever it appears
vote_history.replace(to_replace='democratic-farmer-labor', value='democrat', inplace=True)

#keep only the rows of the two main parties that are not write ins
vote_history = vote_history[(vote_history['party'].isin(['democrat', 'republican'])) &
                            (vote_history['writein'] == False)]

#get rid of unnecessary columns; the result is reassigned because an inplace drop on the filtered
#selection above can trigger pandas' SettingWithCopyWarning
drop_vote_history_cols = ['state_po', 'state_fips', 'state_cen', 'writein', 'state_ic', 'office', 'version', 'notes']
vote_history = vote_history.drop(drop_vote_history_cols, axis=1)

#combine two rows for each state into one row with all info for that year/state
#(the democrat and republican rows are matched on year, state and totalvotes; the merge adds the
#_x suffix to the democrat columns and _y to the republican columns, which are then renamed)
vote_history = vote_history.sort_values(['year', 'state', 'party'])
dems = vote_history[vote_history.party == 'democrat'].drop('party', axis=1)
reps = vote_history[vote_history.party == 'republican'].drop('party', axis=1)
vote_history = dems.merge(reps, on=['year', 'state', 'totalvotes'])
vote_history.rename({'candidate_x': 'dem_cand', 'candidatevotes_x': 'dem_vote',
                     'candidate_y': 'rep_cand', 'candidatevotes_y': 'rep_vote'}, axis = 1, inplace=True)

#move the column totalvotes to the end of the dataframe
new_col_order = list(vote_history.columns)
new_col_order.remove('totalvotes')
new_col_order.append('totalvotes')

vote_history = vote_history[new_col_order]

display(vote_history.head(10))
#info() prints its summary itself and returns None, so wrapping it in display() only adds a stray "None"
vote_history.info()

Unnamed: 0,year,state,dem_cand,dem_vote,rep_cand,rep_vote,totalvotes
0,1976,Alabama,"Carter, Jimmy",659170,"Ford, Gerald",504070,1182850
1,1976,Alaska,"Carter, Jimmy",44058,"Ford, Gerald",71555,123574
2,1976,Arizona,"Carter, Jimmy",295602,"Ford, Gerald",418642,742719
3,1976,Arkansas,"Carter, Jimmy",498604,"Ford, Gerald",267903,767535
4,1976,California,"Carter, Jimmy",3742284,"Ford, Gerald",3882244,7803770
5,1976,Colorado,"Carter, Jimmy",460801,"Ford, Gerald",584278,1081440
6,1976,Connecticut,"Carter, Jimmy",647895,"Ford, Gerald",719261,1386355
7,1976,Delaware,"Carter, Jimmy",122461,"Ford, Gerald",109780,235642
8,1976,District of Columbia,"Carter, Jimmy",137818,"Ford, Gerald",27873,168830
9,1976,Florida,"Carter, Jimmy",1636000,"Ford, Gerald",1469531,3150631


<class 'pandas.core.frame.DataFrame'>
Int64Index: 561 entries, 0 to 560
Data columns (total 7 columns):
year          561 non-null int64
state         561 non-null object
dem_cand      561 non-null object
dem_vote      561 non-null int64
rep_cand      561 non-null object
rep_vote      561 non-null int64
totalvotes    561 non-null int64
dtypes: int64(4), object(3)
memory usage: 35.1+ KB


None

### Merging all data into SQL Database

We'll take all this data that we've collected through APIs, webscraping, and downloaded CSV files, and throw them all into a SQL database for easier analysis in another notebook!

In [110]:
#put all data into SQL database for retrieval in analysis
#open a connection to the sqlite file census_pop_data.db and get a cursor for running statements
conn = sqlite3.connect('census_pop_data.db')
c = conn.cursor()

#### Population Data

In [111]:
# sql_table_creation(conn, c, 'Population', 'Year integer', 'State text', 'Pop integer', 'Overseas_pop integer')

In [112]:
#write the population dataframe to the Population table, replacing the table if it already exists
#NOTE(review): to_sql's index argument is left at its default - presumably this also stores the dataframe
#index as an extra column; confirm that is wanted
merged_df.to_sql(name='Population', con=conn, if_exists='replace')

#### NPVIC Status

In [113]:
# sql_table_creation(conn, c, 'NPVIC_status', 'State text', 'Status real')

In [114]:
#write the NPVIC status dataframe to the NPVIC_status table, replacing the table if it already exists
#NOTE(review): to_sql's index argument is left at its default - presumably this also stores the dataframe
#index as an extra column; confirm that is wanted
npvic_df.to_sql(name='NPVIC_status', con=conn, if_exists='replace')

#### Election Results

In [115]:
# sql_table_creation(conn, c, 'Election_results', 'Year integer', 'State text', 'Dem_cand text', 'Dem_vote integer', 
#                    'Rep_cand text', 'Rep_vote integer', 'Totalvotes integer')

In [116]:
#write the election history dataframe to the Election_results table, replacing the table if it already exists
#NOTE(review): to_sql's index argument is left at its default - presumably this also stores the dataframe
#index as an extra column; confirm that is wanted
vote_history.to_sql(name='Election_results', con=conn, if_exists='replace')

### Check the input

In [117]:
#read the population table back from the database and print every row to check what was stored
c.execute('SELECT year, state, pop, overseas_pop FROM population')
for row in c.fetchall():
    print(row)

(2010, 'Minnesota', 5303925, 10954)
(2011, 'Minnesota', 5346143, 11041)
(2012, 'Minnesota', 5376643, 11104)
(2013, 'Minnesota', 5413479, 11180)
(2014, 'Minnesota', 5451079, 11258)
(2015, 'Minnesota', 5482032, 11322)
(2016, 'Minnesota', 5522744, 11406)
(2017, 'Minnesota', 5566230, 11496)
(2018, 'Minnesota', 5606249, 11579)
(2019, 'Minnesota', 5639632, 11648)
(2010, 'Mississippi', 2967297, 10943)
(2011, 'Mississippi', 2978731, 10985)
(2012, 'Mississippi', 2983816, 11004)
(2013, 'Mississippi', 2988711, 11022)
(2014, 'Mississippi', 2990468, 11028)
(2015, 'Mississippi', 2988471, 11021)
(2016, 'Mississippi', 2987938, 11019)
(2017, 'Mississippi', 2988510, 11021)
(2018, 'Mississippi', 2981020, 10993)
(2019, 'Mississippi', 2976149, 10975)
(2010, 'Missouri', 5988927, 22551)
(2011, 'Missouri', 6010275, 22631)
(2012, 'Missouri', 6024367, 22684)
(2013, 'Missouri', 6040715, 22746)
(2014, 'Missouri', 6056202, 22804)
(2015, 'Missouri', 6071732, 22862)
(2016, 'Missouri', 6087135, 22920)
(2017, 'Missour

In [118]:
#read the NPVIC status table back from the database and print every row to check what was stored
c.execute('SELECT state, status FROM npvic_status')
for row in c.fetchall():
    print(row)

('Alaska', 0.0)
('Alabama', 0.0)
('Arkansas', 0.5)
('Arizona', 0.5)
('California', 1.0)
('Colorado', 1.0)
('Connecticut', 1.0)
('District of Columbia', 1.0)
('Delaware', 1.0)
('Florida', 0.0)
('Georgia', 0.25)
('Hawaii', 1.0)
('Iowa', 0.0)
('Idaho', 0.0)
('Illinois', 1.0)
('Indiana', 0.0)
('Kansas', 0.0)
('Kentucky', 0.0)
('Louisiana', 0.0)
('Massachusetts', 1.0)
('Maryland', 1.0)
('Maine', 0.5)
('Michigan', 0.5)
('Minnesota', 0.5)
('Missouri', 0.25)
('Mississippi', 0.0)
('Montana', 0.0)
('North Carolina', 0.5)
('North Dakota', 0.0)
('Nebraska', 0.0)
('New Hampshire', 0.0)
('New Jersey', 1.0)
('New Mexico', 1.0)
('Nevada', 0.5)
('New York', 1.0)
('Ohio', 0.0)
('Oklahoma', 0.5)
('Oregon', 1.0)
('Pennsylvania', 0.0)
('Rhode Island', 1.0)
('South Carolina', 0.0)
('South Dakota', 0.0)
('Tennessee', 0.0)
('Texas', 0.0)
('Utah', 0.0)
('Virginia', 0.0)
('Vermont', 1.0)
('Washington', 1.0)
('Wisconsin', 0.0)
('West Virginia', 0.0)
('Wyoming', 0.0)


In [119]:
#read the election results table back from the database and print every row to check what was stored
c.execute('SELECT year, state, dem_cand, dem_vote, rep_cand, rep_vote, totalvotes FROM election_results')
for row in c.fetchall():
    print(row)

(1976, 'Alabama', 'Carter, Jimmy', 659170, 'Ford, Gerald', 504070, 1182850)
(1976, 'Alaska', 'Carter, Jimmy', 44058, 'Ford, Gerald', 71555, 123574)
(1976, 'Arizona', 'Carter, Jimmy', 295602, 'Ford, Gerald', 418642, 742719)
(1976, 'Arkansas', 'Carter, Jimmy', 498604, 'Ford, Gerald', 267903, 767535)
(1976, 'California', 'Carter, Jimmy', 3742284, 'Ford, Gerald', 3882244, 7803770)
(1976, 'Colorado', 'Carter, Jimmy', 460801, 'Ford, Gerald', 584278, 1081440)
(1976, 'Connecticut', 'Carter, Jimmy', 647895, 'Ford, Gerald', 719261, 1386355)
(1976, 'Delaware', 'Carter, Jimmy', 122461, 'Ford, Gerald', 109780, 235642)
(1976, 'District of Columbia', 'Carter, Jimmy', 137818, 'Ford, Gerald', 27873, 168830)
(1976, 'Florida', 'Carter, Jimmy', 1636000, 'Ford, Gerald', 1469531, 3150631)
(1976, 'Georgia', 'Carter, Jimmy', 979409, 'Ford, Gerald', 483743, 1463152)
(1976, 'Hawaii', 'Carter, Jimmy', 147375, 'Ford, Gerald', 140003, 291301)
(1976, 'Idaho', 'Carter, Jimmy', 126549, 'Ford, Gerald', 204151, 340932)

In [120]:
#all tables are written and checked: close the cursor first, then the database connection
c.close()
conn.close()