In [101]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import scipy
from scipy.stats import wilcoxon
import datetime
from dateutil.relativedelta import relativedelta


Set the constants needed to process the data.

Date of each election being processed.

Party Affiliation of Candidates

In [102]:
election_days = {
    2024: datetime.date(2024, 11, 5),
    2020: datetime.date(2020, 11, 3),
    2016: datetime.date(2016, 11, 8),
    2012: datetime.date(2012, 11, 6),
    2008: datetime.date(2008, 11, 4),
    2004: datetime.date(2004, 11, 2)
}

candidate_parties = {
    'Donald Trump': 'Republican', 
    'Hillary Rodham Clinton': 'Democrat', 
    'Barack Obama': 'Democrat', 
    'Mitt Romney': 'Republican', 
    'John McCain': 'Republican', 
    'George W. Bush': 'Republican', 
    'John Kerry': 'Democrat',
    'Harris': 'Democrat',
    'Trump': 'Republican',
    'Joseph R. Biden Jr.': 'Democrat'
}

fix_party = {'DEM': 'Democrat', 'REP': 'Republican'}

standard_states = {
    'Nebraska CD-1': 'NE-1',
    'Nebraska CD-2': 'NE-2',
    'Nebraska CD-3': 'NE-3',
    'Maine CD-1': 'ME-1',
    'Maine CD-2': 'ME-2'
}

**Load Datasets**

Load the data for past election results from

MIT Election Data and Science Lab, 2017, "U.S. President 1976–2020", https://doi.org/10.7910/DVN/42MVDX, Harvard Dataverse, V8, UNF:6:F0opd1IRbeYI9QyVfzglUw== [fileUNF]

In [103]:
past_elections = pd.read_csv('1976-2020-president.csv')

**Data Exploration**

In [104]:
## No exploration so far

**Data Processing**

Extract the important columns.

Then, for each state in each cycle, assign the party which won that state, this will be used later to label the data.

In [105]:
past_elections = past_elections[['state', 'year', 'party_detailed', 'candidatevotes']]

winning_parties = (past_elections.loc[past_elections.groupby(['state', 'year'])['candidatevotes'].idxmax()][['state', 'year', 'party_detailed']])

Load the polling data for all the elections from the two csv files.

Convert the dates to datetime format for easier processing, and rename the columns so that they match between the dataframes.

In [106]:
polling_20to24 = pd.read_csv('presidential_general_averages.csv')

## Convert columns to useable datatypes
polling_20to24['date'] = pd.to_datetime(polling_20to24['date']).dt.date
polling_20to24['cycle'] = polling_20to24['cycle'].astype(int)

polling_else = pd.read_csv('pres_pollaverages_1968-2016.csv')

## Convert columns to useable datatypes
polling_else['modeldate'] = pd.to_datetime(polling_else['modeldate']).dt.date
polling_else['cycle'] = polling_else['cycle'].astype(int)

## Extract important columns from both dataframes
important_20to24 = polling_20to24[['state', 'cycle', 'candidate', 'date', 'pct_estimate']].copy()
important_else = polling_else[['state', 'cycle', 'candidate_name', 'modeldate', 'pct_estimate']][polling_else['cycle'] >= 2004]

## Rename columns so that they match
important_20to24.rename(columns={'candidate': 'party'}, inplace=True)
important_else.rename(columns={'candidate_name': 'party', 'modeldate': 'date'}, inplace=True)

## Convert candidate names to their corresponding party
important_20to24['party'] = important_20to24['party'].map(lambda e: candidate_parties[e] if e in candidate_parties.keys() else e)
important_else['party'] = important_else['party'].map(lambda e: candidate_parties[e] if e in candidate_parties.keys() else e)

## Fill in missing values
important_20to24['pct_estimate'] = important_20to24['pct_estimate'].fillna(polling_20to24['pct_trend_adjusted'])

## Combine all cycles into one dataframe
combined_df = pd.concat([important_20to24, important_else], ignore_index=True)
combined_df.reset_index(inplace=True)
combined_df = combined_df[combined_df['state'] != 'National']

## Fix some inconsistent naming of the congressional districts for Nebraska and Maine
combined_df['state'] = combined_df['state'].map(lambda e: 'NE-2' if e == 'Nebraska Cd 2' else e)
combined_df['state'] = combined_df['state'].map(lambda e: 'ME-2' if e == 'Maine Cd 2' else e)

From our data exploration, we know that the 2024 data is missing for quite a few states, so we need to add those states using the raw polling data.
1. Load the raw polling data

In [107]:
poll_data = pd.read_csv('president_polls.csv')

Grab the most important columns

In [108]:
processed_polls = poll_data[['pollster_id', 'state', 'end_date', 'party', 'pct', 'office_type']]

## Ignore the rows with NaN states bc those are national polls
processed_polls = processed_polls[processed_polls['state'].notna()].reset_index()

## Convert the dates to datetime
processed_polls['end_date'] = pd.to_datetime(processed_polls['end_date'], format="mixed", dayfirst=False).dt.date

2. Get the pollsters with grades higher than 2.3. This ensures that the polling average is not skewed by biased pollsters.

In [109]:
top_pollsters = poll_data.groupby('pollster_id', as_index=False).agg({'numeric_grade': 'max'})
top_pollsters.sort_values(by='numeric_grade', ascending=False, inplace=True)
top_pollsters = top_pollsters[top_pollsters['numeric_grade'] > 1.6]['pollster_id'].to_list()

Remove all entries that aren't by the top pollsters. And fix the state names for the Nebraska and Maine districts

In [110]:
processed_polls = processed_polls[processed_polls['pollster_id'].isin(top_pollsters)]
processed_polls['party'] = processed_polls['party'].map(lambda e: fix_party[e] if e in fix_party.keys() else e)
processed_polls['state'] = processed_polls['state'].map(lambda e: standard_states[e] if e in standard_states.keys() else e)

3. Add the necessary columns to the polls to match the combined_df, and also reorder them too

In [111]:
processed_polls['cycle'] = 2024
processed_polls = processed_polls[['state', 'cycle', 'party', 'end_date', 'pct']].copy()

4. Concat the new data onto the combined data

In [112]:
processed_polls.rename(columns={'pct': 'pct_estimate', 'end_date': 'date'}, inplace=True)
combined_df = pd.concat([combined_df, processed_polls], ignore_index=True)

Some states don't have polls within the month of the 2024 election because of how recent it was, so we will instead use the most recent poll for those states.

In [113]:
most_recent = processed_polls.groupby('state')['date'].max().reset_index()
most_recent = most_recent[most_recent['date'] < (election_days[2024] + relativedelta(months=-1))].to_dict()

temp = {}
for k in most_recent['state'].keys():
    temp[most_recent['state'][k]] = most_recent['date'][k]

most_recent = temp
most_recent
processed_polls.groupby('state').count()

Unnamed: 0_level_0,cycle,party,date,pct_estimate
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,6,6,6,6
Alaska,33,33,33,33
Arizona,631,631,631,631
Arkansas,10,10,10,10
California,130,130,130,130
Colorado,33,33,33,33
Connecticut,13,13,13,13
Delaware,4,4,4,4
Florida,241,241,241,241
Georgia,658,658,658,658


For each of the elections being processed, get the one month average for each major party, 



In [114]:
processed_data = None

for year in election_days.keys():
    ## Set the dates needed, election day, one and two months before
    election_date = election_days[year]
    one_month_before = election_date + relativedelta(months=-1)
    two_month_before = election_date + relativedelta(months=-2)
    
    ## Filter combined dataframe with the dates
    one_month_filtered = combined_df[(combined_df['date'] >= one_month_before) & (combined_df['date'] <= election_date) & (combined_df['party'].isin(['Democrat', 'Republican']))].reset_index(drop=True)

    print(one_month_filtered[one_month_filtered['state'] == 'Alabama'])
    
    two_month_filtered = combined_df[(combined_df['date'] >= two_month_before) & (combined_df['date'] <= one_month_before) & (combined_df['party'].isin(['Democrat', 'Republican']))]

    if year == 2024:
        for st in most_recent.keys():
            date = most_recent[st]

            recent = combined_df[(combined_df['date'] >= date + relativedelta(months=-1)) & (combined_df['date'] <= date) & (combined_df['party'].isin(['Democrat', 'Republican'])) & (combined_df['state'] == st)]

            one_month_filtered = pd.concat([one_month_filtered, recent], ignore_index=True).reset_index(drop=True)
        
        two_month_filtered = two_month_filtered[~two_month_filtered['state'].isin(most_recent.keys())]

    ## Group by state then party, then get average over month
    grouped_one_month = one_month_filtered.groupby(['state', 'party'])['pct_estimate'].mean().reset_index()

    grouped_two_month = two_month_filtered.groupby(['state', 'party'])['pct_estimate'].mean().reset_index()

    ## Pivot dataframe so that the Democrat and Republican polling averages are on the same rows
    temp = grouped_two_month.pivot(index='state', columns='party', values='pct_estimate').reset_index()

    ## Reorder and rename columns
    temp = temp[['state', 'Republican', 'Democrat']]
    temp.columns = ['state', 'polling_avg_republican2', 'polling_avg_democrat2']

    temp['cycle'] = year

    ## Create a key to merge the one month and two month dataframes on
    temp['jkey'] = temp['state'] + temp['cycle'].astype(str)

    ## Repeat above for the one month dataframe
    result = grouped_one_month.pivot(index='state', columns='party', values='pct_estimate').reset_index()

    result = result[['state', 'Republican', 'Democrat']]

    result.columns = ['state', 'polling_avg_republican', 'polling_avg_democrat']

    result['cycle'] = year
    result['jkey'] = result['state'] + result['cycle'].astype(str)

    ## Merge the two dataframes
    result = pd.merge(result, temp[['polling_avg_republican2', 'polling_avg_democrat2', 'jkey']], on='jkey', how='outer')

    ## Convert the 2 month polling average columns into polling change columns
    result['polling_avg_republican2'] -= result['polling_avg_republican']
    result['polling_avg_democrat2'] -= result['polling_avg_democrat']

    result.columns = ['state', 'polling_avg_republican', 'polling_avg_democrat', 'cycle', 'jkey', 'month_change_republican', 'month_change_democrat']

    # result.drop(['jkey', 'cycle'], axis=1, inplace=True)

    ## Add the new dataframe onto the accumulator
    if processed_data is None:
        processed_data = result
    else:
        processed_data = pd.concat([processed_data, result], ignore_index=True).reset_index(drop=True)

processed_data.fillna(0, inplace=True)
processed_data.count()

Empty DataFrame
Columns: [index, state, cycle, party, date, pct_estimate]
Index: []
    index    state  cycle       party        date  pct_estimate
0     0.0  Alabama   2020    Democrat  2020-11-03      37.82732
1     1.0  Alabama   2020  Republican  2020-11-03      57.36126
2     2.0  Alabama   2020    Democrat  2020-11-02      37.82732
3     3.0  Alabama   2020  Republican  2020-11-02      57.36126
4     4.0  Alabama   2020    Democrat  2020-11-01      38.07011
..    ...      ...    ...         ...         ...           ...
59   59.0  Alabama   2020  Republican  2020-10-05      56.89531
60   60.0  Alabama   2020    Democrat  2020-10-04      38.79457
61   61.0  Alabama   2020  Republican  2020-10-04      56.16731
62   62.0  Alabama   2020    Democrat  2020-10-03      38.74995
63   63.0  Alabama   2020  Republican  2020-10-03      56.24868

[64 rows x 6 columns]
        index    state  cycle       party        date  pct_estimate
0     52757.0  Alabama   2016  Republican  2016-10-08    

state                      316
polling_avg_republican     316
polling_avg_democrat       316
cycle                      316
jkey                       316
month_change_republican    316
month_change_democrat      316
dtype: int64

**Labeling Data**

Now that we have the processed data, all that is left is for us to label the data since we will be using supervised learning.

The data labeling will consist of a new column which will contain either Democrat or Republican, marking which party won that state in the given cycle.

1. First, add the merging key to the winning_parties dataframe to prepare for merging with the processed data

In [115]:
winning_parties['jkey'] = winning_parties['state'].map(lambda e: e.lower()) + winning_parties['year'].astype(str)

2. Ensure that the merging key will match properly, so convert all to lower case.

In [116]:
processed_data['jkey'] = processed_data['jkey'].map(lambda e: e.lower())

3. Merge the two dataframes so that the data is all labeled, except for the 2024 data, which will need to be filled in manually.

**Make sure to check for NaN values, especially due to the Maine and Nebraska splitting their electoral votes.**

In [117]:
# result = pd.merge(result, temp[['polling_avg_republican2', 'polling_avg_democrat2', 'jkey']], on='jkey', how='outer')
processed_data = pd.merge(processed_data, winning_parties[['party_detailed', 'jkey']], on='jkey', how='left')

In [118]:
processed_data.rename(columns={'party_detailed': 'winning_party'}, inplace=True)
processed_data.count()

state                      316
polling_avg_republican     316
polling_avg_democrat       316
cycle                      316
jkey                       316
month_change_republican    316
month_change_democrat      316
winning_party              248
dtype: int64

In [119]:
processed_data[processed_data['cycle'] == 2024].count()

state                      55
polling_avg_republican     55
polling_avg_democrat       55
cycle                      55
jkey                       55
month_change_republican    55
month_change_democrat      55
winning_party               0
dtype: int64

There it is, there are 52 entries from the 2024 cycle which are expected to not have labels, yet 65 rows are missing a winning_party label.

4. We need to fix some of that missing data, unfortunately this will need to be done manually.

Get the merge keys for the missing values that are not from 2024. Any 2024 cycle row is expected to not have a label because the official results are not out yet and thus aren't in any dataset.

In [120]:
missing_jkeys = processed_data[(processed_data['winning_party'].isna()) & (processed_data['cycle'] != 2024)]['jkey'].to_list()
missing_jkeys

['me-12020',
 'me-22020',
 'ne-22020',
 'me-12016',
 'me-22016',
 'ne-12016',
 'ne-22016',
 'ne-32016',
 'me-12012',
 'me-22012',
 'ne-12012',
 'ne-22012',
 'ne-32012']

In [121]:
processed_data[(processed_data['winning_party'].isna()) & (processed_data['cycle'] != 2024)][['state', 'cycle']]

Unnamed: 0,state,cycle
74,ME-1,2020
75,ME-2,2020
84,NE-2,2020
128,ME-1,2016
129,ME-2,2016
138,NE-1,2016
139,NE-2,2016
140,NE-3,2016
182,ME-1,2012
183,ME-2,2012


Create a dataframe with the missing data, using the jkey.

In [122]:
missing_data = {
    'jkey': missing_jkeys,
    'winning_party': [
        'Democrat',
        'Republican',
        'Democrat',
        'Democrat',
        'Republican',
        'Republican',
        'Republican',
        'Republican',
        'Democrat',
        'Democrat',
        'Republican',
        'Republican',
        'Republican'
    ]
}

Now merge it with the dataset

In [123]:
for key in missing_data['jkey']:
    processed_data.loc[processed_data['jkey'] == key, 'winning_party'] = missing_data['winning_party'][missing_data['jkey'].index(key)]

Check that it worked. It did, only 52 entries are missing labels. This is expected.

In [124]:
processed_data.count()

state                      316
polling_avg_republican     316
polling_avg_democrat       316
cycle                      316
jkey                       316
month_change_republican    316
month_change_democrat      316
winning_party              261
dtype: int64

**Final Fixups**

Time to add the labels for 2024, unfortunately this must be done manually since the 2024 results aren't available in any dataset yet.

In [125]:
election_results = {
    "Alabama": "Republican",
    "Alaska": "Republican",
    "Arizona": "Republican",
    "Arkansas": "Republican",
    "California": "Democrat",
    "Colorado": "Democrat",
    "Connecticut": "Democrat",
    "Delaware": "Democrat",
    "Florida": "Republican",
    "Georgia": "Republican",
    "Hawaii": "Democrat",
    "Idaho": "Republican",
    "Illinois": "Democrat",
    "Indiana": "Republican",
    "Iowa": "Republican",
    "Kansas": "Republican",
    "Kentucky": "Republican",
    "Louisiana": "Republican",
    "Maine": "Democrat",
    "ME-1": "Democrat", 
    "ME-2": "Republican",
    "Maryland": "Democrat",
    "Massachusetts": "Democrat",
    "Michigan": "Republican",
    "Minnesota": "Democrat",
    "Mississippi": "Republican",
    "Missouri": "Republican",
    "Montana": "Republican",
    "Nebraska": "Republican", 
    "NE-1": "Republican", 
    "NE-2": "Democrat", 
    "NE-3": "Republican",
    "Nevada": "Republican",
    "New Hampshire": "Democrat",
    "New Jersey": "Democrat",
    "New Mexico": "Democrat",
    "New York": "Democrat",
    "North Carolina": "Republican",
    "North Dakota": "Republican",
    "Ohio": "Republican",
    "Oklahoma": "Republican",
    "Oregon": "Democrat",
    "Pennsylvania": "Republican",
    "Rhode Island": "Democrat",
    "South Carolina": "Republican",
    "South Dakota": "Republican",
    "Tennessee": "Republican",
    "Texas": "Republican",
    "Utah": "Republican",
    "Vermont": "Democrat",
    "Virginia": "Democrat",
    "Washington": "Democrat",
    "West Virginia": "Republican",
    "Wisconsin": "Republican",
    "Wyoming": "Republican",
    "District of Columbia": "Democrat"
}

In [126]:
for v in sorted(election_results.keys()):
    if v not in list(processed_data[processed_data['cycle'] == 2024]['state'].unique()):
        print(v)

District of Columbia


Apply those results to the combined dataframe

In [127]:
for key in election_results.keys():
    if key != "District of Columbia":
        processed_data.loc[(processed_data['state'] == key) & (processed_data['cycle'] == 2024), 'winning_party'] = election_results[key]

Give it one final check with count

In [128]:
processed_data.count()

state                      316
polling_avg_republican     316
polling_avg_democrat       316
cycle                      316
jkey                       316
month_change_republican    316
month_change_democrat      316
winning_party              316
dtype: int64

**And nothing is missing, meaning, that we are finally done with processing the polling data.**