This notebook analyzes predictions for only the 15 battleground states in the 2020 election. 

In [58]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sb

%matplotlib inline

In [59]:
BATTLEGROUNDS = {
    'AZ': 11,
    'FL': 29,
    'GA': 16,
    'IA': 6,
    'ME2': 1, 
    'MI': 16,
    'ME': 2,
    'NE2': 1,
    'NV': 6,
    'NH': 4,
    'NC': 15,
    'OH': 18,
    'PA': 20,
    'TX': 38,
    'WI': 10
}

def get_evs(abbr):
    return BATTLEGROUNDS.get(abbr)

In [60]:
WINNERS_PATH = os.path.join(DATA_DIR, 'winners.csv')
winners = pd.read_csv(WINNERS_PATH)
evs_called = pd.Series(winners[
    (winners["office"] == "P") &
    (winners["winner"].notnull())
]["state"].unique()).apply(get_evs).sum()

In [61]:
winners.head()

Unnamed: 0,office,state,winner,dem_diff,dem_share,dem_share_2p,candidates
0,P,AZ,Biden,0.003,0.494,0.501523,BIDEN|TRUMP
1,P,FL,Trump,-0.034,0.479,0.48335,BIDEN|TRUMP
2,P,GA,Biden,0.003,0.495,0.501012,BIDEN|TRUMP
3,P,IA,Trump,-0.082,0.449,0.458163,BIDEN|TRUMP
4,P,ME2,Trump,-0.079,0.447,0.458932,BIDEN|TRUMP


In [62]:
DATA_DIR = "./597E-assignment1/data"

In [63]:
source_dfs = []
for fn in os.listdir(os.path.join(DATA_DIR, 'state-level')):
    if 'uva' in fn:
        continue
    print(fn)
    df = pd.read_csv(os.path.join(DATA_DIR, 'state-level', fn))
    source_dfs.append(df)
state_forecasts = pd.concat(source_dfs, axis=0)
state_forecasts.shape

pollyvote.csv
fivethirtyeight.csv
northwestern.battleground_only.csv
economist.csv
pec.csv


(18106, 10)

In [64]:
state_forecasts = state_forecasts[
    (state_forecasts["date"] == "2020-11-03") &
    (state_forecasts["office"] == 'P') &
    (state_forecasts['state'].isin(BATTLEGROUNDS.keys()))]
state_forecasts

Unnamed: 0,date,model,office,state,party,candidate,win_prob,est_diff,est_share,est_share_2p
2,2020-11-03,pollyvote,P,NH,D,Biden,0.87,,,
3,2020-11-03,pollyvote,P,NH,R,Trump,0.13,,,
4,2020-11-03,pollyvote,P,ME,D,Biden,0.95,,,
5,2020-11-03,pollyvote,P,ME,R,Trump,0.05,,,
6,2020-11-03,pollyvote,P,NV,D,Biden,0.82,,,
...,...,...,...,...,...,...,...,...,...,...
87,2020-11-03,pec,P,PA,R,Trump,0.11,-0.050,,
96,2020-11-03,pec,P,TX,D,Biden,0.21,-0.027,,
97,2020-11-03,pec,P,TX,R,Trump,0.79,0.027,,
106,2020-11-03,pec,P,WI,D,Biden,0.95,0.085,,


In [65]:
state_forecasts['model'].unique()

array(['pollyvote', 'fivethirtyeight-polls-plus', 'northwestern',
       'economist', 'pec'], dtype=object)

In [66]:
def get_credits(race):
    max_prob = race["win_prob"].max()
    at_max = race["win_prob"] == max_prob
    favorites = race[at_max]
    credit = (1 / len(favorites)) * favorites["correct"]
    return credit.sum()

def brier_score_race_statelevel(called_forecast):
    uniques = called_forecast[[
        "date", "office", "state", "model"
    ]].apply(lambda x: x.nunique())
    assert((uniques != 1).sum() == 0)
    errors = called_forecast["win_prob"] - called_forecast["correct"]
    errors_squared = (errors).pow(2).sum()
    # If you didn't put odds on the candidate, add 1
    if called_forecast["correct"].sum() == 0:
        errors_squared += 1
    return errors_squared / 2  # divide by 2 bc there are 2 forecasts: Biden and Trump

In [67]:
def forecast_scores_statelevel(forecasts):
    called = pd.merge(winners, forecasts, on=['office', 'state'], how='left').dropna(
        subset=['winner'])
    called['correct'] = called['winner'] == called['candidate']
    called = called[~called['win_prob'].isnull()]
    
    scores = pd.merge(
        called, forecasts[['date', 'model']].drop_duplicates(),
        how='inner', on=['date', 'model']
    ).groupby([
        'date', 'office', 'state', 'model']).apply(brier_score_race_statelevel)\
            .reset_index()\
            .rename(columns={0: 'brier_score'})
    scores.loc[((scores['office'] == 'P')), 'brier_evs'] = scores.apply(
        lambda x: x['brier_score'] * get_evs(x['state']), axis=1)
    credits = pd.DataFrame({
        "credit": called.groupby([
                "date", "model", "office", "state"
            ]).apply(get_credits)
    }).reset_index()
    scores = pd.DataFrame({
        "pres_brier_flat": scores.groupby("model")["brier_score"].mean(),
        "pres_brier_evs": scores.groupby("model")["brier_evs"].sum() / evs_called,
        "pres_correct": credits[credits["office"] == "P"].groupby("model")["credit"].sum(),
    })
    return scores
    

In [68]:
forecast_scores_statelevel(state_forecasts)

Unnamed: 0_level_0,pres_brier_flat,pres_brier_evs,pres_correct
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
economist,0.143917,0.195667,11.0
fivethirtyeight-polls-plus,0.133959,0.180526,11.0
northwestern,0.153107,0.204028,10.0
pec,0.148807,0.174368,12.0
pollyvote,0.121027,0.154699,11.0


In [55]:
forecasts = state_forecasts[state_forecasts['model'] == 'northwestern']
called = pd.merge(winners, forecasts, on=['office', 'state'], how='left').dropna(
        subset=['winner'])
called['correct'] = called['winner'] == called['candidate']
called[~called['win_prob'].isnull()]

Unnamed: 0,office,state,winner,dem_diff,dem_share,dem_share_2p,candidates,date,model,party,candidate,win_prob,est_diff,est_share,est_share_2p,correct
0,P,AZ,Biden,0.003,0.494,0.501523,BIDEN|TRUMP,2020-11-03,northwestern,D,Biden,0.769,,,,True
1,P,AZ,Biden,0.003,0.494,0.501523,BIDEN|TRUMP,2020-11-03,northwestern,R,Biden,0.231,,,,True
2,P,FL,Trump,-0.034,0.479,0.48335,BIDEN|TRUMP,2020-11-03,northwestern,D,Biden,0.769,,,,False
3,P,FL,Trump,-0.034,0.479,0.48335,BIDEN|TRUMP,2020-11-03,northwestern,R,Biden,0.231,,,,False
4,P,GA,Biden,0.003,0.495,0.501012,BIDEN|TRUMP,2020-11-03,northwestern,D,Biden,0.785,,,,True
5,P,GA,Biden,0.003,0.495,0.501012,BIDEN|TRUMP,2020-11-03,northwestern,R,Biden,0.215,,,,True
6,P,IA,Trump,-0.082,0.449,0.458163,BIDEN|TRUMP,2020-11-03,northwestern,R,Trump,0.651,,,,True
7,P,IA,Trump,-0.082,0.449,0.458163,BIDEN|TRUMP,2020-11-03,northwestern,D,Trump,0.349,,,,True
9,P,MI,Biden,0.028,0.506,0.514228,BIDEN|TRUMP,2020-11-03,northwestern,D,Biden,0.9,,,,True
10,P,MI,Biden,0.028,0.506,0.514228,BIDEN|TRUMP,2020-11-03,northwestern,R,Biden,0.1,,,,True


In [40]:
forecasts = state_forecasts[state_forecasts['model'] == 'pec']
forecasts.shape

(30, 10)

In [41]:
forecasts

Unnamed: 0,date,model,office,state,party,candidate,win_prob,est_diff,est_share,est_share_2p
6,2020-11-03,pec,P,AZ,D,BIDEN,0.8,0.03,,
7,2020-11-03,pec,P,AZ,R,TRUMP,0.2,-0.03,,
18,2020-11-03,pec,P,FL,D,BIDEN,0.81,0.03,,
19,2020-11-03,pec,P,FL,R,TRUMP,0.19,-0.03,,
20,2020-11-03,pec,P,GA,D,BIDEN,0.64,0.015,,
21,2020-11-03,pec,P,GA,R,TRUMP,0.36,-0.015,,
24,2020-11-03,pec,P,IA,D,BIDEN,0.38,-0.01,,
25,2020-11-03,pec,P,IA,R,TRUMP,0.62,0.01,,
42,2020-11-03,pec,P,ME,D,BIDEN,1.0,0.13,,
43,2020-11-03,pec,P,ME,R,TRUMP,0.0,-0.13,,
