# Predicting 2022 US Senate Midterm Elections
### with Cook Partisan Voting Index (PVI) and Google Trends

In [1]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt



<b>data</b>: the historical election and Google Trends (GT) data<br>
<b>midterm</b>: GT data of midterm elections as of 2022-11-07<br>
<b>midterm8</b>: GT data of midterm elections as of 2022-11-08<br>
<b>actual</b>: Midterm election winners as of 2022-11-15

In [2]:
# Read the four Excel workbooks, in this order, from the notebook's working directory.
data, midterm, midterm8, actual = (
    pd.read_excel(workbook)
    for workbook in (
        'gt_pvi_incumbent.xlsx',
        '11_7_2022_gt.xlsx',
        '11_8_2022_gt.xlsx',
        'winners2022-11-15.xlsx',
    )
)

Remove elections that do not have two candidates, then reshape the data so that each election's candidates and outcome sit on a single row.

In [3]:
# Drop the AL2014 and SD2010 races from the historical data.
data = data.loc[~data['id'].isin(['AL2014', 'SD2010'])]

In [4]:
# Change senators party to the party they caucus with.
# The mapping is applied to the `party` column only: a frame-wide replace would
# also rewrite any other cell that happens to equal one of these strings
# (for example a candidate recorded as 'OTHER').
parties = ['DEMOCRAT', 'REPUBLICAN']
data = data.assign(party=data['party'].replace({
    'LIBERTARIAN': 'REPUBLICAN',
    'OTHER': 'DEMOCRAT',
}))

# Per-candidate overrides of the recorded party.
data.loc[data['candidate'] == 'REBEKAH KENNEDY', 'party'] = 'REPUBLICAN'
data.loc[data['candidate'] == 'CYNTHIA M. LUMMIS', 'party'] = 'REPUBLICAN'

# FOR MODEL PURPOSES ONLY
# The labels below are set by hand so that the later split into a REPUBLICAN
# frame and a DEMOCRAT frame puts these candidates on the side assigned here.
# NOTE(review): presumably these are races where both candidates would
# otherwise carry the same label — confirm against the source data.
data.loc[data['candidate'] == 'LISA MURKOWSKI', 'party'] = 'DEMOCRAT'
data.loc[data['candidate'] == 'RICKY DALE HARRINGTON JR.', 'party'] = 'DEMOCRAT'
data.loc[data['candidate'] == 'KEVIN DE LEON', 'party'] = 'REPUBLICAN'
data.loc[data['candidate'] == 'LORETTA SANCHEZ', 'party'] = 'REPUBLICAN'
data.loc[data['candidate'] == 'JOSEPH I. LIEBERMAN', 'party'] = 'REPUBLICAN'

# Apply the same Murkowski override to both 2022 Google Trends snapshots.
midterm.loc[midterm['candidate'] == 'LISA MURKOWSKI', 'party'] = 'DEMOCRAT'
midterm8.loc[midterm8['candidate'] == 'LISA MURKOWSKI', 'party'] = 'DEMOCRAT'

In [5]:
# Keep only the rows whose year is later than 2006.
data = data[data['year'].gt(2006)]

In [6]:
# One frame per party label.
reps = data[data['party'].eq('REPUBLICAN')]
dems = data[data['party'].eq('DEMOCRAT')]

In [7]:
# Order both party frames by election id.
reps, dems = (frame.sort_values('id') for frame in (reps, dems))

In [8]:
# Show the column names of the Republican frame before any renaming.
reps.columns

Index(['state', 'year', 'id', 'candidate', 'incumbent', 'party', 'percent',
       'winner', 'gt_7', 'gt_15', 'gt_30', 'favor_7', 'favor_15', 'favor_30',
       'search_term', 'pvi'],
      dtype='object')

In [9]:
# Give the Republican rows' candidate-specific columns an `r_` prefix.
reps = reps.rename(columns={
    col: f'r_{col}'
    for col in (
        'candidate', 'incumbent', 'winner',
        'gt_7', 'gt_15', 'gt_30',
        'favor_7', 'favor_15', 'favor_30',
    )
})

In [10]:
# Discard the columns that are not carried over from the Democratic rows,
# then give the remaining candidate-specific columns a `d_` prefix.
dems = dems.drop(columns=[
    'percent', 'party',
    'favor_7', 'favor_15', 'favor_30',
    'search_term', 'pvi',
]).rename(columns={
    col: f'd_{col}'
    for col in ('candidate', 'incumbent', 'winner', 'gt_7', 'gt_15', 'gt_30')
})

In [11]:
# Join the two party frames so each election occupies one row.
data2 = reps.merge(dems, on=['state', 'id', 'year'])

In [12]:
# Target label: REPUBLICAN where r_winner equals True, DEMOCRAT everywhere else.
data2['outcome'] = data2['r_winner'].eq(True).map({True: 'REPUBLICAN', False: 'DEMOCRAT'})

In [13]:
# One frame per party label for the 11-07 snapshot.
reps_m = midterm[midterm['party'].eq('REPUBLICAN')]
dems_m = midterm[midterm['party'].eq('DEMOCRAT')]

In [14]:
# Give the Republican rows' candidate-specific columns an `r_` prefix.
reps_m = reps_m.rename(columns={
    col: f'r_{col}'
    for col in (
        'candidate', 'incumbent', 'winner',
        'gt_7', 'gt_15', 'gt_30',
        'favor_7', 'favor_15', 'favor_30',
    )
})

In [15]:
# Discard the columns that are not carried over from the Democratic rows,
# then give the remaining candidate-specific columns a `d_` prefix.
dems_m = dems_m.drop(columns=[
    'party',
    'favor_7', 'favor_15', 'favor_30',
    'search_term', 'pvi',
]).rename(columns={
    col: f'd_{col}'
    for col in ('candidate', 'incumbent', 'winner', 'gt_7', 'gt_15', 'gt_30')
})

In [16]:
# Order both 11-07 party frames by election id.
reps_m, dems_m = (frame.sort_values('id') for frame in (reps_m, dems_m))

In [17]:
# Join the two 11-07 party frames so each 2022 race occupies one row.
midterm2 = reps_m.merge(dems_m, on=['state', 'id', 'year'])

In [18]:
# One frame per party label for the 11-08 snapshot.
reps_m8 = midterm8[midterm8['party'].eq('REPUBLICAN')]
dems_m8 = midterm8[midterm8['party'].eq('DEMOCRAT')]

In [19]:
# Give the Republican rows' candidate-specific columns an `r_` prefix.
reps_m8 = reps_m8.rename(columns={
    col: f'r_{col}'
    for col in (
        'candidate', 'incumbent', 'winner',
        'gt_7', 'gt_15', 'gt_30',
        'favor_7', 'favor_15', 'favor_30',
    )
})

In [20]:
# Discard the columns that are not carried over from the Democratic rows,
# then give the remaining candidate-specific columns a `d_` prefix.
dems_m8 = dems_m8.drop(columns=[
    'party',
    'favor_7', 'favor_15', 'favor_30',
    'search_term', 'pvi',
]).rename(columns={
    col: f'd_{col}'
    for col in ('candidate', 'incumbent', 'winner', 'gt_7', 'gt_15', 'gt_30')
})

In [21]:
# Order both 11-08 party frames by election id.
reps_m8, dems_m8 = (frame.sort_values('id') for frame in (reps_m8, dems_m8))

In [22]:
# Join the two 11-08 party frames; note that this rebinds `midterm8` from the
# raw long-format snapshot to the one-row-per-race table.
midterm8 = reps_m8.merge(dems_m8, on=['state', 'id', 'year'])

## Features
<b>r_gt_7, r_gt_15, r_gt_30:</b> Republican candidate's average GT score 7, 15, 30 days before the election 0-100.<br>
<b>d_gt_7, d_gt_15, d_gt_30:</b> Democrat candidate's average GT score 7, 15, 30 days before the election 0-100.<br>
<b>r_favor_7, r_favor_15, r_favor_30:</b> Republican candidate's GT score minus Democrat candidate's score<br>
<b>d_incumbent, r_incumbent:</b> True or False if there is a democrat or republican incumbent candidate<br>
<b>pvi:</b> Cook Partisan Voting Index<br>
<b>search_term</b>: Google Trends allows comparison between topics and raw search terms. Some candidates don't have a topic, so a raw search term was used for them instead (see https://support.google.com/trends/answer/4359550?hl=en).

## Trying Different Features to see if they differ (they don't)

#### Search Term: True Included (Data & Midterm) as feature

In [23]:
# All features, with the search_term flag included as a predictor.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'pvi', 'd_incumbent', 'r_incumbent', 'search_term',
]

X_train, y_train = data2[features], data2[target]
X_predict = midterm2[features]

# Fit on the historical races, then label each race in the 11-07 snapshot.
rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

prediction = midterm2.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
prediction['winner'] = prediction['r_candidate'].where(
    prediction['outcome'] == 'REPUBLICAN', prediction['d_candidate'])

In [24]:
# Predicted party and winner for each race.
prediction.loc[:, ['state', 'id', 'outcome', 'winner']]

Unnamed: 0,state,id,outcome,winner
0,ALASKA,AK2022,DEMOCRAT,LISA MURKOWSKI
1,ALABAMA,AL2022,REPUBLICAN,KATIE BRITT
2,ARKANSAS,AR2022,REPUBLICAN,JOHN BOOZMAN
3,ARIZONA,AZ2022,DEMOCRAT,MARK KELLY
4,CALIFORNIA,CA2022,DEMOCRAT,ALEX PADILLA
5,COLORADO,CO2022,DEMOCRAT,MICHAEL BENNET
6,CONNECTICUT,CT2022,DEMOCRAT,RICHARD BLUMENTHAL
7,FLORIDA,FL2022,REPUBLICAN,MARCO RUBIO
8,GEORGIA,GA2022,DEMOCRAT,RAPHAEL WARNOCK
9,HAWAII,HI2022,DEMOCRAT,BRIAN SCHATZ


#### Search Term: True Included (Not as a Feature)

In [25]:
# Same rows as the previous model, but search_term is left out of the predictors.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'pvi', 'd_incumbent', 'r_incumbent',
]

X_train, y_train = data2[features], data2[target]
X_predict = midterm2[features]

# Fit on the historical races, then label each race in the 11-07 snapshot.
rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

prediction2 = midterm2.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
prediction2['winner'] = prediction2['r_candidate'].where(
    prediction2['outcome'] == 'REPUBLICAN', prediction2['d_candidate'])

In [26]:
# Predicted party and winner for each race.
prediction2.loc[:, ['state', 'id', 'outcome', 'winner']]

Unnamed: 0,state,id,outcome,winner
0,ALASKA,AK2022,DEMOCRAT,LISA MURKOWSKI
1,ALABAMA,AL2022,REPUBLICAN,KATIE BRITT
2,ARKANSAS,AR2022,REPUBLICAN,JOHN BOOZMAN
3,ARIZONA,AZ2022,DEMOCRAT,MARK KELLY
4,CALIFORNIA,CA2022,DEMOCRAT,ALEX PADILLA
5,COLORADO,CO2022,DEMOCRAT,MICHAEL BENNET
6,CONNECTICUT,CT2022,DEMOCRAT,RICHARD BLUMENTHAL
7,FLORIDA,FL2022,REPUBLICAN,MARCO RUBIO
8,GEORGIA,GA2022,DEMOCRAT,RAPHAEL WARNOCK
9,HAWAII,HI2022,DEMOCRAT,BRIAN SCHATZ


In [27]:
# Row-by-row check that both feature sets predict the same party.
prediction['outcome'].eq(prediction2['outcome'])

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
20    True
21    True
22    True
23    True
24    True
25    True
26    True
27    True
28    True
29    True
30    True
31    True
32    True
Name: outcome, dtype: bool

### Search Term: True Not Included

In [28]:
# Keep only the rows whose search_term flag is False.
data_ns = data2[data2['search_term'].eq(False)]
midterm_ns = midterm2[midterm2['search_term'].eq(False)]

In [29]:
# Train and predict only on rows where search_term is False.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'pvi', 'd_incumbent', 'r_incumbent',
]

X_train, y_train = data_ns[features], data_ns[target]
X_predict = midterm_ns[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

prediction3 = midterm_ns.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
prediction3['winner'] = prediction3['r_candidate'].where(
    prediction3['outcome'] == 'REPUBLICAN', prediction3['d_candidate'])

In [30]:
# Predicted party and winner for each remaining race.
prediction3.loc[:, ['state', 'id', 'outcome', 'winner']]

Unnamed: 0,state,id,outcome,winner
0,ALASKA,AK2022,DEMOCRAT,LISA MURKOWSKI
3,ARIZONA,AZ2022,DEMOCRAT,MARK KELLY
4,CALIFORNIA,CA2022,DEMOCRAT,ALEX PADILLA
7,FLORIDA,FL2022,REPUBLICAN,MARCO RUBIO
8,GEORGIA,GA2022,DEMOCRAT,RAPHAEL WARNOCK
9,HAWAII,HI2022,DEMOCRAT,BRIAN SCHATZ
10,IOWA,IA2022,REPUBLICAN,CHUCK GRASSLEY
13,INDIANA,IN2022,REPUBLICAN,TODD YOUNG
14,KANSAS,KS2022,REPUBLICAN,JERRY MORAN
15,KENTUCKY,KY2022,REPUBLICAN,RAND PAUL


In [31]:
# Class labels of the fitted forest; predict_proba columns follow this order.
rfc_sti.classes_

array(['DEMOCRAT', 'REPUBLICAN'], dtype=object)

## Predicting the Election using "midterm8"
No change from "midterm"

In [32]:
# All features, predicting from the 11-08 snapshot.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'pvi', 'd_incumbent', 'r_incumbent', 'search_term',
]

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

prediction4 = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
prediction4['winner'] = prediction4['r_candidate'].where(
    prediction4['outcome'] == 'REPUBLICAN', prediction4['d_candidate'])

In [33]:
# Predicted party and winner for each race.
prediction4.loc[:, ['state', 'id', 'outcome', 'winner']]

Unnamed: 0,state,id,outcome,winner
0,ALASKA,AK2022,DEMOCRAT,LISA MURKOWSKI
1,ALABAMA,AL2022,REPUBLICAN,KATIE BRITT
2,ARKANSAS,AR2022,REPUBLICAN,JOHN BOOZMAN
3,ARIZONA,AZ2022,DEMOCRAT,MARK KELLY
4,CALIFORNIA,CA2022,DEMOCRAT,ALEX PADILLA
5,COLORADO,CO2022,DEMOCRAT,MICHAEL BENNET
6,CONNECTICUT,CT2022,DEMOCRAT,RICHARD BLUMENTHAL
7,FLORIDA,FL2022,REPUBLICAN,MARCO RUBIO
8,GEORGIA,GA2022,DEMOCRAT,RAPHAEL WARNOCK
9,HAWAII,HI2022,DEMOCRAT,BRIAN SCHATZ


# Accuracy of the Prediction
As of 2022-11-15 the Alaska election had not been fully counted. Alaska is also problematic because, for simplicity of the model, its candidates are not accurately classified by party. The Georgia election had also not been called, but Raphael Warnock received a higher percentage of the vote, which is what the model is testing.

In [34]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
# The comparison is element-wise, so both tables must carry identical labels.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = prediction4[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.969697
False    0.030303
Name: winner, dtype: float64

In [35]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('pvi', 0.25881),
 ('d_incumbent', 0.19129),
 ('r_incumbent', 0.0972),
 ('r_favor_30', 0.071),
 ('r_gt_7', 0.06301),
 ('r_favor_7', 0.05136),
 ('d_gt_7', 0.04766),
 ('r_gt_30', 0.04606),
 ('d_gt_15', 0.04409),
 ('r_favor_15', 0.04303),
 ('d_gt_30', 0.04301),
 ('r_gt_15', 0.03798),
 ('search_term', 0.00552)]

## Comparison of PVI, Incumbent, and Google Trends as Features in Prediction
It appears that the GT data had no effect on the accuracy of the prediction, as the model makes the same prediction without that data.

#### Full

In [36]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
full = prediction4.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(prediction4['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

#### PVI Only

In [37]:
# PVI as the only predictor.
target = 'outcome'
features = ['pvi']

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

pvi = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
pvi['winner'] = pvi['r_candidate'].where(
    pvi['outcome'] == 'REPUBLICAN', pvi['d_candidate'])

In [38]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = pvi[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.909091
False    0.090909
Name: winner, dtype: float64

In [39]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('pvi', 1.0)]

In [40]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
pvi = pvi.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(pvi['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

#### PVI & Incumbent

In [41]:
# PVI plus the two incumbency flags.
target = 'outcome'
features = ['pvi', 'd_incumbent', 'r_incumbent']

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

pvi_inc = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
pvi_inc['winner'] = pvi_inc['r_candidate'].where(
    pvi_inc['outcome'] == 'REPUBLICAN', pvi_inc['d_candidate'])

In [42]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = pvi_inc[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.969697
False    0.030303
Name: winner, dtype: float64

In [43]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('pvi', 0.59247), ('d_incumbent', 0.24604), ('r_incumbent', 0.16149)]

In [44]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
pvi_inc = pvi_inc.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(pvi_inc['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

#### Google Trends

In [45]:
# Google Trends columns (plus the search_term flag) only; no PVI or incumbency.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'search_term',
]

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

trends = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
trends['winner'] = trends['r_candidate'].where(
    trends['outcome'] == 'REPUBLICAN', trends['d_candidate'])

In [46]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = trends[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

False    0.545455
True     0.454545
Name: winner, dtype: float64

In [47]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('r_favor_7', 0.13098),
 ('r_gt_7', 0.12869),
 ('r_favor_30', 0.12543),
 ('r_favor_15', 0.10985),
 ('r_gt_30', 0.10745),
 ('r_gt_15', 0.1003),
 ('d_gt_7', 0.09548),
 ('d_gt_15', 0.09419),
 ('d_gt_30', 0.09038),
 ('search_term', 0.01725)]

In [48]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
trends = trends.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(trends['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

#### Google Trends + Incumbent

In [49]:
# Google Trends columns plus the incumbency flags; PVI is left out.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'd_incumbent', 'r_incumbent', 'search_term',
]

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

gt_inc = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
gt_inc['winner'] = gt_inc['r_candidate'].where(
    gt_inc['outcome'] == 'REPUBLICAN', gt_inc['d_candidate'])

In [50]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = gt_inc[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.939394
False    0.060606
Name: winner, dtype: float64

In [51]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('d_incumbent', 0.22173),
 ('r_incumbent', 0.11263),
 ('r_favor_30', 0.08247),
 ('r_favor_7', 0.07918),
 ('r_gt_7', 0.07853),
 ('r_favor_15', 0.0755),
 ('d_gt_30', 0.07062),
 ('d_gt_15', 0.07052),
 ('r_gt_15', 0.06954),
 ('r_gt_30', 0.06904),
 ('d_gt_7', 0.06263),
 ('search_term', 0.00761)]

In [52]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
gt_inc = gt_inc.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(gt_inc['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

#### Google Trends + PVI

In [53]:
# Google Trends columns plus PVI; the incumbency flags are left out.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'pvi', 'search_term',
]

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

gt_pvi = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
gt_pvi['winner'] = gt_pvi['r_candidate'].where(
    gt_pvi['outcome'] == 'REPUBLICAN', gt_pvi['d_candidate'])

In [54]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = gt_pvi[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.878788
False    0.121212
Name: winner, dtype: float64

In [55]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('pvi', 0.42499),
 ('r_gt_7', 0.08138),
 ('r_favor_7', 0.07323),
 ('r_favor_30', 0.07096),
 ('r_gt_30', 0.06692),
 ('d_gt_30', 0.06345),
 ('r_favor_15', 0.05962),
 ('d_gt_7', 0.0516),
 ('r_gt_15', 0.05059),
 ('d_gt_15', 0.04794),
 ('search_term', 0.00932)]

In [56]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
gt_pvi = gt_pvi.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(gt_pvi['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

#### Incumbent Only

In [57]:
# The two incumbency flags as the only predictors.
target = 'outcome'
features = ['d_incumbent', 'r_incumbent']

X_train, y_train = data2[features], data2[target]
X_predict = midterm8[features]

rfc_sti = RandomForestClassifier(
    n_estimators=100, max_depth=12, min_samples_split=2, random_state=2,
).fit(X_train, y_train)

inc = midterm8.assign(outcome=rfc_sti.predict(X_predict))
# The predicted party picks which candidate column supplies the winner's name.
inc['winner'] = inc['r_candidate'].where(
    inc['outcome'] == 'REPUBLICAN', inc['d_candidate'])

In [58]:
# Share of races whose predicted winner equals the winner recorded in `actual`.
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = inc[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.909091
False    0.090909
Name: winner, dtype: float64

In [59]:
# Pair each feature with its rounded importance and rank from most to least important.
importance = sorted(
    ((name, round(score, 5)) for name, score in zip(features, rfc_sti.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)
importance

[('d_incumbent', 0.62097), ('r_incumbent', 0.37903)]

In [60]:
# Share of races whose predicted winner equals the winner recorded in `actual`
# (same computation as the earlier incumbent-only accuracy cell).
report_cols = ['state', 'id', 'outcome', 'winner']
agreement = inc[report_cols] == actual
agreement['winner'].value_counts(normalize=True)

True     0.909091
False    0.090909
Name: winner, dtype: float64

In [61]:
# `probability` is the larger of the class probabilities (the predicted class);
# `correct` flags rows whose predicted winner equals the recorded winner.
inc = inc.assign(
    probability=rfc_sti.predict_proba(X_predict).max(axis=1),
    correct=(inc['winner'] == actual['winner']).to_numpy(),
).set_index('state')[['winner', 'probability', 'correct']]

### Probabilities of Full Model vs. PVI & Incumbent Only

In [62]:
# Each model table contributes three columns (winner, probability, correct);
# the recorded winners add one final column under 'actual'.
model_names = ['full', 'pvi', 'pvi_inc', 'trends', 'gt_inc', 'gt_pvi', 'inc']
column_list = [name for name in model_names for _ in range(3)] + ['actual']

recorded_winner = actual.set_index('state')['winner']
proba = pd.concat(
    [full, pvi, pvi_inc, trends, gt_inc, gt_pvi, inc, recorded_winner],
    axis=1,
)
# Top level: model name; second level: the original column name.
proba.columns = pd.MultiIndex.from_arrays([column_list, proba.columns])
proba[['full', 'pvi_inc', 'actual']]

Unnamed: 0_level_0,full,full,full,pvi_inc,pvi_inc,pvi_inc,actual
Unnamed: 0_level_1,winner,probability,correct,winner,probability,correct,winner
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ALASKA,LISA MURKOWSKI,0.56,False,LISA MURKOWSKI,1.0,False,KELLY TSHIBAKA
ALABAMA,KATIE BRITT,0.82,True,KATIE BRITT,1.0,True,KATIE BRITT
ARKANSAS,JOHN BOOZMAN,0.87,True,JOHN BOOZMAN,1.0,True,JOHN BOOZMAN
ARIZONA,MARK KELLY,0.71,True,MARK KELLY,0.522214,True,MARK KELLY
CALIFORNIA,ALEX PADILLA,0.84,True,ALEX PADILLA,1.0,True,ALEX PADILLA
COLORADO,MICHAEL BENNET,0.8,True,MICHAEL BENNET,1.0,True,MICHAEL BENNET
CONNECTICUT,RICHARD BLUMENTHAL,0.8,True,RICHARD BLUMENTHAL,1.0,True,RICHARD BLUMENTHAL
FLORIDA,MARCO RUBIO,0.81,True,MARCO RUBIO,0.671064,True,MARCO RUBIO
GEORGIA,RAPHAEL WARNOCK,0.59,True,RAPHAEL WARNOCK,0.577167,True,RAPHAEL WARNOCK
HAWAII,BRIAN SCHATZ,0.89,True,BRIAN SCHATZ,1.0,True,BRIAN SCHATZ


In [63]:
# States to show side by side.
comp = [
    'ALASKA', 'COLORADO', 'GEORGIA',
    'NEVADA', 'NEW HAMPSHIRE', 'NORTH CAROLINA',
    'OHIO', 'PENNSYLVANIA', 'WISCONSIN',
]
# To export this table, chain .to_excel('cross-tab.xlsx', index=True) onto the selection below.
selected_models = ['full', 'pvi_inc', 'actual']
selected_fields = ['winner', 'probability']
proba.loc[proba.index.isin(comp), (selected_models, selected_fields)]

Unnamed: 0_level_0,full,full,pvi_inc,pvi_inc,actual
Unnamed: 0_level_1,winner,probability,winner,probability,winner
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ALASKA,LISA MURKOWSKI,0.56,LISA MURKOWSKI,1.0,KELLY TSHIBAKA
COLORADO,MICHAEL BENNET,0.8,MICHAEL BENNET,1.0,MICHAEL BENNET
GEORGIA,RAPHAEL WARNOCK,0.59,RAPHAEL WARNOCK,0.577167,RAPHAEL WARNOCK
NORTH CAROLINA,TED BUDD,0.61,TED BUDD,0.7865,TED BUDD
NEW HAMPSHIRE,MAGGIE HASSAN,0.65,MAGGIE HASSAN,0.900461,MAGGIE HASSAN
NEVADA,CATHERINE CORTEZ MASTO,0.82,CATHERINE CORTEZ MASTO,0.9825,CATHERINE CORTEZ MASTO
OHIO,JD VANCE,0.7,JD VANCE,0.93,JD VANCE
PENNSYLVANIA,JOHN FETTERMAN,0.61,JOHN FETTERMAN,0.586619,JOHN FETTERMAN
WISCONSIN,RON JOHNSON,0.57,RON JOHNSON,0.862098,RON JOHNSON


## Testing the Different Models on Historical Data
When the models were repeatedly tested on historical data, including the GT data showed a small but significant improvement over the model using only the PVI and incumbent features.

In [64]:
# PVI-only model scored on 100 random hold-out splits of the historical data.
features = ['pvi',]
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest, so every repetition
    # is reproducible and uses a different hold-out set.
    # NOTE(review): test_size=0.137 presumably sizes the hold-out set to about
    # one election cycle — confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # The mean of the boolean match vector is the hit rate. Unlike
    # value_counts(normalize=True)[1], it does not depend on how the integer
    # key 1 is resolved against a boolean index, and it still works when a
    # split has no correct (or no incorrect) predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [65]:
# Average hit rate over all splits.
print(f'PVI Only Accuracy: {np.mean(accuracy)}')

PVI Only Accuracy: 0.8055882352941177


#### PVI & Incumbent

In [66]:

# PVI + incumbency model scored on 100 random hold-out splits of the historical data.
features = ['pvi', 'd_incumbent', 'r_incumbent',]
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # Mean of the boolean match vector = hit rate; avoids the fragile
    # value_counts(normalize=True)[1] lookup on a boolean index, which also
    # fails when a split has no correct predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [67]:
# Keep the per-split scores for the later significance test, then report their average.
pvi_inc_score = np.asarray(accuracy)
print(f'PVI & Incumbent Accuracy: {np.mean(pvi_inc_score)}')

PVI & Incumbent Accuracy: 0.83


#### Google Trends Only

In [68]:

# Google Trends-only model scored on 100 random hold-out splits of the historical data.
features = ['r_gt_7', 'r_gt_15', "r_gt_30", 'r_favor_7', 'r_favor_15', 'r_favor_30',
            'd_gt_7', 'd_gt_15', 'd_gt_30', 'search_term']
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # Mean of the boolean match vector = hit rate; avoids the fragile
    # value_counts(normalize=True)[1] lookup on a boolean index, which also
    # fails when a split has no correct predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [69]:
# Average hit rate over all splits.
print(f'Google Trends Only Accuracy: {np.mean(accuracy)}')

Google Trends Only Accuracy: 0.645


#### Google Trends + Incumbent

In [70]:
# Google Trends + incumbency model scored on 100 random hold-out splits of the historical data.
features = ['r_gt_7', 'r_gt_15', "r_gt_30", 'r_favor_7', 'r_favor_15', 'r_favor_30',
            'd_gt_7', 'd_gt_15', 'd_gt_30', 'd_incumbent', 'r_incumbent', 'search_term']
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # Mean of the boolean match vector = hit rate; avoids the fragile
    # value_counts(normalize=True)[1] lookup on a boolean index, which also
    # fails when a split has no correct predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [71]:
# Average hit rate over all splits.
print(f'Google Trends + Incumbent Accuracy: {np.mean(accuracy)}')

Google Trends + Incumbent Accuracy: 0.7932352941176471


#### Google Trends + PVI

In [72]:
# Google Trends + PVI model scored on 100 random hold-out splits of the historical data.
features = ['r_gt_7', 'r_gt_15', "r_gt_30", 'r_favor_7', 'r_favor_15', 'r_favor_30',
            'd_gt_7', 'd_gt_15', 'd_gt_30', 'pvi', 'search_term']
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # Mean of the boolean match vector = hit rate; avoids the fragile
    # value_counts(normalize=True)[1] lookup on a boolean index, which also
    # fails when a split has no correct predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [73]:
# Average hit rate over all splits.
print(f'Google Trends + PVI Accuracy: {np.mean(accuracy)}')

Google Trends + PVI Accuracy: 0.8226470588235295


#### Incumbent Only

In [74]:
# Incumbency-only model scored on 100 random hold-out splits of the historical data.
features = ['d_incumbent', 'r_incumbent',]
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # Mean of the boolean match vector = hit rate; avoids the fragile
    # value_counts(normalize=True)[1] lookup on a boolean index, which also
    # fails when a split has no correct predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [75]:
# Average hit rate over all splits.
print(f'Incumbent Only Accuracy: {np.mean(accuracy)}')

Incumbent Only Accuracy: 0.8220588235294117


#### Full Model

In [76]:
# Full-feature model scored on 100 random hold-out splits of the historical data.
features = ['r_gt_7', 'r_gt_15', "r_gt_30", 'r_favor_7', 'r_favor_15', 'r_favor_30',
            'd_gt_7', 'd_gt_15', 'd_gt_30', 'pvi', 'd_incumbent', 'r_incumbent', 'search_term']
target = 'outcome'

X = data2[features]
y = data2[target]

accuracy = []
for i in range(100):
    # The loop index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.137, random_state=i)
    rfc = RandomForestClassifier(n_estimators=100, random_state=i, min_samples_split=2, max_depth=12)
    rfc.fit(X_train, y_train)

    hist1 = X_test.copy()
    hist1['outcome'] = rfc.predict(X_test)
    # Mean of the boolean match vector = hit rate; avoids the fragile
    # value_counts(normalize=True)[1] lookup on a boolean index, which also
    # fails when a split has no correct predictions.
    accuracy.append((hist1['outcome'] == y_test).mean())

In [77]:
# Keep the per-split scores for the later significance test, then report their average.
full_model_score = np.asarray(accuracy)
print(f'Full Feature Accuracy: {np.mean(full_model_score)}')

Full Feature Accuracy: 0.8702941176470587


In [78]:
# Both score arrays were produced with the same 100 seeded splits
# (random_state=i in each loop), so score i of one model is matched with
# score i of the other. A paired t-test uses that pairing; ttest_ind would
# treat the two arrays as unrelated samples.
# NOTE(review): repeated random splits reuse the same rows, so the scores are
# not independent of each other — treat the p-value as indicative only.
results = stats.ttest_rel(pvi_inc_score, full_model_score)
print(f'p-value: {results.pvalue:.5f}')

p-value: 0.00000


## Which Elections Did the Models Get Wrong?

#### PVI & Incumbent

In [79]:
# Collect the id of every held-out election the PVI + incumbency model
# misclassifies, across the same 100 seeded splits.
target = 'outcome'
features = ['pvi', 'd_incumbent', 'r_incumbent']

by_id = data2.set_index('id')
X, y = by_id[features], by_id[target]

elections = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=i, test_size=0.137)
    rfc = RandomForestClassifier(
        n_estimators=100, max_depth=12, min_samples_split=2, random_state=i)
    rfc.fit(X_train, y_train)

    hist1 = X_test.assign(outcome=rfc.predict(X_test))
    # An id is recorded once for every split in which it is misclassified.
    missed = hist1['outcome'] != y_test
    elections.extend(missed[missed].index.to_list())

In [80]:
# Election id -> number of splits in which it was misclassified, keyed in sorted id order.
model_p = dict(sorted(Counter(elections).items()))

#### Full Model

In [81]:
# Collect the id of every held-out election the full-feature model
# misclassifies, across the same 100 seeded splits.
target = 'outcome'
features = [
    'r_gt_7', 'r_gt_15', 'r_gt_30',
    'r_favor_7', 'r_favor_15', 'r_favor_30',
    'd_gt_7', 'd_gt_15', 'd_gt_30',
    'pvi', 'd_incumbent', 'r_incumbent', 'search_term',
]

by_id = data2.set_index('id')
X, y = by_id[features], by_id[target]

elections = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=i, test_size=0.137)
    rfc = RandomForestClassifier(
        n_estimators=100, max_depth=12, min_samples_split=2, random_state=i)
    rfc.fit(X_train, y_train)

    hist1 = X_test.assign(outcome=rfc.predict(X_test))
    # An id is recorded once for every split in which it is misclassified.
    missed = hist1['outcome'] != y_test
    elections.extend(missed[missed].index.to_list())

In [82]:
# Election id -> number of splits in which it was misclassified, keyed in sorted id order.
model_f = dict(sorted(Counter(elections).items()))

Create a list of election `id`s where the Full Model (Google Trends, PVI, and incumbent features) performed better, `model_f_better`,<br>
and a list where the PVI & Incumbent-only model performed better, `model_p_better`, to analyze the difference in the mean and variance of their features.

In [83]:
# Ids missed by both models where the full model has the lower miss count.
model_f_better = [
    ID for ID in model_f
    if ID in model_p and model_f[ID] < model_p[ID]
]
# Ids that only the full model ever missed, with their miss counts.
only_wrong_in_model_f = defaultdict(
    None,
    ((ID, misses) for ID, misses in model_f.items() if ID not in model_p),
)
print(model_f_better)
print(only_wrong_in_model_f)

['AK2016', 'AR2014', 'AZ2020', 'FL2010', 'GA2021a', 'GA2021b', 'IA2014', 'IL2016', 'IN2018', 'LA2008', 'MN2008', 'MO2010', 'MO2018', 'NC2014', 'NV2012', 'NV2018', 'OH2010', 'PA2016']
defaultdict(None, {'CO2020': 4, 'MI2020': 7, 'MO2012': 18, 'MS2008b': 1, 'NC2020': 11, 'NM2012': 1, 'OH2012': 1, 'TX2020': 1, 'VA2012': 14})


In [84]:
# Ids missed by both models where the PVI + incumbency model has the lower miss count.
model_p_better = [
    ID for ID in model_p
    if ID in model_f and model_p[ID] < model_f[ID]
]
# Ids that only the PVI + incumbency model ever missed, with their miss counts.
only_wrong_in_model_p = defaultdict(
    None,
    ((ID, misses) for ID, misses in model_p.items() if ID not in model_f),
)
print(model_p_better)
print(only_wrong_in_model_p)

['AK2014', 'AR2010', 'AZ2018', 'IN2012', 'LA2014', 'ME2014', 'MT2008', 'MT2012', 'MT2018', 'NC2016', 'ND2018', 'NH2008', 'NH2010', 'VA2008', 'WV2010', 'WV2012']
defaultdict(None, {'AK2010': 2, 'AR2008': 9, 'FL2012': 11, 'FL2016': 1, 'IA2010': 17, 'IA2016': 2, 'IN2010': 1, 'MA2012': 1, 'ME2008': 2, 'MN2012': 4, 'MN2014': 2, 'MN2020': 4, 'NC2010': 14, 'NM2008': 7, 'NV2016': 4, 'OH2016': 3, 'OH2018': 11, 'SD2008': 12, 'WI2012': 19, 'WI2016': 3, 'WV2008': 1, 'WV2018': 13})


In [85]:
# An election missed by only one model counts in favour of the other model.
# Both lists are extended in place, so re-running this cell appends the ids again.
model_p_better.extend(only_wrong_in_model_f)
model_f_better.extend(only_wrong_in_model_p)

In [86]:
# Using the Absolute Value of the GT favor
data3 = data2.copy()
data3['r_favor_7'] = data3['r_favor_7'].abs()
data3['r_favor_15'] = data3['r_favor_15'].abs()
data3['r_favor_30'] = data3['r_favor_30'].abs()
data3['pvi'] = data3['pvi'].abs()

midterm8_abs = midterm8.copy()
midterm8_abs['r_favor_7'] = midterm8_abs['r_favor_7'].abs()
midterm8_abs['r_favor_15'] = midterm8_abs['r_favor_15'].abs()
midterm8_abs['r_favor_30'] = midterm8_abs['r_favor_30'].abs()
midterm8_abs['pvi'] = midterm8_abs['pvi'].abs()

#### Mean

In [87]:
# Column means for each group of elections, one group per output column.
# numeric_only=True restricts the reduction to numeric and boolean columns;
# without it, recent pandas raises a TypeError on the text columns
# (candidate names, ids, ...) instead of skipping them.
# NOTE(review): a True/False column stored with object dtype would be skipped
# by numeric_only=True — confirm the flag columns are bool.
pd.concat([
    data3.mean(numeric_only=True).rename('historical'),
    data3.loc[data3['id'].isin(model_f_better)].mean(numeric_only=True).rename('model_f_better'),
    data3.loc[data3['id'].isin(model_p_better)].mean(numeric_only=True).rename('model_p_better'),
    data3.loc[data3['id'].isin(list(only_wrong_in_model_f))].mean(numeric_only=True).rename('only_wrong_model_f'),
    data3.loc[data3['id'].isin(list(only_wrong_in_model_p))].mean(numeric_only=True).rename('only_wrong_model_p'),
    midterm8_abs.mean(numeric_only=True).rename('midterm8'),
    # errors='ignore': a listed row may be absent if its column was not reduced.
], axis=1).drop(index=['percent', 'r_winner', 'd_winner'], errors='ignore')

Unnamed: 0,historical,model_f_better,model_p_better,only_wrong_model_f,only_wrong_model_p,midterm8
year,2013.991701,2013.55,2013.52,2015.111111,2012.636364,2022.0
r_incumbent,0.356846,0.375,0.24,0.333333,0.363636,0.393939
r_gt_7,28.219917,30.15,32.36,39.0,23.5,17.333333
r_gt_15,21.53112,22.9,27.36,37.111111,18.772727,37.939394
r_gt_30,16.53527,17.35,21.6,28.444444,15.681818,24.545455
r_favor_7,15.06639,15.625,16.72,18.444444,17.818182,6.575758
r_favor_15,10.435685,12.0,9.76,11.333333,13.272727,18.575758
r_favor_30,7.809129,9.275,7.72,7.111111,11.227273,11.787879
search_term,0.20332,0.075,0.08,0.111111,0.090909,0.424242
pvi,8.680498,4.7,6.12,3.666667,4.863636,9.363636


#### Variance

In [88]:
# Column variances for each group of elections, one group per output column.
# numeric_only=True restricts the reduction to numeric and boolean columns;
# without it, recent pandas raises a TypeError on the text columns
# (candidate names, ids, ...) instead of skipping them.
# NOTE(review): a True/False column stored with object dtype would be skipped
# by numeric_only=True — confirm the flag columns are bool.
pd.concat([
    data3.var(numeric_only=True).rename('historical'),
    data3.loc[data3['id'].isin(model_f_better)].var(numeric_only=True).rename('model_f_better'),
    data3.loc[data3['id'].isin(model_p_better)].var(numeric_only=True).rename('model_p_better'),
    data3.loc[data3['id'].isin(list(only_wrong_in_model_f))].var(numeric_only=True).rename('only_wrong_model_f'),
    data3.loc[data3['id'].isin(list(only_wrong_in_model_p))].var(numeric_only=True).rename('only_wrong_model_p'),
    midterm8_abs.var(numeric_only=True).rename('midterm8'),
    # errors='ignore': a listed row may be absent if its column was not reduced.
], axis=1).drop(index=['percent', 'r_winner', 'd_winner'], errors='ignore')

Unnamed: 0,historical,model_f_better,model_p_better,only_wrong_model_f,only_wrong_model_p,midterm8
year,16.374931,16.869231,17.093333,23.111111,14.623377,0.0
r_incumbent,0.230463,0.240385,0.19,0.25,0.242424,0.246212
r_gt_7,215.713935,252.028205,276.99,535.0,246.738095,85.041667
r_gt_15,134.308402,145.989744,179.406667,241.361111,170.945887,394.933712
r_gt_30,73.699793,77.464103,127.25,194.777778,109.465368,190.130682
r_favor_7,128.528907,154.804487,105.46,118.527778,223.203463,32.001894
r_favor_15,70.571888,100.974359,59.356667,68.25,137.445887,151.064394
r_favor_30,40.980083,61.537821,37.626667,42.861111,84.183983,98.672348
search_term,0.162656,0.071154,0.076667,0.111111,0.08658,0.251894
pvi,33.993326,19.138462,19.276667,9.0,22.218615,33.926136
