In [3]:
import pandas as pd
import csv
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
%matplotlib inline
# from paramsearch import paramsearch
from itertools import product,chain

## Data Aggregation

#### Prepping Files 

Reading the CSV files

In [27]:
hm2016 = pd.read_csv('federalelections2016 (2).csv')
hm2014 = pd.read_csv('federalelections2014.csv')
hm2012 = pd.read_csv('Senate_Primary_2012.csv')
hm2010 = pd.read_csv('Primary2010.csv')
hm2008 = pd.read_csv('Primary2008.csv')
hm2006 = pd.read_csv('Primary2006.csv')
hm2004 = pd.read_csv('FederalElection2004.csv')

**Adding the year to each CSV for easier analysis**

In [28]:
hm2016['YEAR'] = '2016'
hm2014['YEAR'] = '2014'
hm2012['YEAR'] = '2012'
hm2010['YEAR'] = '2010'
hm2008['YEAR'] = '2008'
hm2006['YEAR'] = '2006'
hm2004['YEAR'] = '2004'

In [31]:
hm2016.head()

Unnamed: 0,STATE,(I),CANDIDATE NAME,TOTAL VOTES,PARTY,PRIMARY VOTES,GENERAL VOTES,YEAR
0,,,,,,,,2016
1,Alabama,(I),"Shelby, Richard C.",,R,505586.0,1335104.0,2016
2,Alabama,,"McConnell, Jonathan",,R,214770.0,,2016
3,Alabama,,"Martin, John",,R,23558.0,,2016
4,Alabama,,"Bowman, Marcus",,R,19707.0,,2016


**Creating IDs I will later merge on**

In [32]:
hm2016["ID"] = hm2016["STATE"].map(str) + hm2016["YEAR"] +hm2016["PARTY"]
hm2016["ID2"] = hm2016["STATE"].map(str) + hm2016["YEAR"]

hm2014["ID"] = hm2014["STATE"].map(str) + hm2014["YEAR"] +hm2014["PARTY"]
hm2014["ID2"] = hm2014["STATE"].map(str) + hm2014["YEAR"]

hm2012["ID"] = hm2012["STATE"].map(str) + hm2012["YEAR"] +hm2012["PARTY"]
hm2012["ID2"] = hm2012["STATE"].map(str) + hm2012["YEAR"]

hm2010["ID"] = hm2010["STATE"].map(str) + hm2010["YEAR"] +hm2010["PARTY"]
hm2010["ID2"] = hm2010["STATE"].map(str) + hm2010["YEAR"]

hm2008["ID"] = hm2008["STATE"].map(str) + hm2008["YEAR"] +hm2008["PARTY"]
hm2008["ID2"] = hm2008["STATE"].map(str) + hm2008["YEAR"]

hm2006["ID"] = hm2006["STATE"].map(str) + hm2006["YEAR"] +hm2006["PARTY"]
hm2006["ID2"] = hm2006["STATE"].map(str) + hm2006["YEAR"]

hm2004["ID"] = hm2004["STATE"].map(str) + hm2004["YEAR"] +hm2004["PARTY"]
hm2004["ID2"] = hm2004["STATE"].map(str) + hm2004["YEAR"]


**Dropping weird columns & fixing weird merging errors**

In [33]:
hm2014 = hm2014.drop('Unnamed: 8', axis=1).drop('Unnamed: 9',axis=1)

In [34]:
hm2012.columns = [col.strip() for col in hm2012.columns]

In [35]:
hm2014.columns = [col.strip() for col in hm2014.columns]

In [37]:
hm2016.columns = [col.strip() for col in hm2016.columns]

**Append files together**

In [39]:
## hm is now the master df
hm = hm2016.append(hm2012).append(hm2004).append(hm2006).append(hm2008).append(hm2010).append(hm2014)

#### Party data

**Removing all third party candidates**

In [41]:
#Fixing formatting errors that came with the file.
hm = hm.replace(to_replace = 'R*',value='R').replace(to_replace = 'R ',value='R').replace(to_replace = 'D*',value='D').replace(to_replace = 'D* ',value='D').replace(to_replace = 'D ',value='D')

In [56]:
#Keeping only major party candiates
hm_party = hm[hm['PARTY'].isin(['R','D'])]

**Throwing out rows that are interupting analysis**

In [57]:
# Unnopposed candidates don't have vote counts associated with their races
hm_party2 = hm_party[~hm_party['PRIMARY VOTES'].isin(['Unopposed','*','#','14*','20*','1,040*','1,616**','Loser','Winner','Withdrew','Unoppsed'])]

In [58]:
hm_party2 = hm_party2[~hm_party2['TOTAL VOTES'].isin(['Party Votes:','Total State Votes:'])]

In [59]:
hm_party2 = hm_party2[~hm_party2['ID'].isin(['South Carolina2014R','Wyoming2008R'])]

**Find total votes for each party, each state **

In [60]:
#Creating DF that's only the merge ID and primary vote count
hm_partytotal = hm_party2[['ID','PRIMARY VOTES']]

In [16]:
hm_partytotal = hm_partytotal.dropna()

In [17]:
hm_partytotal["PRIMARY VOTES"]=hm_partytotal["PRIMARY VOTES"].astype(float)

In [92]:
#Summing the total number of votes for all primary candidates for each party
hm_partytotal = hm_partytotal.groupby(['ID']).sum().reset_index()

In [94]:
hm_partytotal = hm_partytotal.rename(columns = {'PRIMARY VOTES':'Total Party Votes'})

In [96]:
hm_partytotal["Total Party Votes"] = hm_partytotal["Total Party Votes"].astype(float)

**Counting how many candidates ran in the primary**

In [97]:
#Counting number of instances of each "ID" which represents one cadidate for each state, party and year.
NumRunning = hm_party2.groupby(['ID']).count().reset_index()

In [98]:
#Making a DF that's the count of the number of candidates, and whether an incumbant ran. 
NumRunning = NumRunning[['ID','PRIMARY VOTES','(I)']]

In [99]:
NumRunning['(I)'].value_counts()

0    259
1    145
Name: (I), dtype: int64

In [100]:
NumRunning = NumRunning.rename(columns = {'PRIMARY VOTES':'Candidate Count'})

**Finding identifying the max vote for each party, each state**

In [101]:
PartyWin = hm_party2[['ID','PRIMARY VOTES','STATE','YEAR','PARTY']]

In [102]:
PartyWin = PartyWin.dropna()

In [103]:
PartyWin["PRIMARY VOTES"]= PartyWin["PRIMARY VOTES"].astype(float)

In [104]:
#Grouping on unique state/party/year ID and identifying the max votes received. 
PartyWin = PartyWin.groupby(['ID']).max().reset_index()

In [105]:
PartyWin = PartyWin.rename(columns = {'PRIMARY VOTES':'Primary Winner'})

**Merging Columns Back Together**

In [106]:
#Merging DFs with calculated number of candidates running, number of candidates per party, and number of votes of party primary winner. 
votes_merged = hm_partytotal.merge(NumRunning, left_on='ID', right_on='ID', how='left')

In [107]:
votes_merged = votes_merged.merge(PartyWin, left_on='ID', right_on='ID', how = 'left')

In [108]:
votes_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404 entries, 0 to 403
Data columns (total 8 columns):
ID                   404 non-null object
Total Party Votes    404 non-null float64
Candidate Count      404 non-null int64
(I)                  404 non-null int64
Primary Winner       390 non-null float64
STATE                390 non-null object
YEAR                 390 non-null object
PARTY                390 non-null object
dtypes: float64(2), int64(2), object(4)
memory usage: 28.4+ KB


**Calcuating the total number of votes received by losers of the primaries**

In [112]:
votes_merged['Primary Loser Vote'] = votes_merged['Total Party Votes'] - votes_merged['Primary Winner']

In [89]:
votes_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404 entries, 0 to 403
Data columns (total 8 columns):
ID                   404 non-null object
Total Party Votes    395 non-null object
PRIMARY VOTES        404 non-null int64
(I)                  404 non-null int64
Primary Winner       390 non-null float64
STATE                390 non-null object
YEAR                 390 non-null object
PARTY                390 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 28.4+ KB


In [115]:
votes_merged

Unnamed: 0,ID,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote
0,Alabama2008D,1.120744e+15,3,0,112074.0,Alabama,2008,D,1.120744e+15
1,Alabama2008R,1.996902e+10,2,1,199690.0,Alabama,2008,R,1.996882e+10
2,Alabama2010D,1.609931e+11,2,0,160993.0,Alabama,2010,D,1.609929e+11
3,Alabama2010R,4.053988e+10,2,1,405398.0,Alabama,2010,R,4.053947e+10
4,Alabama2016D,1.538971e+11,2,0,153897.0,Alabama,2016,D,1.538970e+11
5,Alabama2016R,5.055862e+26,5,1,505586.0,Alabama,2016,R,5.055862e+26
6,Alaska2004D,4.088111e+12,3,0,40881.0,Alaska,2004,D,4.088111e+12
7,Alaska2004R,4.571029e+16,4,1,45710.0,Alaska,2004,R,4.571029e+16
8,Alaska2008D,6.374755e+11,3,0,63747.0,Alaska,2008,D,6.374754e+11
9,Alaska2008R,6.690028e+27,7,1,66900.0,Alaska,2008,R,6.690028e+27


#### Final Results Data

Creating column with final election results numbers.
These will be the target values the model trains on. 

In [116]:
hm_finals = hm[hm['PARTY'].isin(['R','D'])]

In [117]:
hm_finals = hm_finals[['GENERAL VOTES','ID','ID2']]

In [118]:
hm_finals = hm_finals.dropna()

#### Merging

Merging the final results data and the third party data to the master merged dataframe.

In [120]:
hm_merged = hm_finals.merge(votes_merged, left_on='ID', right_on='ID', how='left')

** Throwing out NaNs for races that didn't have primary data**

In [121]:
hm_merged = hm_merged.dropna()

**Loop adding a column for whilch party was in the oval office during the time of the election.**

In [129]:
Republican = ['2008','2006','2004','2002','1992','1990']

OfficeParty = []


for year in hm_merged['YEAR']:
    if year in Republican:
        OfficeParty.append('R')
        
    else:
        OfficeParty.append('D')
    
hm_merged['OFFICE PARTY'] = OfficeParty

** Adding a Column for whether or not a candiate's party was in office during the time of the election.**

In [130]:
hm_merged['Party In House'] = np.where(hm_merged['PARTY']== hm_merged['OFFICE PARTY'], 1,0)

In [131]:
hm_model_ready = hm_merged.drop('ID',axis=1).drop('ID2',axis=1).drop('OFFICE PARTY',axis=1)

In [132]:
hm_model_ready = hm_model_ready[~hm_model_ready['GENERAL VOTES'].isin(['#'])]

In [133]:
hm_model_ready

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
0,1.3351e+06,5.055862e+26,5.0,1.0,505586.0,Alabama,2016,R,5.055862e+26,0
1,748709,1.538971e+11,2.0,0.0,153897.0,Alabama,2016,D,1.538970e+11,1
2,138149,3.954585e+16,4.0,1.0,39545.0,Alaska,2016,R,3.954585e+16,0
3,36200,1.522810e+09,2.0,0.0,15228.0,Alaska,2016,D,1.522795e+09,1
4,1.35927e+06,3.025322e+21,4.0,1.0,302532.0,Arizona,2016,R,3.025322e+21,0
5,1.03124e+06,3.335860e+05,1.0,0.0,333586.0,Arizona,2016,D,0.000000e+00,1
6,661984,2.980399e+10,2.0,1.0,298039.0,Arkansas,2016,R,2.980369e+10,0
8,7.54276e+06,3.000689e+39,7.0,0.0,3000689.0,California,2016,D,3.000689e+39,1
9,4.70142e+06,3.000689e+39,7.0,0.0,3000689.0,California,2016,D,3.000689e+39,1
10,1.37071e+06,2.623440e+05,1.0,1.0,262344.0,Colorado,2016,D,0.000000e+00,1


In [134]:
hm_model_ready["GENERAL VOTES"]= hm_model_ready["GENERAL VOTES"].astype(float)
hm_model_ready["Party In House"]= hm_model_ready["Party In House"].astype(float)

## Modeling 

#### EDA & Futhur Cleaning

In [135]:
hm_model_ready.describe()

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,Primary Loser Vote,Party In House
count,337.0,337.0,337.0,337.0,337.0,337.0,337.0
mean,1121743.0,9.463103e+74,3.424332,0.37092,295909.1,9.463103e+74,0.492582
std,1195753.0,1.4412400000000002e+76,2.572936,0.483769,395804.4,1.4412400000000002e+76,0.500688
min,29377.0,0.0,1.0,0.0,6110.0,-516183.0,0.0
25%,286409.0,729137.0,2.0,0.0,70424.0,0.0,0.0
50%,806787.0,12702370000000.0,3.0,0.0,166627.0,12702370000000.0,0.0
75%,1479471.0,5.282663e+21,5.0,1.0,389613.0,5.282663e+21,1.0
max,7864624.0,2.5754520000000003e+77,18.0,1.0,3000689.0,2.5754520000000003e+77,1.0


In [136]:
hm_model_ready.sort_values(by=['GENERAL VOTES'])

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
414,29377.0,7.200301e+15,4.0,0.0,7200.0,Wyoming,2014,D,7.200301e+15,1.0
3,36200.0,1.522810e+09,2.0,0.0,15228.0,Alaska,2016,D,1.522795e+09,1.0
141,53019.0,9.173463e+11,3.0,0.0,9173.0,Wyoming,2012,D,9.173463e+11,1.0
273,57671.0,2.492400e+04,1.0,0.0,24924.0,Wyoming,2006,D,0.000000e+00,0.0
277,60045.0,1.803569e+12,3.0,0.0,18035.0,Alaska,2010,D,1.803569e+12,1.0
250,64417.0,4.064700e+04,1.0,0.0,40647.0,North Dakota,2006,R,0.000000e+00,1.0
219,69734.0,6.110577e+07,2.0,0.0,6110.0,Delaware,2006,R,6.109966e+07,1.0
335,72699.0,2.275000e+04,1.0,0.0,22750.0,Vermont,2010,R,0.000000e+00,0.0
131,72898.0,6.358208e+07,2.0,0.0,6358.0,Vermont,2012,R,6.357573e+07,0.0
206,75398.0,9.591206e+11,3.0,0.0,9591.0,Vermont,2004,R,9.591206e+11,1.0


In [137]:
#Looks like there are some rogue additional states...
len(hm_model_ready['STATE'].value_counts())

51

In [138]:
hm_model_ready['STATE'].unique()

##Virgin Islands not a US state
##Missouri counted twice due to double-space error

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois',
       'Indiana', 'Iowa', 'Kentucky', 'Maryland', 'Missouri', 'Nevada',
       'New Hampshire', 'North Carolina', 'North Dakota', 'Ohio',
       'Oregon', 'Pennsylvania', 'Utah', 'Vermont', 'Washington',
       'Wisconsin', 'Connecticut', 'Delaware', 'Maine', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri ', 'Montana',
       'Nebraska', 'New Jersey', 'New Mexico', 'New York', 'Rhode Island',
       'Tennessee', 'Texas', 'Virginia', 'West Virginia', 'Wyoming',
       'Kansas', 'Oklahoma', 'South Carolina', 'Louisiana',
       'South Dakota'], dtype=object)

In [139]:
#Dropping VI
hm_model_ready2 = hm_model_ready[~hm_model_ready['STATE'].isin(['Virgin Islands'])]

In [140]:
#Removing spaces

hm_model_ready2 = hm_model_ready2.replace(to_replace = 'Missouri ',value='Missouri')

In [141]:
len(hm_model_ready2['STATE'].unique())

50

In [143]:
#Counting the number of Incumbants who ran
hm_model_ready2['(I)'].value_counts()

0.0    212
1.0    125
Name: (I), dtype: int64

In [144]:
hm_model_ready2.groupby(['STATE']).sum().sort_values(['GENERAL VOTES'])

Unnamed: 0_level_0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,Primary Loser Vote,Party In House
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
South Dakota,140741.0,4.137714e+22,5.0,0.0,41377.0,4.137714e+22,0.0
Delaware,557025.0,3735125000.0,8.0,1.0,98441.0,3735026000.0,2.0
Wyoming,582045.0,7.796665e+20,17.0,3.0,270989.0,7.796665e+20,3.0
North Dakota,771729.0,6786283000.0,6.0,1.0,316588.0,6785966000.0,2.0
Alaska,880306.0,4.559298e+18,23.0,3.0,318109.0,4.559298e+18,4.0
Vermont,970052.0,962282200000.0,14.0,3.0,252712.0,962281900000.0,4.0
Kansas,1091200.0,34789190000.0,4.0,1.0,347891.0,34788840000.0,1.0
Rhode Island,1118726.0,6933684000000.0,9.0,3.0,294531.0,6933683000000.0,3.0
Louisiana,1191987.0,786614500000000.0,6.0,1.0,163034.0,786614500000000.0,1.0
Montana,1212195.0,6.582798e+20,18.0,3.0,489773.0,6.582798e+20,3.0


### Modeling Trial 1 - Catboost (With All Inputs Included) 

** Splitting x&y (predictors and target) **

In [145]:
hm_model_ready2

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
0,1335104.0,5.055862e+26,5.0,1.0,505586.0,Alabama,2016,R,5.055862e+26,0.0
1,748709.0,1.538971e+11,2.0,0.0,153897.0,Alabama,2016,D,1.538970e+11,1.0
2,138149.0,3.954585e+16,4.0,1.0,39545.0,Alaska,2016,R,3.954585e+16,0.0
3,36200.0,1.522810e+09,2.0,0.0,15228.0,Alaska,2016,D,1.522795e+09,1.0
4,1359267.0,3.025322e+21,4.0,1.0,302532.0,Arizona,2016,R,3.025322e+21,0.0
5,1031245.0,3.335860e+05,1.0,0.0,333586.0,Arizona,2016,D,0.000000e+00,1.0
6,661984.0,2.980399e+10,2.0,1.0,298039.0,Arkansas,2016,R,2.980369e+10,0.0
8,7542759.0,3.000689e+39,7.0,0.0,3000689.0,California,2016,D,3.000689e+39,1.0
9,4701417.0,3.000689e+39,7.0,0.0,3000689.0,California,2016,D,3.000689e+39,1.0
10,1370710.0,2.623440e+05,1.0,1.0,262344.0,Colorado,2016,D,0.000000e+00,1.0


In [170]:
ycat = hm_model_ready2['GENERAL VOTES'] 
Xcat = hm_model_ready2.drop('GENERAL VOTES',axis = 1)

** Splitting for T-T-S **

In [173]:
X_trainc, X_testc, y_trainc, y_testc = train_test_split(Xcat, ycat, test_size=0.3, random_state=42)

** Running Catboost **

In [148]:
X_trainc.columns

Index(['Total Party Votes', 'Candidate Count', '(I)', 'Primary Winner',
       'STATE', 'YEAR', 'PARTY', 'Primary Loser Vote', 'Party In House'],
      dtype='object')

In [174]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_trainc.dtypes != np.float)[0]
train_pool = Pool(X_trainc, y_trainc, cat_features=categorical_features_indices)
test_pool = Pool(X_testc, y_testc, cat_features=categorical_features_indices)


Catset=CatBoostRegressor(learning_rate = 0.01, depth = 10)
Cat = Catset.fit(X_trainc, y_trainc, cat_features=categorical_features_indices, plot=True, use_best_model=True)

You should provide test set for use best model. use_best_model parameter swiched to false value.


0:	learn: 1688227.3219372	total: 57.5ms	remaining: 57.4s
1:	learn: 1677629.6992692	total: 66.7ms	remaining: 33.3s
2:	learn: 1667724.3253534	total: 102ms	remaining: 34s
3:	learn: 1655510.9678178	total: 106ms	remaining: 26.4s
4:	learn: 1647254.3986926	total: 109ms	remaining: 21.6s
5:	learn: 1636004.3180621	total: 110ms	remaining: 18.3s
6:	learn: 1627501.7452530	total: 112ms	remaining: 15.9s
7:	learn: 1616532.1717697	total: 115ms	remaining: 14.2s
8:	learn: 1605503.8694167	total: 117ms	remaining: 12.9s
9:	learn: 1596466.7905542	total: 120ms	remaining: 11.8s
10:	learn: 1585297.9051529	total: 121ms	remaining: 10.9s
11:	learn: 1573943.6948608	total: 126ms	remaining: 10.4s
12:	learn: 1564809.7368270	total: 130ms	remaining: 9.84s
13:	learn: 1554704.2208400	total: 132ms	remaining: 9.32s
14:	learn: 1543931.8499786	total: 134ms	remaining: 8.81s
15:	learn: 1534441.0696169	total: 137ms	remaining: 8.4s
16:	learn: 1524225.3688867	total: 138ms	remaining: 7.99s
17:	learn: 1513523.2547719	total: 141ms	re

153:	learn: 791609.5869427	total: 988ms	remaining: 5.43s
154:	learn: 788323.2738033	total: 1s	remaining: 5.47s
155:	learn: 785727.4044688	total: 1s	remaining: 5.44s
156:	learn: 782717.0259620	total: 1.01s	remaining: 5.42s
157:	learn: 781499.4320947	total: 1.01s	remaining: 5.39s
158:	learn: 778625.5059235	total: 1.02s	remaining: 5.42s
159:	learn: 775419.4595919	total: 1.03s	remaining: 5.4s
160:	learn: 772874.0166253	total: 1.03s	remaining: 5.38s
161:	learn: 770191.1753177	total: 1.03s	remaining: 5.35s
162:	learn: 767249.6425954	total: 1.04s	remaining: 5.33s
163:	learn: 765233.0942878	total: 1.04s	remaining: 5.32s
164:	learn: 762305.5312112	total: 1.04s	remaining: 5.29s
165:	learn: 758997.2670098	total: 1.05s	remaining: 5.28s
166:	learn: 756218.1930269	total: 1.05s	remaining: 5.26s
167:	learn: 754736.7335254	total: 1.06s	remaining: 5.24s
168:	learn: 752873.5162095	total: 1.06s	remaining: 5.21s
169:	learn: 750447.7970002	total: 1.08s	remaining: 5.29s
170:	learn: 747636.4969630	total: 1.09

317:	learn: 555149.0332553	total: 2.04s	remaining: 4.37s
318:	learn: 554833.6400652	total: 2.04s	remaining: 4.35s
319:	learn: 553784.2521836	total: 2.05s	remaining: 4.35s
320:	learn: 552709.0371896	total: 2.05s	remaining: 4.34s
321:	learn: 551647.4297972	total: 2.07s	remaining: 4.36s
322:	learn: 550879.6231886	total: 2.07s	remaining: 4.35s
323:	learn: 550788.9630069	total: 2.08s	remaining: 4.33s
324:	learn: 550068.3093252	total: 2.08s	remaining: 4.31s
325:	learn: 549149.6387943	total: 2.08s	remaining: 4.29s
326:	learn: 548655.4662935	total: 2.08s	remaining: 4.28s
327:	learn: 547795.3313378	total: 2.08s	remaining: 4.26s
328:	learn: 547072.3462631	total: 2.08s	remaining: 4.24s
329:	learn: 546507.0515084	total: 2.08s	remaining: 4.23s
330:	learn: 545693.5421099	total: 2.08s	remaining: 4.21s
331:	learn: 544737.2049610	total: 2.08s	remaining: 4.19s
332:	learn: 544475.5479888	total: 2.08s	remaining: 4.18s
333:	learn: 543523.2061570	total: 2.09s	remaining: 4.16s
334:	learn: 542503.7994615	tota

484:	learn: 463234.9620585	total: 2.84s	remaining: 3.01s
485:	learn: 462711.4994484	total: 2.84s	remaining: 3s
486:	learn: 461773.7245871	total: 2.85s	remaining: 3s
487:	learn: 461123.9780767	total: 2.86s	remaining: 3s
488:	learn: 460847.2597032	total: 2.86s	remaining: 2.99s
489:	learn: 460459.3430705	total: 2.87s	remaining: 2.98s
490:	learn: 459943.5587833	total: 2.87s	remaining: 2.98s
491:	learn: 459513.8828664	total: 2.88s	remaining: 2.97s
492:	learn: 459373.0577321	total: 2.88s	remaining: 2.96s
493:	learn: 459088.5186732	total: 2.88s	remaining: 2.95s
494:	learn: 458689.7148597	total: 2.88s	remaining: 2.94s
495:	learn: 458373.1796031	total: 2.88s	remaining: 2.93s
496:	learn: 457516.2650198	total: 2.9s	remaining: 2.94s
497:	learn: 457212.3592880	total: 2.91s	remaining: 2.93s
498:	learn: 456531.8917939	total: 2.92s	remaining: 2.93s
499:	learn: 455938.7279658	total: 2.92s	remaining: 2.92s
500:	learn: 455028.3397760	total: 2.93s	remaining: 2.92s
501:	learn: 454947.7896589	total: 2.93s	r

637:	learn: 397981.1893682	total: 4.08s	remaining: 2.32s
638:	learn: 397729.3307516	total: 4.09s	remaining: 2.31s
639:	learn: 397623.1923988	total: 4.09s	remaining: 2.3s
640:	learn: 397430.6437535	total: 4.1s	remaining: 2.29s
641:	learn: 397030.3506369	total: 4.1s	remaining: 2.29s
642:	learn: 396269.7437785	total: 4.12s	remaining: 2.29s
643:	learn: 396094.3198738	total: 4.12s	remaining: 2.28s
644:	learn: 395696.5904708	total: 4.13s	remaining: 2.27s
645:	learn: 395435.1335230	total: 4.13s	remaining: 2.26s
646:	learn: 395418.5175651	total: 4.13s	remaining: 2.25s
647:	learn: 395240.0447910	total: 4.13s	remaining: 2.24s
648:	learn: 395050.8021165	total: 4.13s	remaining: 2.23s
649:	learn: 394322.3116303	total: 4.15s	remaining: 2.23s
650:	learn: 393761.9290187	total: 4.17s	remaining: 2.24s
651:	learn: 393590.6941784	total: 4.17s	remaining: 2.23s
652:	learn: 392860.1878859	total: 4.19s	remaining: 2.23s
653:	learn: 392660.5135881	total: 4.19s	remaining: 2.22s
654:	learn: 392275.9106137	total: 

792:	learn: 343519.5901584	total: 6.01s	remaining: 1.57s
793:	learn: 343176.0324476	total: 6.04s	remaining: 1.56s
794:	learn: 342788.5156222	total: 6.06s	remaining: 1.56s
795:	learn: 342359.9208881	total: 6.08s	remaining: 1.56s
796:	learn: 341848.9191076	total: 6.1s	remaining: 1.55s
797:	learn: 341119.6531104	total: 6.13s	remaining: 1.55s
798:	learn: 340856.1733906	total: 6.15s	remaining: 1.55s
799:	learn: 340427.1099446	total: 6.17s	remaining: 1.54s
800:	learn: 340035.8868468	total: 6.18s	remaining: 1.54s
801:	learn: 339505.2691490	total: 6.21s	remaining: 1.53s
802:	learn: 339308.1667546	total: 6.21s	remaining: 1.52s
803:	learn: 338942.9721236	total: 6.24s	remaining: 1.52s
804:	learn: 338598.8614764	total: 6.27s	remaining: 1.52s
805:	learn: 337774.0482671	total: 6.3s	remaining: 1.52s
806:	learn: 337469.8023983	total: 6.32s	remaining: 1.51s
807:	learn: 337261.7963168	total: 6.32s	remaining: 1.5s
808:	learn: 336798.3143964	total: 6.34s	remaining: 1.5s
809:	learn: 336097.4901768	total: 6

941:	learn: 297276.3785449	total: 8.8s	remaining: 542ms
942:	learn: 296909.7732025	total: 8.82s	remaining: 533ms
943:	learn: 296686.1988646	total: 8.84s	remaining: 524ms
944:	learn: 296528.1483646	total: 8.84s	remaining: 515ms
945:	learn: 296193.8073661	total: 8.86s	remaining: 506ms
946:	learn: 296133.5594724	total: 8.86s	remaining: 496ms
947:	learn: 295740.4581095	total: 8.88s	remaining: 487ms
948:	learn: 295651.1731447	total: 8.9s	remaining: 478ms
949:	learn: 295108.8608223	total: 8.91s	remaining: 469ms
950:	learn: 294713.0213905	total: 8.93s	remaining: 460ms
951:	learn: 294515.5470930	total: 8.95s	remaining: 451ms
952:	learn: 294319.3352469	total: 8.96s	remaining: 442ms
953:	learn: 294097.8442381	total: 8.96s	remaining: 432ms
954:	learn: 293621.1902155	total: 8.98s	remaining: 423ms
955:	learn: 293337.6963390	total: 9s	remaining: 414ms
956:	learn: 293032.8054663	total: 9.02s	remaining: 405ms
957:	learn: 292857.8148571	total: 9.04s	remaining: 396ms
958:	learn: 292508.5722060	total: 9.

In [175]:
## Checking out feature importance for this model. 

feature_importances = Cat.get_feature_importance(train_pool)
feature_names = X_trainc.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Primary Winner: 63.45367168834086
STATE: 19.724486501974773
YEAR: 5.437483784794085
Total Party Votes: 3.513279679821805
Primary Loser Vote: 3.4872241556398724
Candidate Count: 1.7806523175081774
(I): 1.2734354512146866
PARTY: 1.0096136716039157
Party In House: 0.32015274910182023


In [176]:
Cat.eval_metrics(data = test_pool, metrics =['RMSE'], plot = True)

{'RMSE': [1480151.3499668173,
  1471635.392626911,
  1464450.4199143786,
  1453736.8209167412,
  1446811.8312622977,
  1437839.2214345708,
  1430078.5348399337,
  1421029.44186573,
  1411884.9101654019,
  1405432.2937152067,
  1396455.3632857092,
  1387018.3935280622,
  1379313.5778342884,
  1371234.496477952,
  1362425.8612033236,
  1354467.963925453,
  1346201.0277897655,
  1338498.8147689677,
  1329729.2603133367,
  1322184.9175510695,
  1314410.0710287767,
  1308730.69288239,
  1303005.3987718597,
  1297657.6742860023,
  1290648.918335866,
  1285416.9364310238,
  1277798.883954262,
  1271026.6827656652,
  1264912.7521543505,
  1257507.600349949,
  1251288.8769597642,
  1244905.9303146433,
  1239275.59562923,
  1232648.103929197,
  1224428.8874719364,
  1218782.0940844966,
  1211125.550738681,
  1206655.762019574,
  1198764.1649850383,
  1193048.580675692,
  1186681.5847802036,
  1180503.5359692771,
  1173825.7053083542,
  1168747.5040782774,
  1162248.4873620735,
  1157374.96198166

In [177]:
Cat.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

{'R2': [0.004802865618705132,
  0.00895581221047037,
  0.01323226209832995,
  0.018880077688605557,
  0.01909870496374977,
  0.02362886873371972,
  0.025163629970973433,
  0.029923548853298843,
  0.03477720539126361,
  0.03688920158478981,
  0.041989613978127616,
  0.04846965628755384,
  0.052678912253961596,
  0.05735631593473811,
  0.06244181527174908,
  0.06662159572638138,
  0.07150806455861036,
  0.07585722122086036,
  0.08134376413345679,
  0.08588141057552368,
  0.09113008159957359,
  0.09349631774404332,
  0.09703814276580258,
  0.09930929753167084,
  0.10335314177263177,
  0.10683407315722782,
  0.11255514789806609,
  0.11640593239795616,
  0.1200107837897999,
  0.1253734708619585,
  0.12997893250516046,
  0.13412521607134176,
  0.13746592338231445,
  0.14289006831379358,
  0.14947862868808326,
  0.15459128652887855,
  0.1608655117480171,
  0.16446328768681961,
  0.17123068338732983,
  0.17635399351777747,
  0.18133830771337733,
  0.1866289588192811,
  0.1918754253917162,
  0.

In [178]:
Cat.score(X_testc,y_testc)

540369.388083104

### Modeling Trial 2 - Catboost (Testing changing inputs to ones deemed most influential)

**Slight improvement...**

In [253]:
hm_model_ready2.columns

Index(['GENERAL VOTES', 'Total Party Votes', 'Candidate Count', '(I)',
       'Primary Winner', 'STATE', 'YEAR', 'PARTY', 'Primary Loser Vote',
       'Party In House'],
      dtype='object')

In [285]:
ycat2 = hm_model_ready2['GENERAL VOTES'] 
Xcat2 = hm_model_ready2[['Primary Winner','Primary Loser Vote','STATE','PARTY']]

In [286]:
X_trainc2, X_testc2, y_trainc2, y_testc2 = train_test_split(Xcat2, ycat2, test_size=0.2, random_state=42)

In [287]:
categorical_features_indices = np.where(X_trainc2.dtypes != np.float)[0]
train_pool = Pool(X_trainc2, y_trainc2, cat_features=categorical_features_indices)
test_pool = Pool(X_testc2, y_testc2, cat_features=categorical_features_indices)


Cat = Catset.fit(X_trainc2, y_trainc2, cat_features=categorical_features_indices);

0:	learn: 1684757.4033906	total: 21.7ms	remaining: 21.7s
1:	learn: 1672926.1463754	total: 24.9ms	remaining: 12.4s
2:	learn: 1662097.8661823	total: 27.6ms	remaining: 9.18s
3:	learn: 1650560.6831565	total: 31.5ms	remaining: 7.84s
4:	learn: 1640039.0244205	total: 33.6ms	remaining: 6.68s
5:	learn: 1628434.9068460	total: 43.7ms	remaining: 7.24s
6:	learn: 1617483.1188415	total: 45.8ms	remaining: 6.49s
7:	learn: 1606895.0334427	total: 64.5ms	remaining: 7.99s
8:	learn: 1597455.6307280	total: 68.6ms	remaining: 7.55s
9:	learn: 1586104.3888625	total: 72ms	remaining: 7.13s
10:	learn: 1575613.8285231	total: 74.8ms	remaining: 6.73s
11:	learn: 1565238.0265148	total: 77.8ms	remaining: 6.41s
12:	learn: 1555449.1153017	total: 81.3ms	remaining: 6.17s
13:	learn: 1544666.0901005	total: 85.3ms	remaining: 6.01s
14:	learn: 1534290.9779754	total: 108ms	remaining: 7.07s
15:	learn: 1523999.4812966	total: 125ms	remaining: 7.68s
16:	learn: 1513293.3566697	total: 126ms	remaining: 7.28s
17:	learn: 1501937.0786946	to

162:	learn: 743726.1610513	total: 807ms	remaining: 4.14s
163:	learn: 741581.1876598	total: 811ms	remaining: 4.13s
164:	learn: 739458.3098001	total: 815ms	remaining: 4.12s
165:	learn: 737046.7238544	total: 825ms	remaining: 4.14s
166:	learn: 734797.0642054	total: 826ms	remaining: 4.12s
167:	learn: 732487.1229818	total: 827ms	remaining: 4.1s
168:	learn: 730061.8287565	total: 842ms	remaining: 4.14s
169:	learn: 728063.0372687	total: 843ms	remaining: 4.12s
170:	learn: 725698.0705422	total: 845ms	remaining: 4.09s
171:	learn: 723606.3395625	total: 846ms	remaining: 4.07s
172:	learn: 721212.7586141	total: 847ms	remaining: 4.05s
173:	learn: 719657.6640238	total: 848ms	remaining: 4.03s
174:	learn: 717090.6560033	total: 849ms	remaining: 4s
175:	learn: 715031.2543785	total: 851ms	remaining: 3.98s
176:	learn: 712771.2054536	total: 864ms	remaining: 4.02s
177:	learn: 711219.3929841	total: 865ms	remaining: 4s
178:	learn: 710372.2886528	total: 866ms	remaining: 3.97s
179:	learn: 708242.8558272	total: 867m

342:	learn: 536880.3497144	total: 1.62s	remaining: 3.1s
343:	learn: 535928.0304018	total: 1.64s	remaining: 3.12s
344:	learn: 535238.4780527	total: 1.65s	remaining: 3.14s
345:	learn: 534555.7052395	total: 1.65s	remaining: 3.13s
346:	learn: 534419.1709954	total: 1.65s	remaining: 3.11s
347:	learn: 533396.9618940	total: 1.67s	remaining: 3.12s
348:	learn: 533017.9378405	total: 1.67s	remaining: 3.12s
349:	learn: 532559.8594066	total: 1.68s	remaining: 3.11s
350:	learn: 531685.4985764	total: 1.68s	remaining: 3.1s
351:	learn: 530842.6104369	total: 1.69s	remaining: 3.12s
352:	learn: 530042.6965823	total: 1.7s	remaining: 3.11s
353:	learn: 529525.4413521	total: 1.7s	remaining: 3.1s
354:	learn: 529454.3463070	total: 1.7s	remaining: 3.09s
355:	learn: 528962.8307405	total: 1.7s	remaining: 3.08s
356:	learn: 528512.4336211	total: 1.71s	remaining: 3.09s
357:	learn: 527892.4979798	total: 1.73s	remaining: 3.09s
358:	learn: 527384.1658182	total: 1.73s	remaining: 3.08s
359:	learn: 526852.8182326	total: 1.73

487:	learn: 473298.8308882	total: 2.21s	remaining: 2.31s
488:	learn: 473220.4708102	total: 2.21s	remaining: 2.31s
489:	learn: 473151.2057130	total: 2.21s	remaining: 2.3s
490:	learn: 472506.5403383	total: 2.23s	remaining: 2.31s
491:	learn: 472208.1567865	total: 2.23s	remaining: 2.3s
492:	learn: 471824.2960352	total: 2.24s	remaining: 2.31s
493:	learn: 471411.5874745	total: 2.25s	remaining: 2.3s
494:	learn: 470394.7020716	total: 2.26s	remaining: 2.31s
495:	learn: 470307.0504656	total: 2.26s	remaining: 2.3s
496:	learn: 469782.0776022	total: 2.28s	remaining: 2.3s
497:	learn: 469773.7039188	total: 2.28s	remaining: 2.3s
498:	learn: 468884.1107500	total: 2.29s	remaining: 2.3s
499:	learn: 468194.3660137	total: 2.31s	remaining: 2.31s
500:	learn: 467836.8670660	total: 2.31s	remaining: 2.3s
501:	learn: 467511.2521505	total: 2.31s	remaining: 2.29s
502:	learn: 467081.4939662	total: 2.31s	remaining: 2.28s
503:	learn: 466788.0351660	total: 2.31s	remaining: 2.28s
504:	learn: 466639.7862468	total: 2.33s

639:	learn: 426693.0145831	total: 3.02s	remaining: 1.7s
640:	learn: 426531.1921319	total: 3.02s	remaining: 1.69s
641:	learn: 426449.4459240	total: 3.03s	remaining: 1.69s
642:	learn: 426065.9827394	total: 3.03s	remaining: 1.68s
643:	learn: 426004.3519097	total: 3.04s	remaining: 1.68s
644:	learn: 425884.3381448	total: 3.04s	remaining: 1.68s
645:	learn: 425562.5005740	total: 3.06s	remaining: 1.67s
646:	learn: 424913.6443509	total: 3.07s	remaining: 1.67s
647:	learn: 424741.6673421	total: 3.07s	remaining: 1.67s
648:	learn: 424449.2075056	total: 3.08s	remaining: 1.67s
649:	learn: 423953.6514663	total: 3.09s	remaining: 1.67s
650:	learn: 423790.6602635	total: 3.1s	remaining: 1.66s
651:	learn: 423325.9961816	total: 3.11s	remaining: 1.66s
652:	learn: 423174.6213260	total: 3.11s	remaining: 1.65s
653:	learn: 423123.2420348	total: 3.11s	remaining: 1.64s
654:	learn: 423021.4888172	total: 3.11s	remaining: 1.64s
655:	learn: 422866.1643991	total: 3.11s	remaining: 1.63s
656:	learn: 422712.7896118	total:

803:	learn: 384827.3324893	total: 4.24s	remaining: 1.03s
804:	learn: 384756.7108312	total: 4.25s	remaining: 1.03s
805:	learn: 384449.9294768	total: 4.26s	remaining: 1.02s
806:	learn: 383971.6310296	total: 4.28s	remaining: 1.02s
807:	learn: 383484.4049671	total: 4.29s	remaining: 1.02s
808:	learn: 383471.7846268	total: 4.29s	remaining: 1.01s
809:	learn: 383093.5715514	total: 4.3s	remaining: 1.01s
810:	learn: 382835.3396452	total: 4.32s	remaining: 1.01s
811:	learn: 382569.9923090	total: 4.33s	remaining: 1s
812:	learn: 382203.8048182	total: 4.34s	remaining: 1000ms
813:	learn: 381969.6012990	total: 4.36s	remaining: 996ms
814:	learn: 381902.2831973	total: 4.36s	remaining: 990ms
815:	learn: 381746.2400434	total: 4.38s	remaining: 987ms
816:	learn: 381173.2692076	total: 4.39s	remaining: 983ms
817:	learn: 380502.1291970	total: 4.4s	remaining: 980ms
818:	learn: 380285.4801077	total: 4.42s	remaining: 976ms
819:	learn: 379950.2064185	total: 4.43s	remaining: 972ms
820:	learn: 379949.8414589	total: 4

958:	learn: 344575.5931900	total: 6.12s	remaining: 262ms
959:	learn: 344542.6208450	total: 6.12s	remaining: 255ms
960:	learn: 344131.6120990	total: 6.14s	remaining: 249ms
961:	learn: 344099.1282843	total: 6.14s	remaining: 243ms
962:	learn: 344074.1379581	total: 6.15s	remaining: 236ms
963:	learn: 343754.6215265	total: 6.17s	remaining: 230ms
964:	learn: 343523.7172161	total: 6.18s	remaining: 224ms
965:	learn: 343270.7274833	total: 6.19s	remaining: 218ms
966:	learn: 343239.0110920	total: 6.2s	remaining: 211ms
967:	learn: 342811.5248421	total: 6.21s	remaining: 205ms
968:	learn: 342395.7724494	total: 6.22s	remaining: 199ms
969:	learn: 342179.8396809	total: 6.24s	remaining: 193ms
970:	learn: 341919.7561112	total: 6.25s	remaining: 187ms
971:	learn: 341726.3318507	total: 6.26s	remaining: 180ms
972:	learn: 341658.2803223	total: 6.28s	remaining: 174ms
973:	learn: 341627.9508467	total: 6.28s	remaining: 168ms
974:	learn: 341458.7917332	total: 6.29s	remaining: 161ms
975:	learn: 341395.0380270	total

In [288]:
Cat.score(X_testc2, y_testc2)

427378.79611818655

In [289]:
Cat.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

{'R2': [0.006417431881620139,
  0.012335995497816055,
  0.016695749655571634,
  0.023478551342188125,
  0.028109475619248037,
  0.03439677690081544,
  0.03943078792495569,
  0.04578071388719218,
  0.05027279663067752,
  0.056486447187428124,
  0.061793019405720684,
  0.0672571503281778,
  0.07357622102122297,
  0.07955580815846064,
  0.08660038431807249,
  0.0934725977240567,
  0.09986479686919914,
  0.1067125385384966,
  0.11295491362065735,
  0.12003573107302412,
  0.1262732098106879,
  0.13301906495722038,
  0.14050325401621666,
  0.1472238375100664,
  0.15186731244022578,
  0.15765507361461073,
  0.16339190574741025,
  0.16956173655056106,
  0.17456365417745456,
  0.17977404022501442,
  0.18584025712797492,
  0.19073178415478642,
  0.19593790770896047,
  0.20126873892210884,
  0.20716868471229477,
  0.21310510914848668,
  0.2191548871297908,
  0.22496338412157368,
  0.23056623652443842,
  0.23305509183706974,
  0.23750197456284616,
  0.24241940349451108,
  0.24830933663133836,
  0.

### Modeling Trial 3 - (Elastic Net Linear Regression)

In [194]:
ylr = hm_model_ready['GENERAL VOTES'] 
Xlr = hm_model_ready[['Primary Winner','Primary Loser Vote','Party In House','(I)']]

In [195]:
Xlr

Unnamed: 0,Primary Winner,Primary Loser Vote,Party In House,(I)
0,505586.0,5.055862e+26,0.0,1.0
1,153897.0,1.538970e+11,1.0,0.0
2,39545.0,3.954585e+16,0.0,1.0
3,15228.0,1.522795e+09,1.0,0.0
4,302532.0,3.025322e+21,0.0,1.0
5,333586.0,0.000000e+00,1.0,0.0
6,298039.0,2.980369e+10,0.0,1.0
8,3000689.0,3.000689e+39,1.0,0.0
9,3000689.0,3.000689e+39,1.0,0.0
10,262344.0,0.000000e+00,1.0,1.0


In [196]:
X_trainlr, X_testlr, y_trainlr, y_testlr = train_test_split(Xlr, ylr, test_size=0.3, random_state=42)

In [197]:
#Checking the predictability of the model with this alpha = 1
EN = ElasticNet(1, l1_ratio=.5)
LR = EN.fit(X_trainlr, y_trainlr)
rsq = LR.score(X_testlr, y_testlr)
adj_rsq = 1 - (1-rsq)*(len(ylr)-1)/(len(ylr)-Xlr.shape[1]-1)

preds = LR.predict(X_testlr)
rmse = np.sqrt(MSE(preds, y_testlr))

print(rsq)
print(adj_rsq)
print(rmse)


-0.1237935823195826
-0.13733326403427637
1226186.6329033938




#### Linear Regression (Logged)

In [199]:
hm_model_log = hm_model_ready

hm_model_log['Primary Winner']=np.log(hm_model_log['Primary Winner'])
hm_model_log['Primary Loser Vote']=np.log(hm_model_log['Primary Loser Vote'])
hm_model_log['GENERAL VOTES']=np.log(hm_model_log['GENERAL VOTES'])

  after removing the cwd from sys.path.


In [202]:
ylr2 = hm_model_log['GENERAL VOTES'] 
Xlr2 = hm_model_log[['Primary Winner','Primary Loser Vote','Party In House','(I)']]

In [203]:
ylr2

0      14.104520
1      13.526106
2      11.836088
3      10.496814
4      14.122456
5      13.846277
6      13.402997
8      15.836099
9      15.363375
10     14.130839
11     14.010516
14     15.391431
15     15.231870
16     14.574355
17     14.285343
18     12.633312
19     11.436617
20     13.014816
21     12.145521
22     14.918427
23     14.596985
24     14.168974
25     13.963022
26     13.738637
27     13.216691
28     13.901851
29     13.608789
46     14.322272
47     13.787684
48     14.136476
         ...    
380    12.272371
381    11.906210
382    12.758911
383    12.044300
384    12.433941
385    12.368816
386    13.858442
387    13.581429
388    12.565176
389    12.341901
390    14.168460
391    14.135890
392    13.232412
393    12.364388
394    13.230324
395    12.379702
396    13.197187
397    12.317949
398    11.436951
400    13.031839
402    13.038075
403    11.854677
405    13.653094
406    12.989627
407    14.866867
408    14.283880
411    12.549024
412    11.9599

In [204]:
Xlr2 = Xlr2.replace([np.inf, -np.inf], 0).fillna(0)


In [205]:
X_trainlr2, X_testlr2, y_trainlr2, y_testlr2 = train_test_split(Xlr2, ylr2, test_size=0.2, random_state=42)

In [206]:
#Checking the predictability of the model with this alpha = 1
EN = ElasticNet(1, l1_ratio=.5)
LR = EN.fit(X_trainlr2, y_trainlr2)
rsq = LR.score(X_testlr2, y_testlr2)
adj_rsq = 1 - (1-rsq)*(len(ylr2)-1)/(len(ylr2)-Xlr2.shape[1]-1)

preds = LR.predict(X_testlr2)
rmse = np.sqrt(MSE(preds, y_testlr2))

print(rsq)
print(adj_rsq)
print(rmse)

-0.09017337771523315
-0.10330799672385038
1.1390715912860274


### Modeling Trial 4 - (Splitting DFs by state-size)

*Didn't end up helping much. Let's just go back to the one with the full train set*

**Grouped states by electoral college vote:**<br>

**Smallest:** <br>
- *3 Electoral votes:* Alaska, Delaware, Montana, North Dakota, South Dakota, Vermont, Wyoming
- *4 Electoral votes:* Hawaii, Idaho, Maine, New Hampshire, Rhode Island
- *5 Electoral votes:* Nebraska, New Mexico, West Virginia

**Medium:** <br>
- *6 Electoral votes:* Arkansas, Iowa ,Kansas, Mississippi, Nevada, Utah
- *7 Electoral votes:* Connecticut, Oklahoma, Oregon
- *8 Electoral votes:* Kentucky, Louisiana
- *9 Electoral votes:* Alabama, Colorado, South Carolina
- *10 Electoral votes:* Maryland, Minnesota, Missouri, Wisconsin
- *11 Electoral votes:* Arizona, Indiana, Massachusetts, Tennessee

**Medium-Large:** <br>
- *12 Electoral votes:* Washington 
- *13 Electoral votes:* Virginia
- *14 Electoral votes:* New Jersey
- *15 Electoral votes:* North Carolina
- *16 Electoral votes:* Georgia, Michigan
- *18 Electoral votes:* Ohio

**Large:** <br>
- *20 Electoral votes:* Illinois, Pennsylvania
- *29 Electoral votes:* Florida, New York
- *38 Electoral votes:* Texas
- *55 Electoral votes:* California


#### Small States

In [None]:
low_density = ['Alaska', 'Delaware', 'Montana', 'North Dakota', 'South Dakota', 'Vermont', 'Wyoming', 'Hawaii', 'Idaho', 'Maine', 'New Hampshire', 'Rhode Island', 'Nebraska', 'New Mexico', 'West Virginia']

small = hm_model_ready2[hm_model_ready2['STATE'].isin(low_density)].drop('Third Party Votes',axis = 1)

In [None]:
# small = small.drop(small[small['Primary Loser Vote'] < 1].index)

In [None]:
small.describe()

**Splitting X&y**

In [None]:
ysmall = small['GENERAL VOTES'] 
Xsmall = small.drop('GENERAL VOTES',axis = 1)

**Splitting for T-T-S**

In [None]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(Xsmall, ysmall, test_size=0.3)

**Running Catboost**

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_s.dtypes != np.float)[0]
train_pool = Pool(X_train_s, y_train_s, cat_features=categorical_features_indices)
test_pool = Pool(X_test_s, y_test_s, cat_features=categorical_features_indices)


Cat_s = Catset.fit(X_train_s, y_train_s, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
#Would like to get it down to less than 20,000...
Cat_s.score(X_test_s, y_test_s)

In [None]:
## Checking out feature importance for this model. 

feature_importances = Cat_s.get_feature_importance(train_pool)
feature_names = X_train_s.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

In [None]:
Cat_s.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

#### Medium States

In [None]:
med_density = ['Arkansas', 'Iowa', 'Kansas', 'Mississippi', 'Nevada', 'Utah', 'Connecticut', 'Oklahoma', 'Oregon','Kentucky', 'Louisiana','Alabama', 'Colorado', 'South Carolina', 'Maryland', 'Minnesota','Missouri', 'Wisconsin','Arizona','Indiana', 'Massachusetts', 'Tennessee']

med = hm_model_ready2[hm_model_ready2['STATE'].isin(med_density)].drop('Third Party Votes',axis = 1)

In [None]:
med.describe()

In [None]:
ymed = med['GENERAL VOTES'] 
Xmed = med.drop('GENERAL VOTES',axis = 1)

In [None]:
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(Xmed, ymed, test_size=0.3)

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_m.dtypes != np.float)[0]
train_pool = Pool(X_train_m, y_train_m, cat_features=categorical_features_indices)
test_pool = Pool(X_test_m, y_test_m, cat_features=categorical_features_indices)

Cat_m = Catset.fit(X_train_m, y_train_m, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
#Would like to get it down to less than 20,000...
Cat_m.score(X_test_m, y_test_m)

#### Medium-Large State

In [None]:
medlar_density = ['Washington','Virginia','New Jersey','North Carolina','Georgia','Michigan','Ohio']

medlar = hm_model_ready2[hm_model_ready2['STATE'].isin(medlar_density)].drop('Third Party Votes',axis = 1)

In [None]:
medlar.describe()

In [None]:
ymedlar = medlar['GENERAL VOTES'] 
Xmedlar = medlar.drop('GENERAL VOTES',axis = 1)

In [None]:
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(Xmedlar, ymedlar, test_size=0.3)

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_m.dtypes != np.float)[0]
train_pool = Pool(X_train_ml, y_train_ml, cat_features=categorical_features_indices)
test_pool = Pool(X_test_m, y_test_m, cat_features=categorical_features_indices)

Cat_ml = Catset.fit(X_train_ml, y_train_ml, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
Cat_ml.score(X_test_ml, y_test_ml)

In [None]:
Cat_ml.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

##### Modeling Trial 4 - (Breaking DFs down by average voter turnout of the past 10 years)

In [None]:
voter_turnout = hm_model_ready2.groupby(['STATE']).mean().reset_index()
voter_turnout = voter_turnout.sort_values(by=['GENERAL VOTES'])

In [None]:
voter_turnout

In [None]:
plt.barh(voter_turnout['STATE'],voter_turnout['GENERAL VOTES'], color = '#1A62A5')
plt.ylabel('State', fontsize=14)
plt.xlabel('General Election Votes', fontsize=14)
plt.title('Voter Turnout Per State', fontsize=20)

**Grouped states by voter turnout from the past ten years:**<br>

**Smallest:** <br>
- Wyoming, Alaska, Vermont, Delaware, South Dakota, North Dakota, Hawaii, Rhode Island, 

**Medium:** <br>
- *6 Electoral votes:* Arkansas, Iowa ,Kansas, Mississippi, Nevada, Utah
- *7 Electoral votes:* Connecticut, Oklahoma, Oregon
- *8 Electoral votes:* Kentucky, Louisiana
- *9 Electoral votes:* Alabama, Colorado, South Carolina
- *10 Electoral votes:* Maryland, Minnesota, Missouri, Wisconsin


In [None]:
low_turnout = ['Wyoming', 'Alaska', 'Vermont', 'Delaware', 'South Dakota', 'North Dakota', 'Hawaii', 'Rhode Island']

low = hm_model_ready2[hm_model_ready2['STATE'].isin(low_turnout)].drop('Third Party Votes',axis = 1)

In [None]:
low.describe()

**Splitting X&y**

In [None]:
ylow = low['GENERAL VOTES'] 
Xlow = low.drop('GENERAL VOTES',axis = 1)

**T-T-S**

In [None]:
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(Xlow, ylow, test_size=0.3)

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_l.dtypes != np.float)[0]
train_pool = Pool(X_train_l, y_train_l, cat_features=categorical_features_indices)

Cat_l = Catset.fit(X_train_l, y_train_l, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
#Would like to get it down to less than 20,000...
Cat_l.score(X_test_l, y_test_l)

### Pipeline for Prediction

In [215]:
Cat.predict([['10000', '20000', 'Alaska', 'R']])

array([115846.57901672])

In [354]:
#Read in CSV
Primaries_2018 = pd.read_csv('Primary_Results_Predicting_Aug.csv',encoding = "ISO-8859-1")

In [355]:
#Assigning a year
Primaries_2018['YEAR'] = '2018'

In [356]:
#IDs to merge on later
Primaries_2018["ID"] = Primaries_2018["State"].map(str) + Primaries_2018["YEAR"] +Primaries_2018["Party"]

In [357]:
Primaries_party = Primaries_2018[Primaries_2018['Party'].isin(['R','D'])]

In [358]:
Primaries_party

Unnamed: 0,Date,State,Candidate Name,Number of Votes,Percentage,(I),Party,YEAR,ID
0,6-Mar,Texas,Beto O'Rourke,641324.0,0.62,0.0,D,2018,Texas2018D
1,6-Mar,Texas,Sema Hernandez,246308.0,23.7,0.0,D,2018,Texas2018D
2,6-Mar,Texas,Edward Kimbrough,150147.0,14.5,0.0,D,2018,Texas2018D
3,6-Mar,Texas,Ted Cruz*,1317450.0,0.85,1.0,R,2018,Texas2018R
4,6-Mar,Texas,Mary Miller,94451.0,6.1,0.0,R,2018,Texas2018R
5,6-Mar,Texas,Bruce Jacobson,64604.0,4.2,0.0,R,2018,Texas2018R
6,6-Mar,Texas,Stefano de Stefano,44327.0,2.9,0.0,R,2018,Texas2018R
7,6-Mar,Texas,Geraldine Sam,22842.0,1.5,0.0,R,2018,Texas2018R
8,8-May,Indiana,Joe Donelly,0.0,0,1.0,D,2018,Indiana2018D
9,8-May,Indiana,Mike Braun,208505.0,0.41,0.0,R,2018,Indiana2018R


** Totaling the Number of votes in primary **

In [359]:
primaries_partytotal = Primaries_party[['ID','Number of Votes']]

primaries_partytotal = primaries_partytotal.dropna()

primaries_partytotal["Number of Votes"]= primaries_partytotal["Number of Votes"].astype(float)

primaries_partytotal = primaries_partytotal.groupby(['ID']).sum().reset_index()

primaries_partytotal = primaries_partytotal.rename(columns = {'Number of Votes':'Total Party Votes'})

In [360]:
primaries_partytotal

Unnamed: 0,ID,Total Party Votes
0,Arizona2018D,390236.0
1,Arizona2018R,498804.0
2,California2018D,2876479.0
3,California2018R,1573388.0
4,Connecticut2018D,0.0
5,Connecticut2018R,130155.0
6,Florida2018D,0.0
7,Florida2018R,1639588.0
8,Hawaii2018D,0.0
9,Hawaii2018R,26826.0


** Counting the Number of Candidates **

In [361]:
NumRunning_p = Primaries_party.groupby(['ID']).count().reset_index()

NumRunning_p = NumRunning_p[['ID','Number of Votes']]

NumRunning_p = NumRunning_p.rename(columns = {'Number of Votes':'Candidate Count'})

In [362]:
NumRunning_p

Unnamed: 0,ID,Candidate Count
0,Arizona2018D,2
1,Arizona2018R,3
2,California2018D,10
3,California2018R,11
4,Connecticut2018D,1
5,Connecticut2018R,2
6,Florida2018D,1
7,Florida2018R,2
8,Hawaii2018D,1
9,Hawaii2018R,8


** Inumbent Log **

In [363]:
Incumbents = Primaries_party.groupby(['ID']).sum().reset_index()

In [364]:
Incumbents = Incumbents[['ID','(I)']]

** Max votes per party per state **

In [365]:
PartyWin_p = Primaries_party[['ID','Number of Votes','State','YEAR','Party']]

PartyWin_p = PartyWin_p.dropna()

PartyWin_p["Number of Votes"]= PartyWin_p["Number of Votes"].astype(float)

PartyWin_p = PartyWin_p.groupby(['ID']).max().reset_index()

PartyWin_p = PartyWin_p.rename(columns = {'Number of Votes':'Primary Winner'})

In [366]:
PartyWin_p

Unnamed: 0,ID,Primary Winner,State,YEAR,Party
0,Arizona2018D,314108.0,Arizona,2018,D
1,Arizona2018R,263734.0,Arizona,2018,R
2,California2018D,2031967.0,California,2018,D
3,California2018R,398477.0,California,2018,R
4,Connecticut2018D,0.0,Connecticut,2018,D
5,Connecticut2018R,99624.0,Connecticut,2018,R
6,Florida2018D,0.0,Florida,2018,D
7,Florida2018R,1452952.0,Florida,2018,R
8,Hawaii2018D,0.0,Hawaii,2018,D
9,Hawaii2018R,6365.0,Hawaii,2018,R


** Merging columns back together **

In [367]:
votes_merged_p = primaries_partytotal.merge(NumRunning_p, left_on='ID', right_on='ID', how='left')

votes_merged_p = votes_merged_p.merge(PartyWin_p, left_on='ID', right_on='ID', how = 'left')

votes_merged_p = votes_merged_p.merge(Incumbents, left_on='ID', right_on='ID', how ='left')

In [368]:
votes_merged_p

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I)
0,Arizona2018D,390236.0,2,314108.0,Arizona,2018,D,0.0
1,Arizona2018R,498804.0,3,263734.0,Arizona,2018,R,0.0
2,California2018D,2876479.0,10,2031967.0,California,2018,D,1.0
3,California2018R,1573388.0,11,398477.0,California,2018,R,0.0
4,Connecticut2018D,0.0,1,0.0,Connecticut,2018,D,1.0
5,Connecticut2018R,130155.0,2,99624.0,Connecticut,2018,R,0.0
6,Florida2018D,0.0,1,0.0,Florida,2018,D,1.0
7,Florida2018R,1639588.0,2,1452952.0,Florida,2018,R,0.0
8,Hawaii2018D,0.0,1,0.0,Hawaii,2018,D,0.01
9,Hawaii2018R,26826.0,8,6365.0,Hawaii,2018,R,0.0


** Counting total votes by loser **

In [369]:
votes_merged_p['Primary Loser Vote'] = votes_merged_p['Total Party Votes'] - votes_merged_p['Primary Winner']

In [370]:
votes_merged_p

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I),Primary Loser Vote
0,Arizona2018D,390236.0,2,314108.0,Arizona,2018,D,0.0,76128.0
1,Arizona2018R,498804.0,3,263734.0,Arizona,2018,R,0.0,235070.0
2,California2018D,2876479.0,10,2031967.0,California,2018,D,1.0,844512.0
3,California2018R,1573388.0,11,398477.0,California,2018,R,0.0,1174911.0
4,Connecticut2018D,0.0,1,0.0,Connecticut,2018,D,1.0,0.0
5,Connecticut2018R,130155.0,2,99624.0,Connecticut,2018,R,0.0,30531.0
6,Florida2018D,0.0,1,0.0,Florida,2018,D,1.0,0.0
7,Florida2018R,1639588.0,2,1452952.0,Florida,2018,R,0.0,186636.0
8,Hawaii2018D,0.0,1,0.0,Hawaii,2018,D,0.01,0.0
9,Hawaii2018R,26826.0,8,6365.0,Hawaii,2018,R,0.0,20461.0


** Adding if party is in the white house **

In [371]:
OP2018 = []

for party in votes_merged_p['Party']:
    if party == 'R':
        OP2018.append(1)
        
    else:
        OP2018.append(0)
    
votes_merged_p['OFFICE PARTY'] = OP2018

** Seperating out Unopposed **

In [372]:
Unopp2018 = votes_merged_p.drop(votes_merged_p[votes_merged_p['Primary Winner'] > 1].index)

In [373]:
Unopp2018

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I),Primary Loser Vote,OFFICE PARTY
4,Connecticut2018D,0.0,1,0.0,Connecticut,2018,D,1.0,0.0,0
6,Florida2018D,0.0,1,0.0,Florida,2018,D,1.0,0.0,0
8,Hawaii2018D,0.0,1,0.0,Hawaii,2018,D,0.01,0.0,0
10,Indiana2018D,0.0,1,0.0,Indiana,2018,D,1.0,0.0,0
12,Maine2018D,0.0,1,0.0,Maine,2018,D,0.0,0.0,0
13,Maine2018R,0.0,1,0.0,Maine,2018,R,0.0,0.0,1
16,Michigan2018D,0.0,1,0.0,Michigan,2018,D,1.0,0.0,0
24,Montana2018D,0.0,1,0.0,Montana,2018,D,1.0,0.0,0
32,New Mexico2018D,0.0,1,0.0,New Mexico,2018,D,1.0,0.0,0
33,New Mexico2018R,0.0,1,0.0,New Mexico,2018,R,0.0,0.0,1


** The races we can do estimates for **

In [374]:
fin2018_Aug = votes_merged_p.drop(votes_merged_p[votes_merged_p['Primary Winner'] < 1].index)

In [375]:
fin2018_Aug

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I),Primary Loser Vote,OFFICE PARTY
0,Arizona2018D,390236.0,2,314108.0,Arizona,2018,D,0.0,76128.0,0
1,Arizona2018R,498804.0,3,263734.0,Arizona,2018,R,0.0,235070.0,1
2,California2018D,2876479.0,10,2031967.0,California,2018,D,1.0,844512.0,0
3,California2018R,1573388.0,11,398477.0,California,2018,R,0.0,1174911.0,1
5,Connecticut2018R,130155.0,2,99624.0,Connecticut,2018,R,0.0,30531.0,1
7,Florida2018R,1639588.0,2,1452952.0,Florida,2018,R,0.0,186636.0,1
9,Hawaii2018R,26826.0,8,6365.0,Hawaii,2018,R,0.0,20461.0,1
11,Indiana2018R,506492.0,3,208505.0,Indiana,2018,R,0.0,297987.0,1
14,Maryland2018D,560477.0,8,450890.0,Maryland,2018,D,1.0,109587.0,0
15,Maryland2018R,169047.0,11,49428.0,Maryland,2018,R,0.0,119619.0,1


** Running those estimates!!! ** <3

In [376]:
fin2018_Aug = fin2018_Aug.rename(index=str, columns={"State": "STATE", "Party": "PARTY","OFFICE PARTY":'Party In House'})

In [346]:
# Used in prediction

X_trainc2.head(5)

Unnamed: 0,Primary Winner,Primary Loser Vote,STATE,PARTY
362,213753.0,2.137531e+19,Kentucky,R
150,2566298.0,0.0,California,D
181,544830.0,5.448301e+16,Missouri,D
113,163817.0,1.638172e+20,New Jersey,R
306,206986.0,2.069861e+27,Kentucky,R


In [377]:
#We have to re-order so the columns are structured the same way as the input. 
fin2018_Aug = fin2018_Aug[['Primary Winner', 'Primary Loser Vote', 'STATE', 'PARTY']]

In [378]:
fin2018_Aug['Predicted Turnout'] = Cat.predict(fin2018_Aug)

In [379]:
fin2018_Aug

Unnamed: 0,Primary Winner,Primary Loser Vote,STATE,PARTY,Predicted Turnout
0,314108.0,76128.0,Arizona,D,1234519.0
1,263734.0,235070.0,Arizona,R,1123673.0
2,2031967.0,844512.0,California,D,4926918.0
3,398477.0,1174911.0,California,R,1455552.0
5,99624.0,30531.0,Connecticut,R,349934.1
7,1452952.0,186636.0,Florida,R,3808680.0
9,6365.0,20461.0,Hawaii,R,88189.69
11,208505.0,297987.0,Indiana,R,1066640.0
14,450890.0,109587.0,Maryland,D,1410571.0
15,49428.0,119619.0,Maryland,R,577706.1


In [380]:
fin2018_Aug.to_csv('Fin2018_Aug.csv')