In [1]:
import pandas as pd
import csv
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
%matplotlib inline
# from paramsearch import paramsearch
from itertools import product,chain

### Data Aggregation

#### Prepping Files 

Reading the CSV files

In [2]:
hm2016 = pd.read_csv('federalelections2016 (2).csv')
hm2014 = pd.read_csv('federalelections2014.csv')
hm2012 = pd.read_csv('Senate_Primary_2012.csv')
hm2010 = pd.read_csv('Primary2010.csv')
hm2008 = pd.read_csv('Primary2008.csv')
hm2006 = pd.read_csv('Primary2006.csv')
hm2004 = pd.read_csv('FederalElection2004.csv')


### Add back incumbent column!!

**Adding the year to each CSV for easier analysis**

In [3]:
hm2016['YEAR'] = '2016'
hm2014['YEAR'] = '2014'
hm2012['YEAR'] = '2012'
hm2010['YEAR'] = '2010'
hm2008['YEAR'] = '2008'
hm2006['YEAR'] = '2006'
hm2004['YEAR'] = '2004'

**Creating IDs I will later merge on**

In [4]:
hm2016["ID"] = hm2016["STATE"].map(str) + hm2016["YEAR"] +hm2016["PARTY"]
hm2016["ID2"] = hm2016["STATE"].map(str) + hm2016["YEAR"]

hm2014["ID"] = hm2014["STATE"].map(str) + hm2014["YEAR"] +hm2014["PARTY"]
hm2014["ID2"] = hm2014["STATE"].map(str) + hm2014["YEAR"]

hm2012["ID"] = hm2012["STATE"].map(str) + hm2012["YEAR"] +hm2012["PARTY"]
hm2012["ID2"] = hm2012["STATE"].map(str) + hm2012["YEAR"]

hm2010["ID"] = hm2010["STATE"].map(str) + hm2010["YEAR"] +hm2010["PARTY"]
hm2010["ID2"] = hm2010["STATE"].map(str) + hm2010["YEAR"]

hm2008["ID"] = hm2008["STATE"].map(str) + hm2008["YEAR"] +hm2008["PARTY"]
hm2008["ID2"] = hm2008["STATE"].map(str) + hm2008["YEAR"]

hm2006["ID"] = hm2006["STATE"].map(str) + hm2006["YEAR"] +hm2006["PARTY"]
hm2006["ID2"] = hm2006["STATE"].map(str) + hm2006["YEAR"]

hm2004["ID"] = hm2004["STATE"].map(str) + hm2004["YEAR"] +hm2004["PARTY"]
hm2004["ID2"] = hm2004["STATE"].map(str) + hm2004["YEAR"]


**Dropping weird columns & fixing weird merging errors**

In [5]:
hm2014 = hm2014.drop('Unnamed: 8', axis=1).drop('Unnamed: 9',axis=1)

In [None]:
# hm2012 = hm2012.drop('Unnamed: 9', axis=1)

In [None]:
# hm2010.columns = [col.strip() for col in hm2010.columns]

In [6]:
hm2012.columns = [col.strip() for col in hm2012.columns]

In [7]:
hm2014.columns = [col.strip() for col in hm2014.columns]

In [8]:
hm2016.columns = [col.strip() for col in hm2016.columns]

**Append files together**

In [9]:
hm = hm2016.append(hm2012).append(hm2004).append(hm2006).append(hm2008).append(hm2010).append(hm2014)

In [None]:
# ## Want to make "Incument" versus not a binary count.  
# hm = hm.replace(to_replace = 'FULL TERM',value='(I)').replace(to_replace = 'UNEXPIRED TERM ',value='(I)').replace(to_replace=' (I)',value='(I)').replace(to_replace='UNEXPIRED TERM',value='(I)')

#### Party data

**Removing all third party candidates**

In [10]:
hm = hm.replace(to_replace = 'R*',value='R').replace(to_replace = 'R ',value='R').replace(to_replace = 'D*',value='D').replace(to_replace = 'D* ',value='D').replace(to_replace = 'D ',value='D')

In [11]:
hm_party = hm[hm['PARTY'].isin(['R','D'])]

**Throwing out rows that are interupting analysis**

In [12]:
hm_party2 = hm_party[~hm_party['PRIMARY VOTES'].isin(['Unopposed','*','#','14*','20*','1,040*','1,616**','Loser','Winner','Withdrew','Unoppsed'])]

In [13]:
hm_party2 = hm_party2[~hm_party2['TOTAL VOTES'].isin(['Party Votes:','Total State Votes:'])]

In [14]:
hm_party2 = hm_party2[~hm_party2['ID'].isin(['South Carolina2014R','Wyoming2008R'])]

**Find total votes for each party, each state **

In [None]:
#Didn't work :( It's missing a few states.
# hm_partytotal = hm_party[hm_party['TOTAL VOTES'].isin(['Party Votes:'])]
# hm_partytotal = hm_partytotal[['ID','PRIMARY VOTES']]

In [15]:
hm_partytotal = hm_party2[['ID','PRIMARY VOTES']]

In [16]:
hm_partytotal = hm_partytotal.dropna()

In [17]:
hm_partytotal["PRIMARY VOTES"]=hm_partytotal["PRIMARY VOTES"].astype(float)

In [18]:
hm_partytotal = hm_partytotal.groupby(['ID']).sum().reset_index()

In [19]:
hm_partytotal = hm_partytotal.rename(columns = {'PRIMARY VOTES':'Total Party Votes'})

**Finding How many people ran in the primary**

In [20]:
NumRunning = hm_party2.groupby(['ID']).count().reset_index()

In [21]:
NumRunning = NumRunning[['ID','PRIMARY VOTES','(I)']]

In [22]:
NumRunning['(I)'].value_counts()

0    259
1    145
Name: (I), dtype: int64

In [23]:
NumRunning = NumRunning.rename(columns = {'PRIMARY VOTES':'Candidate Count'})

**Finding max vote for each party, each state**

In [24]:
PartyWin = hm_party2[['ID','PRIMARY VOTES','STATE','YEAR','PARTY']]

In [25]:
PartyWin = PartyWin.dropna()

In [26]:
PartyWin["PRIMARY VOTES"]= PartyWin["PRIMARY VOTES"].astype(float)

In [27]:
PartyWin = PartyWin.groupby(['ID']).max().reset_index()

In [28]:
PartyWin = PartyWin.rename(columns = {'PRIMARY VOTES':'Primary Winner'})

-Group by state and party
-Tuple of the candidates aggregated 
-Winner, candidates, number of columns


**Merging Columns Back Together**

In [29]:
votes_merged = hm_partytotal.merge(NumRunning, left_on='ID', right_on='ID', how='left')

In [30]:
votes_merged = votes_merged.merge(PartyWin, left_on='ID', right_on='ID', how = 'left')

**Calcuating the total number of votes by losers**

In [31]:
votes_merged['Primary Loser Vote'] = votes_merged['Total Party Votes'] - votes_merged['Primary Winner']

#### Final Results Data

Creating column with final election results numbers

In [32]:
hm_finals = hm[hm['PARTY'].isin(['R','D'])]

In [33]:
hm_finals = hm_finals[['GENERAL VOTES','ID','ID2']]

In [34]:
hm_finals = hm_finals.dropna()

#### 3rd Party Data #Not Including this anymore

Creating column with how many people showed up to vote for the third party candidate in the primaries. 

In [35]:
hm_third = hm[~hm['PARTY'].isin(['R','D'])]

In [36]:
hm_third = hm_third[hm_third['TOTAL VOTES'].isin(['Party Votes:'])]

In [37]:
hm_third = hm_third[['PRIMARY VOTES','ID2']]

In [38]:
hm_third = hm_third.groupby(['ID2'])
hm_third = hm_third['PRIMARY VOTES'].agg([np.sum]).reset_index()

In [39]:
hm_third = hm_third.rename(columns = {'sum':'Third Party Votes'})

#### Merging

Merging the final results data and the third party data to the original dataframe.

In [40]:
hm_merged = hm_finals.merge(votes_merged, left_on='ID', right_on='ID', how='left')

In [None]:
# hm_merged = hm_merged.merge(hm_third, left_on='ID2', right_on='ID2', how='left')

In [None]:
# hm_merged['Third Party Votes'] = hm_merged['Third Party Votes'].fillna(value=0)

** Throwing out NaNs for races that didn't have primary data**

In [41]:
hm_merged = hm_merged.dropna()

**Making a loop for whether or not a candidate's party is in the oval office.**

In [42]:
Republican = ['2008','2006','2004','2002','1992','1990']

OfficeParty = []


for year in hm_merged['YEAR']:
    if year in Republican:
        OfficeParty.append('R')
        
    else:
        OfficeParty.append('D')
    
hm_merged['OFFICE PARTY'] = OfficeParty

** Create Column for whether or not the party is in office**

In [43]:
hm_merged['Party In House'] = np.where(hm_merged['PARTY']== hm_merged['OFFICE PARTY'], 1,0)

In [44]:
hm_model_ready = hm_merged.drop('ID',axis=1).drop('ID2',axis=1).drop('OFFICE PARTY',axis=1)

In [45]:
# hm_model_ready = hm_merged.drop('ID',axis=1).drop('ID2',axis=1).drop('Total Party Votes',axis=1).drop('YEAR',axis=1).drop('OFFICE PARTY',axis=1)

In [46]:
hm_model_ready = hm_model_ready[~hm_model_ready['GENERAL VOTES'].isin(['#'])]

In [47]:
hm_model_ready

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
0,1.3351e+06,778851.0,5.0,1.0,505586.0,Alabama,2016,R,273265.0,0
1,748709,274423.0,2.0,0.0,153897.0,Alabama,2016,D,120526.0,1
2,138149,55293.0,4.0,1.0,39545.0,Alaska,2016,R,15748.0,0
3,36200,25318.0,2.0,0.0,15228.0,Alaska,2016,D,10090.0,1
4,1.35927e+06,591155.0,4.0,1.0,302532.0,Arizona,2016,R,288623.0,0
5,1.03124e+06,333586.0,1.0,0.0,333586.0,Arizona,2016,D,0.0,1
6,661984,389834.0,2.0,1.0,298039.0,Arkansas,2016,R,91795.0,0
8,7.54276e+06,4811300.0,7.0,0.0,3000689.0,California,2016,D,1810611.0,1
9,4.70142e+06,4811300.0,7.0,0.0,3000689.0,California,2016,D,1810611.0,1
10,1.37071e+06,262344.0,1.0,1.0,262344.0,Colorado,2016,D,0.0,1


In [48]:
hm_model_ready["GENERAL VOTES"]= hm_model_ready["GENERAL VOTES"].astype(float)
# hm_model_ready["Third Party Votes"]= hm_model_ready["Third Party Votes"].astype(float)
hm_model_ready["Party In House"]= hm_model_ready["Party In House"].astype(float)

### Modeling (All 50 States)

#### EDA & Futhur Cleaning

In [49]:
hm_model_ready.describe()

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,Primary Loser Vote,Party In House
count,337.0,337.0,337.0,337.0,337.0,337.0,337.0
mean,1121743.0,424836.8,3.424332,0.37092,295909.1,128927.7,0.492582
std,1195753.0,560771.9,2.572936,0.483769,395804.4,223326.1,0.500688
min,29377.0,6934.0,1.0,0.0,6110.0,0.0,0.0
25%,286409.0,101728.0,2.0,0.0,70424.0,3773.0,0.0
50%,806787.0,247902.0,3.0,0.0,166627.0,46028.0,0.0
75%,1479471.0,561952.0,5.0,1.0,389613.0,153120.0,1.0
max,7864624.0,4811300.0,18.0,1.0,3000689.0,1810611.0,1.0


In [50]:
hm_model_ready.sort_values(by=['GENERAL VOTES'])

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
414,29377.0,14870.0,4.0,0.0,7200.0,Wyoming,2014,D,7670.0,1.0
3,36200.0,25318.0,2.0,0.0,15228.0,Alaska,2016,D,10090.0,1.0
141,53019.0,16850.0,3.0,0.0,9173.0,Wyoming,2012,D,7677.0,1.0
273,57671.0,24924.0,1.0,0.0,24924.0,Wyoming,2006,D,0.0,0.0
277,60045.0,30287.0,3.0,0.0,18035.0,Alaska,2010,D,12252.0,1.0
250,64417.0,40647.0,1.0,0.0,40647.0,North Dakota,2006,R,0.0,1.0
219,69734.0,11881.0,2.0,0.0,6110.0,Delaware,2006,R,5771.0,1.0
335,72699.0,22750.0,1.0,0.0,22750.0,Vermont,2010,R,0.0,0.0
131,72898.0,8442.0,2.0,0.0,6358.0,Vermont,2012,R,2084.0,0.0
206,75398.0,13364.0,3.0,0.0,9591.0,Vermont,2004,R,3773.0,1.0


In [51]:
#Looks like there are some rogue additional states...
len(hm_model_ready['STATE'].value_counts())

51

In [52]:
hm_model_ready['STATE'].unique()

##Virgin Islands not in state list
##Missouri counted twice due to space error

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois',
       'Indiana', 'Iowa', 'Kentucky', 'Maryland', 'Missouri', 'Nevada',
       'New Hampshire', 'North Carolina', 'North Dakota', 'Ohio',
       'Oregon', 'Pennsylvania', 'Utah', 'Vermont', 'Washington',
       'Wisconsin', 'Connecticut', 'Delaware', 'Maine', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri ', 'Montana',
       'Nebraska', 'New Jersey', 'New Mexico', 'New York', 'Rhode Island',
       'Tennessee', 'Texas', 'Virginia', 'West Virginia', 'Wyoming',
       'Kansas', 'Oklahoma', 'South Carolina', 'Louisiana',
       'South Dakota'], dtype=object)

In [53]:
#Dropping VI
hm_model_ready2 = hm_model_ready[~hm_model_ready['STATE'].isin(['Virgin Islands'])]

In [54]:
#Removing spaces

hm_model_ready2 = hm_model_ready2.replace(to_replace = 'Missouri ',value='Missouri')

In [55]:
len(hm_model_ready2['STATE'].unique())

50

In [56]:
hm_model_ready2['(I)'].value_counts()

0.0    212
1.0    125
Name: (I), dtype: int64

In [57]:
hm_model_ready2.groupby(['STATE']).sum().sort_values(['GENERAL VOTES'])

Unnamed: 0_level_0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,Primary Loser Vote,Party In House
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
South Dakota,140741.0,74498.0,5.0,0.0,41377.0,33121.0,0.0
Delaware,557025.0,143109.0,8.0,1.0,98441.0,44668.0,2.0
Wyoming,582045.0,311266.0,17.0,3.0,270989.0,40277.0,3.0
North Dakota,771729.0,350801.0,6.0,1.0,316588.0,34213.0,2.0
Alaska,880306.0,460223.0,23.0,3.0,318109.0,142114.0,4.0
Vermont,970052.0,278217.0,14.0,3.0,252712.0,25505.0,4.0
Kansas,1091200.0,438904.0,4.0,1.0,347891.0,91013.0,1.0
Rhode Island,1118726.0,339786.0,9.0,3.0,294531.0,45255.0,3.0
Louisiana,1191987.0,207481.0,6.0,1.0,163034.0,44447.0,1.0
Montana,1212195.0,641718.0,18.0,3.0,489773.0,151945.0,3.0


**Seeing if we throw out the rest of the unopposed if our score gets better...**
It doesn't!

In [None]:
# moreopp = hm_model_ready2[hm_model_ready2['Primary Loser Vote'] == 0]

In [None]:
# #Dropping Unnopposed to see if it makes it better...
# hm_model_ready2 = hm_model_ready2[~hm_model_ready['Primary Loser Vote'].isin([0])]

#### Catboost (With All Inputs Included) ***THIS ONE PREVAILS!! :)

** Splitting x&y **

In [58]:
hm_model_ready2

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
0,1335104.0,778851.0,5.0,1.0,505586.0,Alabama,2016,R,273265.0,0.0
1,748709.0,274423.0,2.0,0.0,153897.0,Alabama,2016,D,120526.0,1.0
2,138149.0,55293.0,4.0,1.0,39545.0,Alaska,2016,R,15748.0,0.0
3,36200.0,25318.0,2.0,0.0,15228.0,Alaska,2016,D,10090.0,1.0
4,1359267.0,591155.0,4.0,1.0,302532.0,Arizona,2016,R,288623.0,0.0
5,1031245.0,333586.0,1.0,0.0,333586.0,Arizona,2016,D,0.0,1.0
6,661984.0,389834.0,2.0,1.0,298039.0,Arkansas,2016,R,91795.0,0.0
8,7542759.0,4811300.0,7.0,0.0,3000689.0,California,2016,D,1810611.0,1.0
9,4701417.0,4811300.0,7.0,0.0,3000689.0,California,2016,D,1810611.0,1.0
10,1370710.0,262344.0,1.0,1.0,262344.0,Colorado,2016,D,0.0,1.0


In [169]:
ycat = hm_model_ready2['GENERAL VOTES'] 
Xcat = hm_model_ready2.drop('GENERAL VOTES',axis = 1)

** Splitting for T-T-S **

In [170]:
X_trainc, X_testc, y_trainc, y_testc = train_test_split(Xcat, ycat, test_size=0.3, random_state=42)

In [171]:
X_trainc.columns

Index(['Total Party Votes', 'Candidate Count', '(I)', 'Primary Winner',
       'STATE', 'YEAR', 'PARTY', 'Primary Loser Vote', 'Party In House'],
      dtype='object')

** Running Catboost **

In [172]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_trainc.dtypes != np.float)[0]
train_pool = Pool(X_trainc, y_trainc, cat_features=categorical_features_indices)
test_pool = Pool(X_testc, y_testc, cat_features=categorical_features_indices)


Catset=CatBoostRegressor(learning_rate = 0.01, depth = 10)
Cat = Catset.fit(X_trainc, y_trainc, cat_features=categorical_features_indices, plot=True, use_best_model=True)

You should provide test set for use best model. use_best_model parameter swiched to false value.


0:	learn: 1686834.8391308	total: 14.7ms	remaining: 14.7s
1:	learn: 1674869.7549455	total: 17.2ms	remaining: 8.58s
2:	learn: 1663986.3509863	total: 47.9ms	remaining: 15.9s
3:	learn: 1655140.2014902	total: 74ms	remaining: 18.4s
4:	learn: 1644948.2314753	total: 87.1ms	remaining: 17.3s
5:	learn: 1633024.0415823	total: 90.8ms	remaining: 15s
6:	learn: 1623073.0224543	total: 101ms	remaining: 14.4s
7:	learn: 1612080.8830805	total: 103ms	remaining: 12.8s
8:	learn: 1600380.3156420	total: 106ms	remaining: 11.7s
9:	learn: 1590739.2289843	total: 111ms	remaining: 11s
10:	learn: 1580853.7157839	total: 136ms	remaining: 12.2s
11:	learn: 1569956.4167775	total: 137ms	remaining: 11.3s
12:	learn: 1559610.2344647	total: 140ms	remaining: 10.6s
13:	learn: 1548889.6310343	total: 144ms	remaining: 10.1s
14:	learn: 1538590.7810845	total: 148ms	remaining: 9.74s
15:	learn: 1529946.0206747	total: 175ms	remaining: 10.8s
16:	learn: 1519848.4614129	total: 178ms	remaining: 10.3s
17:	learn: 1509501.8871486	total: 180ms	r

160:	learn: 757541.3962834	total: 1000ms	remaining: 5.21s
161:	learn: 755705.8494494	total: 1s	remaining: 5.18s
162:	learn: 753080.8008117	total: 1s	remaining: 5.15s
163:	learn: 750342.2118700	total: 1.01s	remaining: 5.13s
164:	learn: 748337.5976156	total: 1.01s	remaining: 5.12s
165:	learn: 745842.5692011	total: 1.01s	remaining: 5.1s
166:	learn: 743398.3977349	total: 1.02s	remaining: 5.08s
167:	learn: 741454.5956019	total: 1.02s	remaining: 5.06s
168:	learn: 739350.0092568	total: 1.03s	remaining: 5.06s
169:	learn: 737567.0548688	total: 1.03s	remaining: 5.04s
170:	learn: 735877.5776974	total: 1.03s	remaining: 5.02s
171:	learn: 733873.2885115	total: 1.04s	remaining: 4.99s
172:	learn: 731788.8765130	total: 1.04s	remaining: 4.97s
173:	learn: 729556.8702995	total: 1.06s	remaining: 5.05s
174:	learn: 727983.4296636	total: 1.07s	remaining: 5.03s
175:	learn: 725739.6490891	total: 1.09s	remaining: 5.1s
176:	learn: 723727.8016844	total: 1.09s	remaining: 5.07s
177:	learn: 721744.4697494	total: 1.09

311:	learn: 560024.5016987	total: 1.99s	remaining: 4.38s
312:	learn: 559575.4130647	total: 1.99s	remaining: 4.36s
313:	learn: 558644.7151699	total: 2s	remaining: 4.37s
314:	learn: 558183.9948555	total: 2s	remaining: 4.36s
315:	learn: 557794.2862766	total: 2s	remaining: 4.34s
316:	learn: 557333.5158635	total: 2.01s	remaining: 4.33s
317:	learn: 556912.8874055	total: 2.01s	remaining: 4.31s
318:	learn: 556348.8345515	total: 2.02s	remaining: 4.3s
319:	learn: 555939.5032065	total: 2.02s	remaining: 4.3s
320:	learn: 555561.4687293	total: 2.02s	remaining: 4.28s
321:	learn: 554764.9937763	total: 2.03s	remaining: 4.28s
322:	learn: 554017.1475592	total: 2.04s	remaining: 4.27s
323:	learn: 553370.3491417	total: 2.04s	remaining: 4.26s
324:	learn: 552675.1248948	total: 2.04s	remaining: 4.24s
325:	learn: 551929.2042077	total: 2.05s	remaining: 4.23s
326:	learn: 551405.1662167	total: 2.05s	remaining: 4.22s
327:	learn: 550310.5916044	total: 2.07s	remaining: 4.25s
328:	learn: 549985.4730999	total: 2.07s	re

482:	learn: 483653.1962987	total: 2.8s	remaining: 3s
483:	learn: 483386.4163451	total: 2.81s	remaining: 2.99s
484:	learn: 483194.9932935	total: 2.81s	remaining: 2.98s
485:	learn: 482978.3912335	total: 2.81s	remaining: 2.97s
486:	learn: 482779.8785932	total: 2.81s	remaining: 2.96s
487:	learn: 482322.8539555	total: 2.81s	remaining: 2.95s
488:	learn: 482099.9220908	total: 2.81s	remaining: 2.94s
489:	learn: 482036.5693686	total: 2.82s	remaining: 2.93s
490:	learn: 481524.0875336	total: 2.83s	remaining: 2.93s
491:	learn: 481060.9834763	total: 2.85s	remaining: 2.95s
492:	learn: 480868.9031481	total: 2.86s	remaining: 2.94s
493:	learn: 480686.6368433	total: 2.86s	remaining: 2.93s
494:	learn: 480638.4992258	total: 2.86s	remaining: 2.92s
495:	learn: 480520.8123816	total: 2.87s	remaining: 2.91s
496:	learn: 479550.5054370	total: 2.89s	remaining: 2.92s
497:	learn: 479169.8369259	total: 2.9s	remaining: 2.92s
498:	learn: 478991.8752496	total: 2.9s	remaining: 2.91s
499:	learn: 478653.2898560	total: 2.9

628:	learn: 451680.3059375	total: 3.47s	remaining: 2.05s
629:	learn: 451583.1713961	total: 3.48s	remaining: 2.04s
630:	learn: 451480.0519136	total: 3.48s	remaining: 2.03s
631:	learn: 451339.0860919	total: 3.48s	remaining: 2.03s
632:	learn: 451244.7962383	total: 3.48s	remaining: 2.02s
633:	learn: 451137.0130766	total: 3.49s	remaining: 2.01s
634:	learn: 450554.6510617	total: 3.5s	remaining: 2.01s
635:	learn: 450319.3113377	total: 3.5s	remaining: 2s
636:	learn: 450249.9508727	total: 3.5s	remaining: 2s
637:	learn: 450109.2843045	total: 3.5s	remaining: 1.99s
638:	learn: 450055.3428498	total: 3.51s	remaining: 1.98s
639:	learn: 449562.6758108	total: 3.52s	remaining: 1.98s
640:	learn: 449476.5505052	total: 3.52s	remaining: 1.97s
641:	learn: 449389.5293999	total: 3.53s	remaining: 1.97s
642:	learn: 449243.5114935	total: 3.53s	remaining: 1.96s
643:	learn: 448942.5153317	total: 3.54s	remaining: 1.95s
644:	learn: 448877.9668287	total: 3.54s	remaining: 1.95s
645:	learn: 448581.3786542	total: 3.55s	r

774:	learn: 422129.4107125	total: 4.53s	remaining: 1.32s
775:	learn: 421950.1240209	total: 4.54s	remaining: 1.31s
776:	learn: 421726.5861410	total: 4.56s	remaining: 1.31s
777:	learn: 421054.0388662	total: 4.58s	remaining: 1.3s
778:	learn: 420802.3362920	total: 4.59s	remaining: 1.3s
779:	learn: 420673.1075174	total: 4.61s	remaining: 1.3s
780:	learn: 420586.8316468	total: 4.61s	remaining: 1.29s
781:	learn: 420392.7629647	total: 4.61s	remaining: 1.29s
782:	learn: 420320.9128411	total: 4.62s	remaining: 1.28s
783:	learn: 420125.3980927	total: 4.64s	remaining: 1.28s
784:	learn: 420033.5853872	total: 4.65s	remaining: 1.27s
785:	learn: 419837.9958637	total: 4.67s	remaining: 1.27s
786:	learn: 419107.9911696	total: 4.68s	remaining: 1.27s
787:	learn: 418786.3167654	total: 4.7s	remaining: 1.26s
788:	learn: 418390.9128388	total: 4.72s	remaining: 1.26s
789:	learn: 418146.9735393	total: 4.72s	remaining: 1.25s
790:	learn: 417846.5971634	total: 4.75s	remaining: 1.25s
791:	learn: 417414.2326013	total: 4

923:	learn: 389306.4847614	total: 6.75s	remaining: 555ms
924:	learn: 389147.3385797	total: 6.77s	remaining: 549ms
925:	learn: 389106.6701232	total: 6.78s	remaining: 542ms
926:	learn: 389025.1875943	total: 6.79s	remaining: 535ms
927:	learn: 388907.4559555	total: 6.81s	remaining: 528ms
928:	learn: 388854.9290970	total: 6.81s	remaining: 521ms
929:	learn: 388809.5431961	total: 6.82s	remaining: 513ms
930:	learn: 388783.0904220	total: 6.82s	remaining: 506ms
931:	learn: 388537.3388487	total: 6.85s	remaining: 500ms
932:	learn: 388387.5050735	total: 6.87s	remaining: 493ms
933:	learn: 387903.1982496	total: 6.88s	remaining: 486ms
934:	learn: 387786.6386925	total: 6.89s	remaining: 479ms
935:	learn: 387637.7218999	total: 6.91s	remaining: 472ms
936:	learn: 387382.9321550	total: 6.96s	remaining: 468ms
937:	learn: 387353.4669339	total: 6.96s	remaining: 460ms
938:	learn: 387024.2086206	total: 6.99s	remaining: 454ms
939:	learn: 386973.2692485	total: 6.99s	remaining: 446ms
940:	learn: 386791.8022574	tota

In [173]:
## Checking out feature importance for this model. 

feature_importances = Cat.get_feature_importance(train_pool)
feature_names = X_trainc.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Primary Winner: 39.29902431385266
Total Party Votes: 34.00465299623439
STATE: 9.424936723126622
Primary Loser Vote: 6.230191089607111
YEAR: 5.535856997037768
Candidate Count: 2.799400385046335
PARTY: 1.1889688254486874
(I): 1.115670423411907
Party In House: 0.4012982462345353


In [174]:
Cat.eval_metrics(data = test_pool, metrics =['RMSE'], plot = True)

{'RMSE': [1479183.9496686796,
  1469548.2197407526,
  1461288.9297488981,
  1454962.0548141643,
  1447695.4210186359,
  1438172.404006072,
  1430640.385852376,
  1421532.867367484,
  1411911.4197335658,
  1404422.4097798879,
  1397504.9923403594,
  1388477.6094961783,
  1380168.3870066097,
  1372068.236575726,
  1363878.3016252744,
  1357584.6034325196,
  1349481.817424122,
  1341806.332491427,
  1333233.7597687228,
  1325263.5537552128,
  1317366.8961867993,
  1308935.859959126,
  1301667.2869278516,
  1295563.5265787311,
  1288031.6190931275,
  1281342.6108790352,
  1274301.606692177,
  1266658.1073281202,
  1259544.76488734,
  1252855.6929719327,
  1247002.4041735192,
  1240860.2465065476,
  1234167.5702961711,
  1227850.407033543,
  1221573.7857495688,
  1214724.7488982654,
  1208053.8104224186,
  1201309.9735693913,
  1194151.8467644332,
  1187242.2445590617,
  1180075.309362715,
  1173594.468210685,
  1168574.3650027036,
  1161879.818404147,
  1156364.56836729,
  1151120.71873140

In [175]:
Cat.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

{'R2': [0.005726788015900652,
  0.010353056874807809,
  0.014571136369901483,
  0.01842184938338609,
  0.023280851402255776,
  0.029194588624103623,
  0.033548696993303095,
  0.03814562870537175,
  0.04441323756198334,
  0.04866133331220157,
  0.052873619423950524,
  0.05779438113350854,
  0.062458283531345415,
  0.06753598271228611,
  0.07296208718875041,
  0.07739445435516024,
  0.0822550017839021,
  0.08646426463448009,
  0.0915813589681782,
  0.0965397362742787,
  0.1013535056517948,
  0.10705433677910725,
  0.1118523269969397,
  0.11613437863379672,
  0.12107435799966282,
  0.12519029047220254,
  0.13072317566545988,
  0.13621957669289753,
  0.14125483911854775,
  0.14535688669327307,
  0.14976161069421567,
  0.15418211970293938,
  0.15859504328139895,
  0.16362160873550258,
  0.1688126558006422,
  0.17391810679695474,
  0.1781235202017809,
  0.18315227401525047,
  0.18911098755185984,
  0.19405181431979557,
  0.19972744054624014,
  0.20483627807110827,
  0.20860480219449895,
  0.

In [176]:
Cat.score(X_testc,y_testc)

483101.7167516251

#### Catboost (Testing changing inputs)

**...Don't think it makes a difference because full list just crunches down unecessary inputs.**

In [None]:
hm_model_ready2.columns

In [None]:
ycat2 = hm_model_ready2['GENERAL VOTES'] 
Xcat2 = hm_model_ready2[['Primary Winner','Primary Loser Vote','STATE','PARTY']]

In [None]:
X_trainc2, X_testc2, y_trainc2, y_testc2 = train_test_split(Xcat2, ycat2, test_size=0.2, random_state=42)

In [None]:
categorical_features_indices = np.where(X_trainc2.dtypes != np.float)[0]
train_pool = Pool(X_trainc2, y_trainc2, cat_features=categorical_features_indices)
test_pool = Pool(X_testc2, y_testc2, cat_features=categorical_features_indices)


Cat = Catset.fit(X_trainc2, y_trainc2, cat_features=categorical_features_indices);

In [None]:
Cat.score(X_testc2, y_testc2)

In [None]:
Cat.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

#### Linear Regression

In [None]:
sns.pairplot(hm_model_ready, size = 1.2, aspect=1.5)

In [None]:
ylr = hm_model_ready['GENERAL VOTES'] 
Xlr = hm_model_ready[['Primary Winner','Primary Loser Vote','Party In House','(I)']]

In [None]:
Xlr

In [None]:
X_trainlr, X_testlr, y_trainlr, y_testlr = train_test_split(Xlr, ylr, test_size=0.3, random_state=42)

In [None]:
#Checking the predictability of the model with this alpha = 1
EN = ElasticNet(1, l1_ratio=.5)
LR = EN.fit(X_trainlr, y_trainlr)
rsq = LR.score(X_testlr, y_testlr)
adj_rsq = 1 - (1-rsq)*(len(ylr)-1)/(len(ylr)-Xlr.shape[1]-1)

preds = LR.predict(X_testlr)
rmse = np.sqrt(MSE(preds, y_testlr))

print(rsq)
print(adj_rsq)
print(rmse)


#### Linear Regression (Logged)

In [None]:
hm_model_log = hm_model_ready

hm_model_log['Primary Winner']=np.log(hm_model_log['Primary Winner'])
hm_model_log['Primary Loser Vote']=np.log(hm_model_log['Primary Loser Vote'])
hm_model_log['Third Party Votes']=np.log(hm_model_log['Third Party Votes'])
hm_model_log['GENERAL VOTES']=np.log(hm_model_log['GENERAL VOTES'])

In [None]:
ylr2 = hm_model_log['GENERAL VOTES'] 
Xlr2 = hm_model_log[['Primary Winner','Primary Loser Vote','Third Party Votes','Party In House','(I)']]

In [None]:
ylr2

In [None]:
Xlr2 = Xlr2.replace([np.inf, -np.inf], 0).fillna(0)


In [None]:
X_trainlr2, X_testlr2, y_trainlr2, y_testlr2 = train_test_split(Xlr2, ylr2, test_size=0.2, random_state=42)

In [None]:
#Checking the predictability of the model with this alpha = 1
EN = ElasticNet(1, l1_ratio=.5)
LR = EN.fit(X_trainlr2, y_trainlr2)
rsq = LR.score(X_testlr2, y_testlr2)
adj_rsq = 1 - (1-rsq)*(len(ylr2)-1)/(len(ylr2)-Xlr2.shape[1]-1)

preds = LR.predict(X_testlr2)
rmse = np.sqrt(MSE(preds, y_testlr2))

print(rsq)
print(adj_rsq)
print(rmse)

### Modeling (Splitting by state-size)

*Didn't end up helping much. Let's just go back to the one with the full train set*

**Grouped states by electoral college vote:**<br>

**Smallest:** <br>
- *3 Electoral votes:* Alaska, Delaware, Montana, North Dakota, South Dakota, Vermont, Wyoming
- *4 Electoral votes:* Hawaii, Idaho, Maine, New Hampshire, Rhode Island
- *5 Electoral votes:* Nebraska, New Mexico, West Virginia

**Medium:** <br>
- *6 Electoral votes:* Arkansas, Iowa ,Kansas, Mississippi, Nevada, Utah
- *7 Electoral votes:* Connecticut, Oklahoma, Oregon
- *8 Electoral votes:* Kentucky, Louisiana
- *9 Electoral votes:* Alabama, Colorado, South Carolina
- *10 Electoral votes:* Maryland, Minnesota, Missouri, Wisconsin
- *11 Electoral votes:* Arizona, Indiana, Massachusetts, Tennessee

**Medium-Large:** <br>
- *12 Electoral votes:* Washington 
- *13 Electoral votes:* Virginia
- *14 Electoral votes:* New Jersey
- *15 Electoral votes:* North Carolina
- *16 Electoral votes:* Georgia, Michigan
- *18 Electoral votes:* Ohio

**Large:** <br>
- *20 Electoral votes:* Illinois, Pennsylvania
- *29 Electoral votes:* Florida, New York
- *38 Electoral votes:* Texas
- *55 Electoral votes:* California


#### Small States

In [None]:
low_density = ['Alaska', 'Delaware', 'Montana', 'North Dakota', 'South Dakota', 'Vermont', 'Wyoming', 'Hawaii', 'Idaho', 'Maine', 'New Hampshire', 'Rhode Island', 'Nebraska', 'New Mexico', 'West Virginia']

small = hm_model_ready2[hm_model_ready2['STATE'].isin(low_density)].drop('Third Party Votes',axis = 1)

In [None]:
# small = small.drop(small[small['Primary Loser Vote'] < 1].index)

In [None]:
small.describe()

**Splitting X&y**

In [None]:
ysmall = small['GENERAL VOTES'] 
Xsmall = small.drop('GENERAL VOTES',axis = 1)

**Splitting for T-T-S**

In [None]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(Xsmall, ysmall, test_size=0.3)

**Running Catboost**

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_s.dtypes != np.float)[0]
train_pool = Pool(X_train_s, y_train_s, cat_features=categorical_features_indices)
test_pool = Pool(X_test_s, y_test_s, cat_features=categorical_features_indices)


Cat_s = Catset.fit(X_train_s, y_train_s, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
#Would like to get it down to less than 20,000...
Cat_s.score(X_test_s, y_test_s)

In [None]:
## Checking out feature importance for this model. 

feature_importances = Cat_s.get_feature_importance(train_pool)
feature_names = X_train_s.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

In [None]:
Cat_s.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

#### Medium States

In [None]:
med_density = ['Arkansas', 'Iowa', 'Kansas', 'Mississippi', 'Nevada', 'Utah', 'Connecticut', 'Oklahoma', 'Oregon','Kentucky', 'Louisiana','Alabama', 'Colorado', 'South Carolina', 'Maryland', 'Minnesota','Missouri', 'Wisconsin','Arizona','Indiana', 'Massachusetts', 'Tennessee']

med = hm_model_ready2[hm_model_ready2['STATE'].isin(med_density)].drop('Third Party Votes',axis = 1)

In [None]:
med.describe()

In [None]:
ymed = med['GENERAL VOTES'] 
Xmed = med.drop('GENERAL VOTES',axis = 1)

In [None]:
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(Xmed, ymed, test_size=0.3)

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_m.dtypes != np.float)[0]
train_pool = Pool(X_train_m, y_train_m, cat_features=categorical_features_indices)
test_pool = Pool(X_test_m, y_test_m, cat_features=categorical_features_indices)

Cat_m = Catset.fit(X_train_m, y_train_m, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
#Would like to get it down to less than 20,000...
Cat_m.score(X_test_m, y_test_m)

#### Medium-Large State

In [None]:
medlar_density = ['Washington','Virginia','New Jersey','North Carolina','Georgia','Michigan','Ohio']

medlar = hm_model_ready2[hm_model_ready2['STATE'].isin(medlar_density)].drop('Third Party Votes',axis = 1)

In [None]:
medlar.describe()

In [None]:
ymedlar = medlar['GENERAL VOTES'] 
Xmedlar = medlar.drop('GENERAL VOTES',axis = 1)

In [None]:
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(Xmedlar, ymedlar, test_size=0.3)

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_m.dtypes != np.float)[0]
train_pool = Pool(X_train_ml, y_train_ml, cat_features=categorical_features_indices)
test_pool = Pool(X_test_m, y_test_m, cat_features=categorical_features_indices)

Cat_ml = Catset.fit(X_train_ml, y_train_ml, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
Cat_ml.score(X_test_ml, y_test_ml)

In [None]:
Cat_ml.eval_metrics(data = test_pool, metrics =['R2'], plot = True)

### Breaking down by average voter turnout of the past 10 years

In [None]:
voter_turnout = hm_model_ready2.groupby(['STATE']).mean().reset_index()
voter_turnout = voter_turnout.sort_values(by=['GENERAL VOTES'])

In [None]:
voter_turnout

In [None]:
plt.barh(voter_turnout['STATE'],voter_turnout['GENERAL VOTES'], color = '#1A62A5')
plt.ylabel('State', fontsize=14)
plt.xlabel('General Election Votes', fontsize=14)
plt.title('Voter Turnout Per State', fontsize=20)

**Grouped states by voter turnout from the past ten years:**<br>

**Smallest:** <br>
- Wyoming, Alaska, Vermont, Delaware, South Dakota, North Dakota, Hawaii, Rhode Island, 

**Medium:** <br>
- *6 Electoral votes:* Arkansas, Iowa ,Kansas, Mississippi, Nevada, Utah
- *7 Electoral votes:* Connecticut, Oklahoma, Oregon
- *8 Electoral votes:* Kentucky, Louisiana
- *9 Electoral votes:* Alabama, Colorado, South Carolina
- *10 Electoral votes:* Maryland, Minnesota, Missouri, Wisconsin


In [None]:
low_turnout = ['Wyoming', 'Alaska', 'Vermont', 'Delaware', 'South Dakota', 'North Dakota', 'Hawaii', 'Rhode Island']

low = hm_model_ready2[hm_model_ready2['STATE'].isin(low_turnout)].drop('Third Party Votes',axis = 1)

In [None]:
low.describe()

**Splitting X&y**

In [None]:
ylow = low['GENERAL VOTES'] 
Xlow = low.drop('GENERAL VOTES',axis = 1)

**T-T-S**

In [None]:
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(Xlow, ylow, test_size=0.3)

In [None]:
from catboost import CatBoostRegressor, Pool

categorical_features_indices = np.where(X_train_l.dtypes != np.float)[0]
train_pool = Pool(X_train_l, y_train_l, cat_features=categorical_features_indices)

Cat_l = Catset.fit(X_train_l, y_train_l, cat_features=categorical_features_indices , use_best_model=True);

In [None]:
#Would like to get it down to less than 20,000...
Cat_l.score(X_test_l, y_test_l)

### Pipeline for Prediction

In [None]:
# submission = pd.DataFrame()
# submission['Item_Identifier'] = test['Item_Identifier']
# submission['Outlet_Identifier'] = test['Outlet_Identifier']
# submission['Item_Outlet_Sales'] = model.predict(test)
# submission.to_csv("Submission.csv")

In [177]:
# Used in prediction

hm_model_ready2

Unnamed: 0,GENERAL VOTES,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House
0,1335104.0,778851.0,5.0,1.0,505586.0,Alabama,2016,R,273265.0,0.0
1,748709.0,274423.0,2.0,0.0,153897.0,Alabama,2016,D,120526.0,1.0
2,138149.0,55293.0,4.0,1.0,39545.0,Alaska,2016,R,15748.0,0.0
3,36200.0,25318.0,2.0,0.0,15228.0,Alaska,2016,D,10090.0,1.0
4,1359267.0,591155.0,4.0,1.0,302532.0,Arizona,2016,R,288623.0,0.0
5,1031245.0,333586.0,1.0,0.0,333586.0,Arizona,2016,D,0.0,1.0
6,661984.0,389834.0,2.0,1.0,298039.0,Arkansas,2016,R,91795.0,0.0
8,7542759.0,4811300.0,7.0,0.0,3000689.0,California,2016,D,1810611.0,1.0
9,4701417.0,4811300.0,7.0,0.0,3000689.0,California,2016,D,1810611.0,1.0
10,1370710.0,262344.0,1.0,1.0,262344.0,Colorado,2016,D,0.0,1.0


In [178]:
Cat.predict([['10000', '3', '1', 7000, 'Alaska','2014', 'R',3000,1]])

array([93024.73064843])

In [179]:
#Read in CSV
Primaries_2018 = pd.read_csv('Primary_Results_Predicting.csv')

In [180]:
#Assigning a year
Primaries_2018['YEAR'] = '2018'

In [181]:
#IDs to merge on later
Primaries_2018["ID"] = Primaries_2018["State"].map(str) + Primaries_2018["YEAR"] +Primaries_2018["Party"]

In [182]:
Primaries_party = Primaries_2018[Primaries_2018['Party'].isin(['R','D'])]

In [183]:
Primaries_party

Unnamed: 0,Date,State,Candidate Name,Number of Votes,Percentage,(I),Party,Unnamed: 7,Unnamed: 8,YEAR,ID
0,6-Mar,Texas,Beto O'Rourke,641324.0,61.80%,0,D,,,2018,Texas2018D
1,6-Mar,Texas,Sema Hernandez,246308.0,23.7,0,D,,,2018,Texas2018D
2,6-Mar,Texas,Edward Kimbrough,150147.0,14.5,0,D,,,2018,Texas2018D
3,6-Mar,Texas,Ted Cruz*,1317450.0,85.30%,0,R,,,2018,Texas2018R
4,6-Mar,Texas,Mary Miller,94451.0,6.1,0,R,,,2018,Texas2018R
5,6-Mar,Texas,Bruce Jacobson,64604.0,4.2,0,R,,,2018,Texas2018R
6,6-Mar,Texas,Stefano de Stefano,44327.0,2.9,0,R,,,2018,Texas2018R
7,6-Mar,Texas,Geraldine Sam,22842.0,1.5,0,R,,,2018,Texas2018R
8,8-May,Indiana,Joe Donelly,0.0,0,1,D,,,2018,Indiana2018D
9,8-May,Indiana,Mike Braun,208505.0,41.20%,0,R,,,2018,Indiana2018R


** Totaling the Number of votes in primary **

In [184]:
primaries_partytotal = Primaries_party[['ID','Number of Votes']]

primaries_partytotal = primaries_partytotal.dropna()

primaries_partytotal["Number of Votes"]= primaries_partytotal["Number of Votes"].astype(float)

primaries_partytotal = primaries_partytotal.groupby(['ID']).sum().reset_index()

primaries_partytotal = primaries_partytotal.rename(columns = {'Number of Votes':'Total Party Votes'})

In [185]:
primaries_partytotal

Unnamed: 0,ID,Total Party Votes
0,California2018D,2876479.0
1,California2018R,1573388.0
2,Indiana2018D,0.0
3,Indiana2018R,506492.0
4,Maine2018D,0.0
5,Maine2018R,0.0
6,Maryland2018D,560477.0
7,Maryland2018R,169047.0
8,Mississippi2018D,85709.0
9,Mississippi2018R,155022.0


** Counting the Number of Candidates **

In [186]:
NumRunning_p = Primaries_party.groupby(['ID']).count().reset_index()

NumRunning_p = NumRunning_p[['ID','Number of Votes']]

NumRunning_p = NumRunning_p.rename(columns = {'Number of Votes':'Candidate Count'})

In [187]:
NumRunning_p

Unnamed: 0,ID,Candidate Count
0,California2018D,10
1,California2018R,11
2,Indiana2018D,1
3,Indiana2018R,3
4,Maine2018D,1
5,Maine2018R,1
6,Maryland2018D,8
7,Maryland2018R,11
8,Mississippi2018D,6
9,Mississippi2018R,2


** Inumbent Log **

In [188]:
Incumbents = Primaries_party.groupby(['ID']).sum().reset_index()

In [189]:
Incumbents = Incumbents[['ID','(I)']]

** Max votes per party per state **

In [190]:
PartyWin_p = Primaries_party[['ID','Number of Votes','State','YEAR','Party']]

PartyWin_p = PartyWin_p.dropna()

PartyWin_p["Number of Votes"]= PartyWin_p["Number of Votes"].astype(float)

PartyWin_p = PartyWin_p.groupby(['ID']).max().reset_index()

PartyWin_p = PartyWin_p.rename(columns = {'Number of Votes':'Primary Winner'})

In [191]:
PartyWin_p

Unnamed: 0,ID,Primary Winner,State,YEAR,Party
0,California2018D,2031967.0,California,2018,D
1,California2018R,398477.0,California,2018,R
2,Indiana2018D,0.0,Indiana,2018,D
3,Indiana2018R,208505.0,Indiana,2018,R
4,Maine2018D,0.0,Maine,2018,D
5,Maine2018R,0.0,Maine,2018,R
6,Maryland2018D,450890.0,Maryland,2018,D
7,Maryland2018R,49428.0,Maryland,2018,R
8,Mississippi2018D,27358.0,Mississippi,2018,D
9,Mississippi2018R,128309.0,Mississippi,2018,R


** Merging columns back together **

In [192]:
votes_merged_p = primaries_partytotal.merge(NumRunning_p, left_on='ID', right_on='ID', how='left')

votes_merged_p = votes_merged_p.merge(PartyWin_p, left_on='ID', right_on='ID', how = 'left')

votes_merged_p = votes_merged_p.merge(Incumbents, left_on='ID', right_on='ID', how ='left')

In [193]:
votes_merged_p

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I)
0,California2018D,2876479.0,10,2031967.0,California,2018,D,1
1,California2018R,1573388.0,11,398477.0,California,2018,R,0
2,Indiana2018D,0.0,1,0.0,Indiana,2018,D,1
3,Indiana2018R,506492.0,3,208505.0,Indiana,2018,R,0
4,Maine2018D,0.0,1,0.0,Maine,2018,D,0
5,Maine2018R,0.0,1,0.0,Maine,2018,R,0
6,Maryland2018D,560477.0,8,450890.0,Maryland,2018,D,1
7,Maryland2018R,169047.0,11,49428.0,Maryland,2018,R,0
8,Mississippi2018D,85709.0,6,27358.0,Mississippi,2018,D,0
9,Mississippi2018R,155022.0,2,128309.0,Mississippi,2018,R,1


** Counting total votes by loser **

In [194]:
votes_merged_p['Primary Loser Vote'] = votes_merged_p['Total Party Votes'] - votes_merged_p['Primary Winner']

In [195]:
votes_merged_p

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I),Primary Loser Vote
0,California2018D,2876479.0,10,2031967.0,California,2018,D,1,844512.0
1,California2018R,1573388.0,11,398477.0,California,2018,R,0,1174911.0
2,Indiana2018D,0.0,1,0.0,Indiana,2018,D,1,0.0
3,Indiana2018R,506492.0,3,208505.0,Indiana,2018,R,0,297987.0
4,Maine2018D,0.0,1,0.0,Maine,2018,D,0,0.0
5,Maine2018R,0.0,1,0.0,Maine,2018,R,0,0.0
6,Maryland2018D,560477.0,8,450890.0,Maryland,2018,D,1,109587.0
7,Maryland2018R,169047.0,11,49428.0,Maryland,2018,R,0,119619.0
8,Mississippi2018D,85709.0,6,27358.0,Mississippi,2018,D,0,58351.0
9,Mississippi2018R,155022.0,2,128309.0,Mississippi,2018,R,1,26713.0


** Adding if party is in the white house **

In [196]:
OP2018 = []

for party in votes_merged_p['Party']:
    if party == 'R':
        OP2018.append(1)
        
    else:
        OP2018.append(0)
    
votes_merged_p['OFFICE PARTY'] = OP2018

** Seperating out Unopposed **

In [197]:
Unopp2018 = votes_merged_p.drop(votes_merged_p[votes_merged_p['Primary Winner'] > 1].index)

In [198]:
Unopp2018

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I),Primary Loser Vote,OFFICE PARTY
2,Indiana2018D,0.0,1,0.0,Indiana,2018,D,1,0.0,0
4,Maine2018D,0.0,1,0.0,Maine,2018,D,0,0.0,0
5,Maine2018R,0.0,1,0.0,Maine,2018,R,0,0.0,1
10,Montana2018D,0.0,1,0.0,Montana,2018,D,1,0.0,0
18,New Mexico2018D,0.0,1,0.0,New Mexico,2018,D,1,0.0,0
19,New Mexico2018R,0.0,1,0.0,New Mexico,2018,R,0,0.0,1
20,New York2018D,0.0,1,0.0,New York,2018,D,1,0.0,0
21,New York2018R,0.0,1,0.0,New York,2018,R,0,0.0,1
22,North Dakota2018D,0.0,1,0.0,North Dakota,2018,D,1,0.0,0
24,Ohio2018D,0.0,1,0.0,Ohio,2018,D,1,0.0,0


** The races we can do estimates for **

In [199]:
fin2018 = votes_merged_p.drop(votes_merged_p[votes_merged_p['Primary Winner'] < 1].index)

In [200]:
fin2018

Unnamed: 0,ID,Total Party Votes,Candidate Count,Primary Winner,State,YEAR,Party,(I),Primary Loser Vote,OFFICE PARTY
0,California2018D,2876479.0,10,2031967.0,California,2018,D,1,844512.0,0
1,California2018R,1573388.0,11,398477.0,California,2018,R,0,1174911.0,1
3,Indiana2018R,506492.0,3,208505.0,Indiana,2018,R,0,297987.0,1
6,Maryland2018D,560477.0,8,450890.0,Maryland,2018,D,1,109587.0,0
7,Maryland2018R,169047.0,11,49428.0,Maryland,2018,R,0,119619.0,1
8,Mississippi2018D,85709.0,6,27358.0,Mississippi,2018,D,0,58351.0,0
9,Mississippi2018R,155022.0,2,128309.0,Mississippi,2018,R,1,26713.0,1
11,Montana2018R,152483.0,4,51549.0,Montana,2018,R,0,100934.0,1
12,Nebraska2018D,90480.0,4,57654.0,Nebraska,2018,D,0,32826.0,0
13,Nebraska2018R,167645.0,5,127083.0,Nebraska,2018,R,1,40562.0,1


** Running those estimates!!! ** <3

In [201]:
fin2018 = fin2018.rename(index=str, columns={"State": "STATE", "Party": "PARTY","OFFICE PARTY":'Party In House'}).drop("ID",axis=1)

# fin2018['Total Party Votes'] = fin2018['Total Party Votes']
# fin2018['Candidate Count'] = fin2018['Candidate Count']
# fin2018['(I)'] = fin2018['(I)']
# fin2018['Primary Winner'] = fin2018['Primary Winner']
# fin2018['STATE'] = fin2018['State']
# fin2018['YEAR'] = fin2018['YEAR']
# fin2018['PARTY'] = fin2018['Party']
# fin2018['Primary Loser Vote'] = fin2018['Primary Loser Vote']
# fin2018['Party In House'] = fin2018['OFFICE PARTY']

# submission.to_csv("Submission.csv")


# fin2018['Predicted Turnout'] = Cat.predict(fin2018)
# submission.to_csv("Submission.csv")
# Cat.predict([['10000', '3', '1', 7000, 'Alaska','2014', 'R',3000,1]])

In [202]:
fin2018

Unnamed: 0,Total Party Votes,Candidate Count,Primary Winner,STATE,YEAR,PARTY,(I),Primary Loser Vote,Party In House
0,2876479.0,10,2031967.0,California,2018,D,1,844512.0,0
1,1573388.0,11,398477.0,California,2018,R,0,1174911.0,1
3,506492.0,3,208505.0,Indiana,2018,R,0,297987.0,1
6,560477.0,8,450890.0,Maryland,2018,D,1,109587.0,0
7,169047.0,11,49428.0,Maryland,2018,R,0,119619.0,1
8,85709.0,6,27358.0,Mississippi,2018,D,0,58351.0,0
9,155022.0,2,128309.0,Mississippi,2018,R,1,26713.0,1
11,152483.0,4,51549.0,Montana,2018,R,0,100934.0,1
12,90480.0,4,57654.0,Nebraska,2018,D,0,32826.0,0
13,167645.0,5,127083.0,Nebraska,2018,R,1,40562.0,1


In [203]:

# GENERAL VOTES	Total Party Votes	Candidate Count	(I)	Primary Winner	STATE	YEAR	PARTY	Primary Loser Vote	Party In House

In [204]:
fin2018 = fin2018[['Total Party Votes', 'Candidate Count', '(I)', 'Primary Winner', 'STATE', 'YEAR','PARTY','Primary Loser Vote','Party In House']]

In [205]:
### WE have to re-order so the columns are structured the same way as the input. 
fin2018['Predicted Turnout'] = Cat.predict(fin2018)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [206]:
fin2018

Unnamed: 0,Total Party Votes,Candidate Count,(I),Primary Winner,STATE,YEAR,PARTY,Primary Loser Vote,Party In House,Predicted Turnout
0,2876479.0,10,1,2031967.0,California,2018,D,844512.0,0,4588160.0
1,1573388.0,11,0,398477.0,California,2018,R,1174911.0,1,2513807.0
3,506492.0,3,0,208505.0,Indiana,2018,R,297987.0,1,1368175.0
6,560477.0,8,1,450890.0,Maryland,2018,D,109587.0,0,1279270.0
7,169047.0,11,0,49428.0,Maryland,2018,R,119619.0,1,580833.4
8,85709.0,6,0,27358.0,Mississippi,2018,D,58351.0,0,328319.4
9,155022.0,2,1,128309.0,Mississippi,2018,R,26713.0,1,561494.4
11,152483.0,4,0,51549.0,Montana,2018,R,100934.0,1,408386.8
12,90480.0,4,0,57654.0,Nebraska,2018,D,32826.0,0,383631.6
13,167645.0,5,1,127083.0,Nebraska,2018,R,40562.0,1,512787.1


In [207]:
fin2018.to_csv('Fin2018.csv')

** Choosing which states to focus on!!! **

States that are most contentious: 
- Indiana (unopposed D...)
- Mississippi (Seems likely R!)
- Montana (unopposed D...)
- Nebraska (Seems likely R!)
- Nevada (Gonna be a close one!!!)
- New Jersey (Seems likely D!)
- North Dakota (unopposed D...)
- Ohio (unopposed D...)
- Pennsylvania (unopposed D...)
- Texas (Seems likely R!...)
- Virginia (unopposed D...)
- West Virginia (Gonna be a close one!!!)

**Maybe I look at how much they gather last election?**