In [238]:
import numpy as np
import scipy
from scipy.stats import bernoulli
from scipy import stats
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [239]:
elec_2010 = pd.read_csv("Agg_2010.csv")
elec_2012 = pd.read_csv("Agg_2012.csv")
elec_2016 = pd.read_csv("Agg_2016.csv")
elec_final = pd.read_csv("Final_Aggregated.csv")

In [256]:
elec_final.columns

Index(['Unnamed: 0', 'index', 'Cand_Id', 'Cand_Name', 'Cand_Office_Dist',
       'Cand_Office_St', 'Cand_Party_Affiliation', 'D', 'DISTRICT', 'FEC ID#',
       'Gender', 'Incumbent', 'Median_Income', 'Party_Previous_Vote_Share',
       'Presidential_Approval_Rating', 'Primary.Vote.Percent', 'Raised',
       'STATE', 'Same_Party_As_President', 'Spent', 'Unnamed: 0.1', 'Year',
       'gov_party', 'labor_force', 'minority_percentage', 'name_commonness',
       'percent_bachelor_or_above', 'recent_presidential_vote',
       'seat_transition'],
      dtype='object')

In [257]:
X = elec_final[['Incumbent', 'Median_Income', 'Presidential_Approval_Rating', 'Primary.Vote.Percent', 'Raised', 'Same_Party_As_President', 'Spent', 'Year', 'gov_party', 'labor_force', 'minority_percentage', 'name_commonness', 'percent_bachelor_or_above', 'recent_presidential_vote']]
X['gov_party'] = (X['gov_party'] == 'DEM').astype('int')

# coding democrats as 1 and republicans as 0
Y = (elec_final['Cand_Party_Affiliation']  == 'DEM').astype('int')

# removing american samoa and guam
X = X.drop(elec_final[elec_final['Cand_Office_St'].isin(['AS', 'GU'])].values[:,0])
Y = Y.drop(elec_final[elec_final['Cand_Office_St'].isin(['AS', 'GU'])].values[:,0])
X.replace('character(0)', 0, inplace = True)
X['labor_force'] = X['labor_force'].astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [258]:
X = X.values.astype('float64')
Y = Y.values.astype('float64')

In [259]:
# mean imputation on all columns
means = np.nanmean(X, axis = 0) 
for m in range(len(means)):
    X[np.isnan(X[:,m]), m] = means[m]

In [260]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .2)

In [261]:
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)

In [262]:
clf.score(X_train, y_train)

0.5827686350435625

In [265]:
clf.score(X_test, y_test)

0.5945945945945946


In [267]:
clf2 = LogisticRegression(penalty = 'l1').fit(X_train, y_train)
print (clf2.score(X_train, y_train))
print (clf2.score(X_test, y_test))

1.0
1.0


## Simulation code for determining house control

In [None]:
# probs is an array of probabilities that the democratic candidate will win
def dems_take_house_indep(probs, nsims):
    sims = np.empty((nsims, probs.shape[0]))
      for i in range(probs.shape[0]):
        sims[:,i] = scipy.stats.bernoulli.rvs(probs[i], size = nsims)
    return np.sum(np.sum(sims, axis = 1) > probs.shape[0] / 2) / nsims

In [None]:
# https://twiecki.github.io/blog/2018/05/03/copulas/
def dems_take_house_corr(probs, corr, nsims):
    n = probs.shape[0]
    cov = np.full((n,n), corr)
    np.fill_diagonal(cov, 1)
    mvnorm = scipy.stats.multivariate_normal(mean=np.zeros(n), cov = cov)
    x = mvnorm.rvs(nsims)
    x_unif = stats.norm.cdf(x)
    sims = np.empty((nsims, probs.shape[0]))
    for i in range(probs.shape[0]):
        sims[:,i] = bernoulli.ppf(x_unif[:,i], probs[i])
    return np.sum(np.sum(sims, axis = 1) > probs.shape[0] / 2) / nsims
