In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [2]:
# Load the converted survey file and keep only rows where the target
# column VCF0702 is present; every later cell starts from `df_orig`.
df_orig = pd.read_csv('anes_cdf_converted.csv').dropna(subset=['VCF0702'])

In [114]:
# Start from the raw frame minus the index artefact and geography columns.
df = df_orig.drop(['Unnamed: 0', 'congressional_district', 'state'], axis=1)

# Columns removed from the feature matrices: the target (VCF0702), three
# further VCF07xx items, and `year`, which is only needed for the split below.
todrop = ['VCF0702', 'VCF0703', 'VCF0707', 'VCF0708', 'year']

# Remove every column whose name contains one of these fragments
# (str.contains treats the joined string as a regex alternation).
excluded = '|'.join([
    'VCF0734', 'VCF0736', 'VCF1011',
    'VCF0704', 'VCF0710', 'VCF0709',
    '_oh0',
])
df = df.loc[:, ~df.columns.str.contains(excluded)]

# Drop '_dk' columns and any column whose total is not above 150.
is_dk = df.columns.str.contains('_dk')
column_totals = np.sum(df)
df = df.loc[:, ~is_dk & (column_totals > 150)]  # best 150

# Temporal split: everything before 2012 trains, 2012 is held out.
df_train = df[df.year < 2012]
df_test = df[df.year == 2012]

X_train, X_test = (part.drop(todrop, axis=1) for part in (df_train, df_test))

# Binary label: 0 where VCF0702 == 1, otherwise 1.
y_train, y_test = (
    (part.VCF0702 != 1).astype('int64') for part in (df_train, df_test)
)

# Feature names, saved before preprocessing turns the frames into arrays.
columns = X_train.columns

In [115]:
# Preprocessing is fitted on the training rows only; the test rows are
# transformed with the statistics learned from training.
imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)
scale = StandardScaler()

# Mean-impute missing values, then standardise each feature.
X_train = scale.fit_transform(imp.fit_transform(X_train))
X_test = scale.transform(imp.transform(X_test))

In [116]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC

# Classifier under evaluation. Alternatives tried (swap in to compare):
#   AdaBoostClassifier(n_estimators = 500, learning_rate = 0.1)
#   RandomForestClassifier(n_estimators = 1000, max_depth = 15)
#   GaussianNB()
#   BernoulliNB(alpha = 1)
#   SVC(C = .8, gamma = .0005)
model = LogisticRegression(C = 4, penalty = 'l2')
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# Collect every metric first, then print them in a fixed order:
# training block followed by test block.
metric_table = [
    ('Accuracy training: ', model.score(X_train, y_train)),
    ('F1-score training: ', f1_score(y_train, y_pred_train)),
    ('Confusion Matrix training:\n', confusion_matrix(y_train, y_pred_train)),
    ('Accuracy test: ', model.score(X_test, y_test)),
    ('F1-score test : ', f1_score(y_test, y_pred)),
    ('Confusion Matrix test:\n', confusion_matrix(y_test, y_pred)),
]
for metric_label, metric_value in metric_table:
    print(metric_label, metric_value)

Accuracy training:  0.883170355121
F1-score training:  0.713021491783
Confusion Matrix training:
 [[2868  115]
 [ 339  564]]
Accuracy test:  0.838739067055
F1-score test :  0.604026845638
Confusion Matrix test:
 [[3928  454]
 [ 431  675]]


In [117]:
# Rank features by the absolute size of their fitted coefficient.
# `model.coef_` is flattened to one value per feature (binary classifier).
coef_values = model.coef_.ravel()

# Guard against silent misalignment: if preprocessing changed the number of
# columns (e.g. the imputer discarded a feature that was entirely missing in
# the training rows), pairing coefficients with `columns` by position would
# label them with the wrong feature names.
assert len(coef_values) == len(columns), (
    'coefficient count (%d) does not match feature-name count (%d)'
    % (len(coef_values), len(columns))
)

coefficients = pd.DataFrame(
    {'coefficient': np.abs(coef_values), 'feature': list(columns)},
    columns=['coefficient', 'feature'],
)
# Largest-magnitude coefficients first; the sign is intentionally dropped.
coefficients.sort_values(by = 'coefficient', ascending = False)

Unnamed: 0,coefficient,feature
46,1.046273,VCF9030a
179,0.856561,VCF0107_oh5
2,0.848708,VCF0108
146,0.76629,VCF0140a
211,0.751199,VCF0713_oh4
193,0.699473,VCF0147_oh1
72,0.610554,VCF0291
47,0.59664,VCF9030b
150,0.583424,VCF0301
174,0.561685,VCF0105a_oh2


In [263]:
from sklearn.decomposition import PCA

# Fixed seed for the random train/test partition so that re-running the
# notebook reproduces the same split and therefore the same metrics.
SPLIT_SEED = 42

# Start from the raw frame minus the index artefact and geography columns.
df = df_orig.drop(['Unnamed: 0', 'congressional_district', 'state'], axis=1)

# Binary label: 0 where VCF0702 == 1, otherwise 1.
y = (df.VCF0702 != 1).astype('int64')

# Columns removed from the feature matrix (target, three further VCF07xx
# items, and the survey year).
todrop = ['VCF0702', 'VCF0703', 'VCF0707', 'VCF0708', 'year']

# Remove every column whose name contains one of these fragments
# (str.contains treats the joined string as a regex alternation).
# The '_oh0', '_dk' and column-total (> 150) filters are disabled for this
# experiment.
excluded = '|'.join([
    'VCF0734', 'VCF0736', 'VCF1011',
    'VCF0704', 'VCF0710', 'VCF0709',
])
df = df.loc[:, ~df.columns.str.contains(excluded)]
X = df.drop(todrop, axis=1)

pca = PCA(n_components=150, svd_solver='full')
imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)
scale = StandardScaler()

# Stratified random split; 65% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.65, stratify=y, random_state=SPLIT_SEED
)

# Fit imputation, scaling and PCA on the training rows only.
X_train = imp.fit_transform(X_train)
X_train = scale.fit_transform(X_train)
X_train = pca.fit_transform(X_train)

# Per-component and cumulative variance captured by the 150 components.
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

# Apply the already-fitted transforms to the held-out rows.
X_test = imp.transform(X_test)
X_test = scale.transform(X_test)
X_test = pca.transform(X_test)

[ 0.08734806  0.04373314  0.02658681  0.02429797  0.02020982  0.01585782
  0.0151164   0.01377482  0.01186888  0.01074655  0.01053316  0.01003433
  0.00962791  0.00897676  0.00846313  0.00824902  0.00813022  0.0080025
  0.00782873  0.00757154  0.00731722  0.007194    0.00706955  0.00656545
  0.00637394  0.00628063  0.00617381  0.00603096  0.00579788  0.00578997
  0.00563221  0.00551881  0.00542203  0.0053919   0.00527199  0.00516785
  0.00508268  0.00502605  0.004918    0.00487298  0.00473508  0.00468594
  0.00465846  0.00460416  0.00455479  0.00448522  0.0044414   0.00440083
  0.00436711  0.00431021  0.00425089  0.00420262  0.00415974  0.00415143
  0.00405968  0.00403717  0.00399511  0.0039317   0.00391004  0.00386594
  0.00383954  0.00379706  0.0037493   0.00370001  0.00369354  0.00367343
  0.00366667  0.00361851  0.00356498  0.00356007  0.00350062  0.0034652
  0.00345586  0.00342199  0.00338352  0.00337258  0.00333527  0.00329269
  0.00326834  0.00324395  0.00322867  0.00321084  0.0

In [264]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC

# Classifier evaluated on the PCA-reduced features. Alternatives tried
# (swap in to compare):
#   AdaBoostClassifier()
#   RandomForestClassifier()
#   GaussianNB()
#   BernoulliNB(alpha = 1)
#   SVC(C = .8, gamma = .0005)
model = LogisticRegression(C = 1, penalty = 'l2')
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# Collect every metric first, then print them in a fixed order:
# training block followed by test block.
metric_table = [
    ('Accuracy training: ', model.score(X_train, y_train)),
    ('F1-score training: ', f1_score(y_train, y_pred_train)),
    ('Confusion Matrix training:\n', confusion_matrix(y_train, y_pred_train)),
    ('Accuracy test: ', model.score(X_test, y_test)),
    ('F1-score test : ', f1_score(y_test, y_pred)),
    ('Confusion Matrix test:\n', confusion_matrix(y_test, y_pred)),
]
for metric_label, metric_value in metric_table:
    print(metric_label, metric_value)

Accuracy training:  0.87743902439
F1-score training:  0.671568627451
Confusion Matrix training:
 [[2467  110]
 [ 292  411]]
Accuracy test:  0.855431572038
F1-score test :  0.614779186707
Confusion Matrix test:
 [[4510  278]
 [ 603  703]]
