## FEC 2018 Campaign Finance: Modeling

state

previously in politics

proportion of individual contributions from in-state

party of previous officeholder

primary results (is the seat open because a primary challenger beat the incumbent?)

force predict one winner per race

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from copy import deepcopy
from itertools import combinations 
from matplotlib import patheffects as path_effects
from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix,
)

%matplotlib inline

In [3]:
# Load the cleaned candidate table and narrow it, step by step, to two-candidate
# House contests. The row count is printed after every filter so the effect of
# each step is visible.
df_orig = pd.read_csv('data/04/df_cleaned_04b.csv')

# include only house elections
df_orig = df_orig[df_orig['position'] == 'H']
print(len(df_orig))

# exclude races where Third party won
df_orig = df_orig[df_orig['contest'] != 'MP_00']
print(len(df_orig))

# exclude Third Party candidates for modeling for now
df_orig = df_orig[df_orig['cand_pty_affiliation'] != 'Third party']
print(len(df_orig))

# exclude open seat races; model requires one incumbent and one challenger
df_orig = df_orig[df_orig['cand_ici'] != 'O']
print(len(df_orig))

# exclude solo candidates
contest_sizes = df_orig['contest'].value_counts()
solos = list(contest_sizes[contest_sizes == 1].index)
df_orig = df_orig[~df_orig['contest'].isin(solos)]
print(len(df_orig))

# exclude races with more than one candidate from each party
# (a contest is flagged when two of its rows share the same party)
repeated = set(
    df_orig.loc[
        df_orig.duplicated(['contest', 'cand_pty_affiliation'], keep = False),
        'contest',
    ]
)
# walk the contests in value_counts order so the printout order is unchanged
drops = [race for race in df_orig['contest'].value_counts().index if race in repeated]
for race in drops:
    print(race)

df_orig = df_orig[~df_orig['contest'].isin(drops)]
print(len(df_orig))

# exclude races with more than one incumbent (i.e. gerrymandering cases)
# NOTE(review): this flags any contest where two rows share a cand_ici value,
# so a race with two challengers is dropped as well.
repeated = set(
    df_orig.loc[
        df_orig.duplicated(['contest', 'cand_ici'], keep = False),
        'contest',
    ]
)
drops = [race for race in df_orig['contest'].value_counts().index if race in repeated]
for race in drops:
    print(race)

df_orig = df_orig[~df_orig['contest'].isin(drops)]
print(len(df_orig))

df_orig.head()

KeyError: 'position'

In [None]:
demog_df = pd.read_csv('data/07/demo_dict.csv')
demog_df.head(2)

In [None]:
# Attach each candidate's `noncismale` value from the demographics table.
# A cand_id-indexed lookup plus one vectorised map replaces a full boolean scan
# of demog_df for every row. When demog_df lists a cand_id more than once the
# first occurrence is used, as before.
noncismale_by_cand = demog_df.drop_duplicates('cand_id').set_index('cand_id')['noncismale']

# A candidate with no demographics row used to surface as an opaque IndexError
# from `[0]`; fail with the offending ids instead.
missing_ids = df_orig.loc[
    ~df_orig['cand_id'].isin(noncismale_by_cand.index), 'cand_id'
].unique()
if len(missing_ids) > 0:
    raise KeyError('cand_id values missing from demog_df: {}'.format(list(missing_ids)))

df_orig['noncismale'] = df_orig['cand_id'].map(noncismale_by_cand)

In [None]:
# Columns removed before modeling; `contest` is deliberately kept.
unused_cols = [
    'cand_id',
    'cand_name',
    'cvg_end_dt',
    'cand_class',
#     'contest', # keep for join later
    'cand_office_district',
    'cand_office_st',
    'ttl_disb',
    'coh_cop',
    'cand_loan_repay',
    'other_loan_repay',
    'indiv_refunds',
    'cmte_refunds',
    'trans_to_auth',
    'debts_owed_by', # include? or time machine issue
    'position',
]
df = df_orig.drop(columns = unused_cols)

# Report whether any remaining column still contains nulls.
null_counts = df.isnull().sum()
has_nulls = (null_counts > 0).any()
print('check nulls' if has_nulls else 'no nulls')

df.head()

In [None]:
df = df[[x for x in df.columns if 'loan' not in x]]

In [None]:
# Categorical columns to one-hot encode.
dummify = [
    'cand_ici',
    'cand_pty_affiliation',
#     'position'
]

# Replace each categorical column with its indicator columns, appended at the
# right-hand side of the frame. The new columns are named after the raw
# category values (no prefix).
for col in dummify:
    df = pd.concat([
        df.drop([col], axis = 1), 
        pd.get_dummies(df[col])
    ], axis = 1)
    
# Give the single-letter cand_ici indicator columns readable names.
df.rename(columns={
    'C' : 'challenger',
    'I' : 'incumbent',
    'O' : 'open seat',
#     'H' : 'House',
#     'S' : 'Senate',
}, inplace = True)

In [None]:
# Reshape from one row per candidate to one row per contest.
# Rows flagged by the `Republican` indicator column get every column except
# `contest` suffixed with ' (R)' and are indexed by contest.
republicans = df[df['Republican'] == 1]
republicans.columns = [x + ' (R)' if x != 'contest' else x for x in republicans.columns]
republicans.set_index('contest', inplace = True)

# All remaining rows are treated as the Democratic side and suffixed ' (D)'.
democrats = df[df['Republican'] == 0]
democrats.columns = [x + ' (D)' if x != 'contest' else x for x in democrats.columns]
democrats.set_index('contest', inplace = True)

# Join the two sides on contest, then move `contest` back into a column.
# NOTE(review): presumably each contest has exactly one row per side after the
# upstream filters; a contest with several rows on a side would fan out here — confirm.
pre_dfs = republicans.join(democrats, on = 'contest')
pre_dfs.reset_index(drop = False, inplace = True)

pre_dfs.head(2)

In [None]:
# Republican-to-Democrat ratio of `ttl_receipts` for each contest.
# NOTE(review): a contest whose ' (D)' value is 0 would yield inf (or NaN for 0/0)
# here — confirm none exist, since this column is min-max scaled further down.
pre_dfs['r:d_funding_ratio'] = pre_dfs['ttl_receipts (R)']/pre_dfs['ttl_receipts (D)']

In [None]:
# list(pre_dfs.columns)

In [None]:
# Columns rescaled with MinMaxScaler; every other column of pre_dfs is carried
# over unchanged.
contin_cols = [
    'ttl_receipts (R)',
    'trans_from_auth (R)',
    'coh_bop (R)',
    'cand_contrib (R)',
#     'cand_loans (R)',
#     'other_loans (R)',
    'ttl_indiv_contrib (R)',
    'other_pol_cmte_contrib (R)',
    'pol_pty_contrib (R)',
    'comm_ct (R)',
    'ttl_receipts (D)',
    'trans_from_auth (D)',
    'coh_bop (D)',
    'cand_contrib (D)',
#     'cand_loans (D)',
#     'other_loans (D)',
    'ttl_indiv_contrib (D)',
    'other_pol_cmte_contrib (D)',
    'pol_pty_contrib (D)',
    'comm_ct (D)',
    'r:d_funding_ratio',
    
]

# Unscaled copy; the scatterplot cell later reads `contin.columns`.
contin = pre_dfs[contin_cols]

# NOTE(review): the scaler is fit on every row, before the train/test split
# further down, so the test rows influence the scaling.
contin_s = pd.DataFrame(
    MinMaxScaler().fit_transform(contin), 
    index = pre_dfs.index, 
    columns = contin_cols,
)

# Everything that was not scaled (including `contest`).
the_rest = [x for x in pre_dfs.columns if x not in contin_s.columns]

# Scaled and unscaled columns side by side, aligned on pre_dfs's index.
dfs = pd.concat([contin_s, pre_dfs[the_rest]], axis = 1)

# dfs.columns = [x for x in pre_dfs.columns if x != 'contest']

# can use once missing values filled in
# dfs.drop([
#     'nonwhite (R)', 
#     'nonwhite (D)',
# ], axis = 1, inplace = True)

# Lower-case the party names inside column labels.
dfs.columns = [x.replace('Republican', 'republican').replace('Democrat', 'democrat') for x in dfs.columns]

# Order the columns alphabetically.
dfs = dfs[sorted(dfs.columns)]

In [None]:
# Drop the contest label, the party indicator columns, both maxfunding flags,
# and the (D)-side incumbent/winner flags. `incumbent (R)` and
# `winner_flag (R)` are kept; the latter is used as the target later on.
dfs.drop([
    'challenger (D)',
    'challenger (R)',
    'contest',
    'democrat (D)',
    'democrat (R)',
    'incumbent (D)',
    'maxfunding_flag (D)',
    'maxfunding_flag (R)',
    'republican (D)',
    'republican (R)',
    'winner_flag (D)',
    
], axis = 1, inplace = True)

In [None]:
# partisan donors -- only gave to one party
dfs.sum()[dfs.sum() == 0]

In [None]:
# Remove, in a single call, every column whose values sum to exactly zero.
col_totals = dfs.sum()
drops = col_totals[col_totals == 0].index

dfs.drop(columns = drops, inplace = True)

In [None]:
plt.figure(figsize = (12, 12))
corr_mat = np.tril(dfs.corr(), k = -1)
sns.heatmap(corr_mat, cmap = 'PRGn', vmin = -1, vmax = 1)
plt.title('2018 U.S. House of Representatives elections:\nFeature correlation', fontsize = 14)
plt.xticks(
    np.arange(0.5, len(dfs.columns), 1), 
    dfs.columns, 
    fontsize = 6, 
    rotation = 90
)
plt.yticks(
    np.arange(0.5, len(dfs.columns), 1), 
    dfs.columns, 
    fontsize = 6, 
    rotation = 0
)
plt.xlim([0, len(dfs.columns)])
plt.ylim([len(dfs.columns), 0])
plt.tight_layout()
plt.show();

In [None]:
# Feature matrix: everything except the target column (drop returns a new frame).
X = dfs.drop(columns = ['winner_flag (R)'])

# target is whether the Republican candidate wins the contest
y = dfs[['winner_flag (R)']]

# Split each outcome class separately with the same seed and test fraction, so
# both classes are represented in train and test in roughly the same proportion.
rep_won = y['winner_flag (R)'] == 1
rep_lost = y['winner_flag (R)'] == 0

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X[rep_won], 
    y[rep_won], 
    random_state = 421, 
    test_size = 0.33
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X[rep_lost], 
    y[rep_lost], 
    random_state = 421, 
    test_size = 0.33
)

# Stack the per-class pieces (class 1 rows first, then class 0). Concatenating
# the pieces directly avoids growing from an empty DataFrame, whose effect on
# result dtypes is deprecated behaviour in recent pandas.
X_train = pd.concat([X_train1, X_train2], axis = 0)
X_test = pd.concat([X_test1, X_test2], axis = 0)
y_train = pd.concat([y_train1, y_train2], axis = 0)
y_test = pd.concat([y_test1, y_test2], axis = 0)

In [None]:
def plot_confusion_matrix(
    confmat, 
    title = 'Confusion Matrix', 
    labels = ['Win', 'Loss'], 
    cmap = plt.cm.Blues,
):
    """Draw a confusion matrix as an annotated heat map on a new figure.

    Parameters
    ----------
    confmat : 2-d array
        Matrix of counts; must expose ``.shape`` and support ``confmat[i][j]``.
        Rows are labelled 'Actual' and columns 'Prediction'.
    title : str
        Figure title.
    labels : list of str
        Tick labels, applied in the same order to both axes.
        NOTE(review): sklearn's ``confusion_matrix`` orders classes ascending,
        so for a 0/1 target position 0 is class 0. With the default
        ``['Win', 'Loss']`` that would label the class-0 row 'Win' — confirm
        the intended order.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None. The figure is left as the current pyplot figure.
    """
    plt.figure(figsize = (10, 6))
    plt.imshow(
        confmat, 
        interpolation = 'nearest', 
        cmap = cmap, 
    )

    # Passed positionally: the keyword was `b` in older Matplotlib and is
    # `visible` in current releases, so `b = False` fails on newer versions.
    plt.grid(False)

    n_rows, n_cols = confmat.shape
    
    # The same tick positions are used on both axes (the matrix is expected to be square).
    tick_marks = np.arange(n_rows)
    plt.xticks(tick_marks, labels, fontsize = 14)
    plt.yticks(tick_marks, labels, fontsize = 14)
    # Fully transparent point at the first cell; it draws nothing visible.
    plt.scatter(0, 0, color = (0,0,0,0))

    plt.title(title, fontsize = 14)
    plt.ylabel('Actual', fontsize = 14)
    plt.xlabel('Prediction', fontsize = 14)
    plt.tight_layout()

    # White outline used to keep the count legible on both light and dark cells.
    outline = [
        path_effects.Stroke(linewidth = 1, foreground = 'black'), 
        path_effects.Normal(),
    ]

    # Write each count at the centre of its cell: x position is the column, y the row.
    for row in range(n_rows):
        for col in range(n_cols):
            plt.annotate(
                str(confmat[row][col]), 
                xy = (col, row), 
                horizontalalignment = 'center', 
                verticalalignment = 'center', 
                color = 'white', 
                fontsize = 40,
            ).set_path_effects(outline)

### Naïve model

In [None]:
y_pred_inc = [1 if (dfs.loc[i, 'incumbent (R)'] == 1) else 0 for i in list(dfs.index)]

In [None]:
plot_confusion_matrix(
    confusion_matrix(dfs['winner_flag (R)'], y_pred_inc),
    title = '2018 U.S. House of Representatives elections:\n\
    Naïve model confusion matrix\n--> choose incumbent',
)

In [None]:
# y_pred_inc holds one prediction per row of dfs, in dfs order, while y_test is
# only the held-out subset in its own order. Zipping them directly paired each
# test row with an unrelated prediction, so look the predictions up by index
# label first.
inc_pred_test = pd.Series(y_pred_inc, index = dfs.index).loc[y_test.index]

inc_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], inc_pred_test), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

# Held-out contests the "choose incumbent" rule gets wrong.
inc_wrong = inc_y[inc_y['test'] != inc_y['pred']]

# Select by label (.loc): inc_wrong.index holds index labels, not positions.
pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].loc[inc_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

In [None]:
y_pred_fund = [1 if (pre_dfs.loc[i, 'r:d_funding_ratio'] > 1) else 0 for i in list(dfs.index)]

In [None]:
plot_confusion_matrix(
    confusion_matrix(dfs['winner_flag (R)'], y_pred_fund),
    title = '2018 U.S. House of Representatives elections:\n\
    Naïve model confusion matrix\n--> choose higher funding',
)

In [None]:
# y_pred_fund holds one prediction per row of dfs, in dfs order, while y_test is
# only the held-out subset in its own order. Zipping them directly paired each
# test row with an unrelated prediction, so look the predictions up by index
# label first.
fund_pred_test = pd.Series(y_pred_fund, index = dfs.index).loc[y_test.index]

fund_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], fund_pred_test), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

# Held-out contests the "choose higher funding" rule gets wrong.
fund_wrong = fund_y[fund_y['test'] != fund_y['pred']]

# Select by label (.loc): fund_wrong.index holds index labels, not positions.
pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].loc[fund_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

### Logistic regression

In [None]:
lr = LogisticRegressionCV(cv = 11, random_state = 421)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr.score(X_test, y_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred),
    title = '2018 U.S. House of Representatives elections:\n\
    Logistic regression confusion matrix',
)

In [None]:
lr_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], y_pred), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

lr_wrong = lr_y[lr_y['test'] != lr_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[lr_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

In [None]:
coefs = pd.DataFrame(zip(X, lr.coef_[0]), columns = ['col', 'coef'])
coefs['abs_coef'] = [abs(x) for x in coefs['coef']]

coefs.sort_values(['abs_coef'], ascending = False, inplace = True)
coefs['color'] = (coefs['coef'] == coefs['abs_coef'])
coefs['color'] = ['green' if x == True else 'purple' for x in coefs['color']]
coefs.drop(['abs_coef'], axis = 1, inplace = True)
coefs = coefs[::-1]
coefs.reset_index(drop = True, inplace = True)

In [None]:
plt.figure(figsize = (12, 24))
plt.barh(
    coefs.index, 
    coefs['coef'].apply(lambda x: abs(x)), 
    color = coefs['color'], 
    alpha = 0.8
)

plt.title('2018 U.S. House of Representatives elections:\n\
Logistic regression feature importance', fontsize = 14)
plt.ylabel('Feature', fontsize = 12)
plt.yticks(range(len(coefs)), coefs['col'].values)
plt.xlabel('Coefficient', fontsize = 12)
# plt.xticks()
plt.tight_layout()
plt.show();

### Random forest

In [None]:
rfc = RandomForestClassifier(random_state=421)

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
rfc.score(X_test, y_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred),
    title = '2018 U.S. House of Representatives elections:\n\
    Random forest confusion matrix',
)

In [None]:
rfc_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], y_pred), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

rfc_wrong = rfc_y[rfc_y['test'] != rfc_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[rfc_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

In [None]:
coefs = pd.DataFrame(zip(X, rfc.feature_importances_), columns = ['col', 'coef'])
coefs['abs_coef'] = [abs(x) for x in coefs['coef']]

coefs.sort_values(['abs_coef'], ascending = False, inplace = True)
coefs['color'] = (coefs['coef'] == coefs['abs_coef'])
coefs['color'] = ['green' if x == True else 'purple' for x in coefs['color']]
coefs.drop(['abs_coef'], axis = 1, inplace = True)
coefs = coefs[::-1]
coefs.reset_index(drop = True, inplace = True)

In [None]:
plt.figure(figsize = (8, 24))
plt.barh(
    coefs.index, 
    coefs['coef'].apply(lambda x: abs(x)), 
    color = coefs['color'], 
    alpha = 0.8
)

plt.title('2018 House of Representatives elections:\n\
Random forest feature importance', fontsize = 14)
plt.ylabel('Feature', fontsize = 12)
plt.yticks(range(len(coefs)), coefs['col'].values)
plt.xlabel('Importance', fontsize = 12)
plt.xticks(rotation = -45)
plt.tight_layout()
plt.show();

### Support vector machines

In [None]:
SVM = svm.LinearSVC(random_state = 421)
SVM.fit(X_train, y_train)
y_pred = SVM.predict(X_test)
SVM.score(X_test,y_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred),
    title = '2018 U.S. House of Representatives elections:\nSVM confusion matrix',
    labels = ['Win', 'Loss']
)

In [None]:
svm_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], y_pred), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

svm_wrong = svm_y[svm_y['test'] != svm_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[svm_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

### Neural network

In [None]:
# Seeded like the other models so a re-run reproduces the same score.
NN = MLPClassifier(random_state = 421)

# Pass the target as a 1-d Series rather than a one-column DataFrame.
NN.fit(X_train, y_train['winner_flag (R)'])
y_pred = NN.predict(X_test)
NN.score(X_test, y_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred),
    title = '2018 U.S. House of Representatives elections:\nNN confusion matrix',
    labels = ['Win', 'Loss']
)

In [None]:
nn_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], y_pred), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

nn_wrong = nn_y[nn_y['test'] != nn_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[nn_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

## PCA

In [None]:
from sklearn.decomposition import PCA

# Features only; the target column is excluded from the decomposition.
dfspc = dfs.drop(['winner_flag (R)'], axis = 1)

# One component per input column.
pca = PCA(n_components = len(dfspc.columns))

# `pc` is the fitted estimator (the next cell reads pc.explained_variance_).
pc = pca.fit(dfspc)

# Projected data, indexed like dfs so it can be masked and split by the target
# exactly as the raw features are. The following cells read `pcdf`, which
# previously existed only in commented-out code.
# NOTE(review): the PCA is fit on all rows, before the train/test split.
pcdf = pd.DataFrame(
    data = pc.transform(dfspc), 
    index = dfspc.index, 
    columns = ['principal component {}'.format(k + 1) for k in range(pc.n_components_)],
)

In [None]:
plt.bar(range(len(dfspc.columns)), pc.explained_variance_)

In [None]:
Xpc = deepcopy(pcdf)

# target is whether the Republican candidate wins the contest
ypc = dfs[['winner_flag (R)']]

# Same scheme as the split of the raw features: split each outcome class
# separately with the same seed and test fraction.
rep_won_pc = ypc['winner_flag (R)'] == 1
rep_lost_pc = ypc['winner_flag (R)'] == 0

Xpc_train1, Xpc_test1, ypc_train1, ypc_test1 = train_test_split(
    Xpc[rep_won_pc], 
    ypc[rep_won_pc], 
    random_state = 421, 
    test_size = 0.33
)

Xpc_train2, Xpc_test2, ypc_train2, ypc_test2 = train_test_split(
    Xpc[rep_lost_pc], 
    ypc[rep_lost_pc], 
    random_state = 421, 
    test_size = 0.33
)

# Stack the per-class pieces (class 1 rows first, then class 0). Concatenating
# the pieces directly avoids growing from an empty DataFrame, whose effect on
# result dtypes is deprecated behaviour in recent pandas.
Xpc_train = pd.concat([Xpc_train1, Xpc_train2], axis = 0)
Xpc_test = pd.concat([Xpc_test1, Xpc_test2], axis = 0)
ypc_train = pd.concat([ypc_train1, ypc_train2], axis = 0)
ypc_test = pd.concat([ypc_test1, ypc_test2], axis = 0)

### Logistic Regression: PCA

In [None]:
lrpc = LogisticRegressionCV(cv = 11, random_state = 421)

lrpc.fit(Xpc_train, ypc_train)
y_pred = lrpc.predict(Xpc_test)
lrpc.score(Xpc_test, ypc_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred),
    title = '2018 U.S. House of Representatives elections:\n\
    Logistic regression confusion matrix',
)

In [None]:
lr_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], y_pred), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

lr_wrong = lr_y[lr_y['test'] != lr_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[lr_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

In [None]:
coefs = pd.DataFrame(zip(Xpc, lrpc.coef_[0]), columns = ['col', 'coef'])
coefs['abs_coef'] = [abs(x) for x in coefs['coef']]

coefs.sort_values(['abs_coef'], ascending = False, inplace = True)
coefs['color'] = (coefs['coef'] == coefs['abs_coef'])
coefs['color'] = ['green' if x == True else 'purple' for x in coefs['color']]
coefs.drop(['abs_coef'], axis = 1, inplace = True)
coefs = coefs[::-1]
coefs.reset_index(drop = True, inplace = True)

In [None]:
plt.figure(figsize = (12, 24))
plt.barh(
    coefs.index, 
    coefs['coef'].apply(lambda x: abs(x)), 
    color = coefs['color'], 
    alpha = 0.8
)

plt.title('2018 U.S. House of Representatives elections:\n\
Logistic regression feature importance', fontsize = 14)
plt.ylabel('Feature', fontsize = 12)
plt.yticks(range(len(coefs)), coefs['col'].values)
plt.xlabel('Coefficient', fontsize = 12)
# plt.xticks()
plt.tight_layout()
plt.show();

### Random forest: PCA

In [None]:
rfcpc = RandomForestClassifier(random_state=421)

rfcpc.fit(Xpc_train, ypc_train)
ypc_pred = rfcpc.predict(Xpc_test)
rfcpc.score(Xpc_test, ypc_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(ypc_test, ypc_pred),
    title = '2018 U.S. House of Representatives elections:\n\
    Random forest confusion matrix',
)

In [None]:
rfcpc_y = pd.DataFrame(
    zip(ypc_test['winner_flag (R)'], ypc_pred), 
    index = ypc_test.index, 
    columns = ['test', 'pred'],
)

rfcpc_wrong = rfcpc_y[rfcpc_y['test'] != rfcpc_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[rfcpc_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

In [None]:
coefs = pd.DataFrame(zip(Xpc, rfcpc.feature_importances_), columns = ['col', 'coef'])
coefs['abs_coef'] = [abs(x) for x in coefs['coef']]

coefs.sort_values(['abs_coef'], ascending = False, inplace = True)
coefs['color'] = (coefs['coef'] == coefs['abs_coef'])
coefs['color'] = ['green' if x == True else 'purple' for x in coefs['color']]
coefs.drop(['abs_coef'], axis = 1, inplace = True)
coefs = coefs[::-1]
coefs.reset_index(drop = True, inplace = True)

In [None]:
plt.figure(figsize = (8, 24))
plt.barh(
    coefs.index, 
    coefs['coef'].apply(lambda x: abs(x)), 
    color = coefs['color'], 
    alpha = 0.8
)

plt.title('2018 House of Representatives elections:\n\
Random forest feature importance', fontsize = 14)
plt.ylabel('Feature', fontsize = 12)
plt.yticks(range(len(coefs)), coefs['col'].values)
plt.xlabel('Importance', fontsize = 12)
plt.xticks(rotation = -45)
plt.tight_layout()
plt.show();

### Support vector machines: PCA

In [None]:
SVMpc = svm.LinearSVC(random_state = 421)
SVMpc.fit(Xpc_train, ypc_train)
ypc_pred = SVMpc.predict(Xpc_test)
SVMpc.score(Xpc_test,ypc_test)

In [None]:
# Plot the PCA-based SVM's own predictions. The preceding cell stores them in
# ypc_pred; y_pred still holds the logistic-regression output at this point.
plot_confusion_matrix(
    confusion_matrix(ypc_test, ypc_pred),
    title = '2018 U.S. House of Representatives elections:\nSVM confusion matrix',
    labels = ['Win', 'Loss']
)

In [None]:
# Compare the PCA-based SVM's predictions (ypc_pred) with the PCA test targets.
# y_pred still holds the logistic-regression output at this point.
svm_y = pd.DataFrame(
    zip(ypc_test['winner_flag (R)'], ypc_pred), 
    index = ypc_test.index, 
    columns = ['test', 'pred'],
)

svm_wrong = svm_y[svm_y['test'] != svm_y['pred']]

# Select by label (.loc): svm_wrong.index holds index labels, not positions.
pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].loc[svm_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

### Neural network

In [None]:
# This cell sits in the PCA section, so train on the principal-component
# features like the other PCA models, not on the raw features again.
# Seeded like the other models so a re-run reproduces the same score.
NN = MLPClassifier(random_state = 421)

NN.fit(Xpc_train, ypc_train['winner_flag (R)'])
# Kept under the name `y_pred` because the following cells read y_pred and
# y_test. ypc_test is produced by the same seeded, class-wise split as y_test,
# so (assuming pcdf is indexed like dfs) they describe the same rows.
y_pred = NN.predict(Xpc_test)
NN.score(Xpc_test, ypc_test)

In [None]:
plot_confusion_matrix(
    confusion_matrix(y_test, y_pred),
    title = '2018 U.S. House of Representatives elections:\nNN confusion matrix',
    labels = ['Win', 'Loss']
)

In [None]:
nn_y = pd.DataFrame(
    zip(y_test['winner_flag (R)'], y_pred), 
    index = y_test.index, 
    columns = ['test', 'pred'],
)

nn_wrong = nn_y[nn_y['test'] != nn_y['pred']]

pre_dfs[[
    'contest', 
    'ttl_receipts (R)',
    'ttl_receipts (D)',
    'winner_flag (R)'
]].iloc[nn_wrong.index].sort_values(['ttl_receipts (R)'], ascending = False)

### Scatterplots

In [None]:
# Marker shape per cand_ici code.
marker_dict = {
    'I' : 'o',
    'C' : 'D',
    'O' : '+',
}

# Point colour per party.
color_dict = {
    'Republican' : '#FF6661',
    'Democrat' : '#5494F7',
#     'Third party' : '#15DCDC',
}

# Opacity per winner_flag value: 1 is drawn opaque, 0 is drawn faded.
alpha_dict = {
    1 : 1.0,
    0 : 0.2,
}

# Add per-row plotting attributes to df_orig. Each is a plain dict lookup, so
# a value missing from the dict above raises KeyError.
df_orig['marker'] = [marker_dict[df_orig.loc[i, 'cand_ici']] for i in list(df_orig.index)]
df_orig['color'] = [color_dict[df_orig.loc[i, 'cand_pty_affiliation']] for \
                    i in list(df_orig.index)]
df_orig['alpha'] = [alpha_dict[df_orig.loc[i, 'winner_flag']] for \
                    i in list(df_orig.index)]

In [None]:
# Recover the per-candidate column names by stripping the ' (R)' / ' (D)'
# suffixes from the scaled-column list, de-duplicating, and leaving out the
# derived ratio. Going through a set means the order is not fixed between runs.
orig_contin = list(set([x.replace(' (R)', '').replace(' (D)', '') for x in contin.columns]))
orig_contin = [x for x in orig_contin if x != 'r:d_funding_ratio']
# One small scatterplot for every unordered pair of those columns.
for i in range(len(orig_contin)):
    for j in range(i + 1, len(orig_contin)):
        start_df = deepcopy(df_orig)
        # For each of the two plotted columns in turn, keep only rows strictly
        # within 3 standard deviations of the median and strictly positive.
        # The second column's median/std are computed after the first filter.
        for k in [i, j]:
            median = start_df[orig_contin[k]].median()
            std = start_df[orig_contin[k]].std()
            start_df = start_df[
                (start_df[orig_contin[k]] > (median - 3*std)) & \
                (start_df[orig_contin[k]] < (median + 3*std))
            ]
            start_df = start_df[start_df[orig_contin[k]] > 0]
        # Skip pairs that leave fewer than two rows to plot.
        if len(start_df) > 1:
            plt.figure(figsize = (3, 3))
            # One scatter call per (cand_ici, winner_flag) combination, because
            # marker and alpha are set per call; colour is per point.
            for ici in start_df['cand_ici'].value_counts().index:
                lil_df = start_df[start_df['cand_ici'] == ici]
                marker = marker_dict[ici]
                for status in start_df['winner_flag'].value_counts().index:
                    liller_df = lil_df[lil_df['winner_flag'] == status]
                    alpha = alpha_dict[status]
                    plt.scatter(
                        liller_df[orig_contin[i]], 
                        liller_df[orig_contin[j]],
                        color = liller_df['color'],
                        alpha = alpha,
                        marker = marker,
                        s = 30,
                    )
            # Axis labels name the two columns; tick marks are hidden.
            plt.xlabel(orig_contin[i], fontsize = 12)
            plt.xticks([])
            plt.ylabel(orig_contin[j], fontsize = 12)
            plt.yticks([])

            plt.tight_layout()
            plt.show();

In [None]:
# do ratio features
# what are repeating pol_pty_contrib values