In [6]:
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle

In [8]:
import sys
sys.path.append('../src')
from data import fetch_model_data
from model import evaluation
%aimport data.fetch_model_data
%aimport model.evaluation

# Model Comparison

In [9]:
metrics=['roc_auc', 'accuracy', 'precision']

## Load Data

In [10]:
ip_address = '18.218.116.177'
raw = fetch_model_data.fetch_model_data(ip_address)

In [24]:
first_feature_set = [
#     'author_ideology',
    'author_party',
#     'author_is_chair',
    'author_years_sen',
    'author_total_funding',
    'total_sponsors',
#     'sponsor_chairs',
#     'agg_funding_sponsors',
#     'agg_exp_sponsors',
    'total_slips',
    'slips_perc_pro',
#     'bipartisan',
#     'ideol_range',
#     'first_word_approp',
]


In [25]:
X, y = raw[first_feature_set], raw.third_reading
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

## Run Models

In [26]:
models = [
    ('knn', {'n_neighbors': 17, 'weights': 'uniform'}, KNeighborsClassifier),
    ('log', {'C': 1, 'penalty': 'l1'}, LogisticRegression),
    ('bNB', {}, BernoulliNB),
    ('rf', {'max_depth': 5, 'min_samples_leaf': 6, 'n_estimators': 100}, RandomForestClassifier),
    ('svm', {'C': 0.4 }, LinearSVC),
    ('dt', {'max_depth': 5, 'min_samples_leaf': 10}, DecisionTreeClassifier),
    ('stupid_dt', {'max_depth': 5, 'min_samples_leaf': 13, 'min_impurity_decrease': 0.01}, DecisionTreeClassifier),
]

In [27]:
results = []
s = []
for name, fit_params, estimator in models:
    scores = evaluation.cross_validate(
        estimator,
        X_train,
        y_train,
        fit_params=fit_params,
        scoring=metrics,
        standardize=True if name in ['knn', 'log', 'svm'] else False
    )
    s.append((name, scores))
    res = evaluation.report_single_model_metrics(scores)
    model = res[['mean']].transpose()
    model.columns = ['roc_auc', 'accuracy', 'precision']
    model.index = [name]
    results.append(model)
pd.concat(results)

  mean=np.array(scores['test_' + metric]).mean(),
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  in_sample=np.array(scores['train_' + metric]).mean()


Unnamed: 0,roc_auc,accuracy,precision
knn,0.588916,0.844587,0.84561
log,0.652934,0.840175,0.84699
bNB,0.528218,0.844587,0.844587
rf,0.668828,0.846057,0.848897
svm,,0.843116,0.84745
dt,0.599827,0.829945,0.861791
stupid_dt,0.545121,0.844587,0.844587


In [11]:
# Stupid model is very close to being the best

# Best Model

In [28]:
rf = s[3][1]['estimators'][0]

In [29]:
for n, f in sorted(zip(first_feature_set, rf.feature_importances_), key=lambda x: x[1]):
    print(n, ':', f)

author_party : 0.012629888099653224
total_sponsors : 0.06737896111282707
author_years_sen : 0.09760856692968063
slips_perc_pro : 0.22758124259236445
author_total_funding : 0.2957373665770202
total_slips : 0.29906397468845425


In [5]:
best_model = ('rf', {'max_depth': 5, 'min_samples_leaf': 6, 'n_estimators': 100}, RandomForestClassifier)

In [31]:
rf = RandomForestClassifier(**{'max_depth': 5, 'min_samples_leaf': 6, 'n_estimators': 100})
rf.fit(X[first_feature_set], y)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
with open('flask_model.pickle', 'wb') as write_file:
    pickle.dump(rf, write_file)