In [1]:
%load_ext autoreload
%autoreload 1

In [55]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [18]:
import sys
sys.path.append('../src')
from data import fetch_model_data
from model import evaluation
%aimport data.fetch_model_data
%aimport model.evaluation

# Decision Tree

In [4]:
metrics=['roc_auc', 'accuracy', 'precision']

## Train Test Split

In [5]:
ip_address = '18.218.116.177'
raw = fetch_model_data.fetch_model_data(ip_address)

In [6]:
first_feature_set = [
    'author_ideology',
    'author_party',
    'author_is_chair',
    'author_years_sen',
    'author_total_funding',
    'total_sponsors',
    'sponsor_chairs',
    'agg_funding_sponsors',
    'agg_exp_sponsors',
    'total_slips',
    'slips_perc_pro',
    'bipartisan',
    'ideol_range',
    'first_word_approp',
]

In [7]:
X, y = raw[first_feature_set], raw.third_reading
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

## Initial

In [8]:
scores = cross_validate(DecisionTreeClassifier(), X_train, y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.778351,0.019629,0.95445
1,accuracy,0.79254,0.013709,0.916129
2,precision,0.849366,0.009913,0.995764


In [9]:
# Super overfit -> .95 => 0.78

## Tune Parameters

In [10]:
evaluation.run_pipeline(
    raw_data=raw,
    features=first_feature_set,
    estimator=DecisionTreeClassifier,
    param_grid={ 'max_depth': range(3, 20), 'min_samples_leaf': range(2, 15) }
)

Best params: {'max_depth': 5, 'min_samples_leaf': 10}


Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.873949,0.006306,0.899631
1,accuracy,0.824739,0.011899,0.845671
2,precision,0.844085,0.015868,0.866343


In [20]:
# Damn!
# Not very overfit and the best results so far

## Explore Tuned Model

In [12]:
dt_params = {'max_depth': 5, 'min_samples_leaf': 10}
dt = DecisionTreeClassifier(**dt_params)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [13]:
r = export_text(dt, feature_names=first_feature_set)
print(r)

|--- author_total_funding <= 3058489.12
|   |--- total_slips <= 5.50
|   |   |--- slips_perc_pro <= 0.55
|   |   |   |--- author_ideology <= 0.43
|   |   |   |   |--- agg_funding_sponsors <= 2630079.75
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- agg_funding_sponsors >  2630079.75
|   |   |   |   |   |--- class: 1
|   |   |   |--- author_ideology >  0.43
|   |   |   |   |--- author_ideology <= 0.51
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- author_ideology >  0.51
|   |   |   |   |   |--- class: 1
|   |   |--- slips_perc_pro >  0.55
|   |   |   |--- agg_exp_sponsors <= 8.50
|   |   |   |   |--- author_total_funding <= 275584.00
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- author_total_funding >  275584.00
|   |   |   |   |   |--- class: 1
|   |   |   |--- agg_exp_sponsors >  8.50
|   |   |   |   |--- class: 1
|   |--- total_slips >  5.50
|   |   |--- author_ideology <= -0.57
|   |   |   |--- author_total_funding <= 1177822.38
|   |   |   |   |--- slips_p

In [19]:
# This looks pretty non-sensical
    # I don't remember author_funding or author_ideology having a big signal/noise ratio
# I'm skeptical of these results

In [24]:
for n, f in sorted(zip(first_feature_set, dt.feature_importances_), key=lambda x: x[1]):
    print(n, ':', f)


author_party : 0.0
author_is_chair : 0.0
author_years_sen : 0.0
total_sponsors : 0.0
sponsor_chairs : 0.0
bipartisan : 0.0
ideol_range : 0.0
agg_exp_sponsors : 0.000995173174290333
agg_funding_sponsors : 0.0012657381832484678
first_word_approp : 0.0035298076429553745
slips_perc_pro : 0.021701780934535113
author_ideology : 0.026850459061476647
total_slips : 0.10117304980590447
author_total_funding : 0.8444839911975895


## Different Features

In [25]:
second_feature_set = [
    'author_ideology',
    'author_party',
    'author_is_chair',
    'author_years_sen',
#     'author_total_funding',
    'total_sponsors',
    'sponsor_chairs',
    'agg_funding_sponsors',
    'agg_exp_sponsors',
    'total_slips',
    'slips_perc_pro',
    'bipartisan',
    'ideol_range',
    'first_word_approp',
]

In [61]:
evaluation.run_pipeline(
    raw_data=raw,
    features=second_feature_set,
    estimator=DecisionTreeClassifier,
    param_grid={
        'max_depth': range(3, 20),
        'min_samples_leaf': range(2, 15),
    }
)

Best params: {'max_depth': 5, 'min_samples_leaf': 13}


Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.868916,0.015438,0.891504
1,accuracy,0.830458,0.018431,0.83977
2,precision,0.8454,0.025977,0.852768


In [None]:
# Not that much worse after removing most important feature

## Super Simple Tree

In [82]:
scores = cross_validate(
    DecisionTreeClassifier(**{'max_depth': 5, 'min_samples_leaf': 13, 'min_impurity_decrease': 0.01}),
    X_train[second_feature_set],
    y_train,
    return_train_score=True,
    scoring=['roc_auc', 'accuracy', 'precision'],
    cv=5,
    return_estimator=True
)
dt = scores['estimator'][0]
r = export_text(dt, feature_names=second_feature_set)
print(r)
evaluation.report_single_model_metrics(scores)
# Shocking how well the model does on two splits

|--- total_slips <= 0.50
|   |--- author_years_sen <= 14.50
|   |   |--- class: 1
|   |--- author_years_sen >  14.50
|   |   |--- class: 0
|--- total_slips >  0.50
|   |--- class: 1



Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.82996,0.015726,0.833285
1,accuracy,0.832593,0.017079,0.832616
2,precision,0.831295,0.019413,0.830786


In [74]:
# perhaps good support that I should have a "has_slips" boolean