In [3]:
# Load IPython's autoreload extension so edited project modules can be
# re-imported without restarting the kernel.
%load_ext autoreload
# Mode 1: before each cell runs, reload only the modules registered via %aimport.
%autoreload 1

In [4]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

In [5]:
import sys
# Make the project's ../src directory importable; the path is relative to the
# working directory the notebook is run from.
sys.path.append('../src')
from data import fetch_model_data
from model import evaluation
# Register both project modules for autoreload (under `%autoreload 1` only
# %aimport-ed modules are reloaded).
%aimport data.fetch_model_data
%aimport model.evaluation

# Random Forest

In [6]:
# scikit-learn scorer names reported for the models in this notebook.
metrics = [
    'roc_auc',
    'accuracy',
    'precision',
]

## Train Test Split

In [7]:
# Address handed to the project's `fetch_model_data` helper (presumably the host
# serving the modelling table -- confirm against src/data/fetch_model_data).
# It can be overridden with the MODEL_DATA_HOST environment variable so the
# notebook can be pointed at another instance without editing this cell; the
# default is the address that was previously hard-coded here.
ip_address = os.environ.get('MODEL_DATA_HOST', '18.218.116.177')
raw = fetch_model_data.fetch_model_data(ip_address)

In [8]:
# Columns of `raw` used as model inputs, grouped by the prefix/subject of their
# names. Order is preserved because it determines the column order of X.
first_feature_set = [
    # author_* columns
    'author_ideology', 'author_party', 'author_is_chair',
    'author_years_sen', 'author_total_funding',
    # sponsor-related columns
    'total_sponsors', 'sponsor_chairs',
    'agg_funding_sponsors', 'agg_exp_sponsors',
    # slip columns
    'total_slips', 'slips_perc_pro',
    # remaining columns
    'bipartisan', 'ideol_range', 'first_word_approp',
]

In [9]:
# Feature matrix: the selected columns of `raw`; target: its `third_reading` column.
X = raw[first_feature_set]
y = raw['third_reading']

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=99,
)

## Initial

In [11]:
# Baseline: 5-fold cross-validation of an untuned 100-tree random forest on the
# training split, summarised by the project's reporting helper.
scores = cross_validate(
    # Fixed seed (same value as the train/test split) so reruns give the same
    # forest and therefore the same score table.
    RandomForestClassifier(n_estimators=100, random_state=99),
    X_train,
    y_train,
    return_train_score=True,  # also score on the training folds for comparison
    scoring=metrics,  # reuse the notebook-level metric list instead of repeating it
    cv=5,
)
evaluation.report_single_model_metrics(scores)

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.871098,0.014067,0.935296
1,accuracy,0.827588,0.01375,0.916129
2,precision,0.83937,0.014901,0.984336


In [9]:
# Somewhat overfit: ROC AUC falls from 0.935 in-sample to 0.871 under 5-fold CV

## Tune Parameters

In [16]:
# Search over tree depth and minimum leaf size with the number of trees held at
# 100. `evaluation.run_pipeline` is a project helper imported from ../src.
# NOTE(review): presumably it runs a grid search over `param_grid` and reports
# the same metric table as the baseline cell -- confirm in model/evaluation.
evaluation.run_pipeline(
    raw_data=raw,
    features=first_feature_set,
    estimator=RandomForestClassifier,  # passed as the class, not an instance
    param_grid={
        'n_estimators': [100],
        'max_depth': range(3, 20),  # 3..19 inclusive
        'min_samples_leaf': range(2, 25)  # 2..24 inclusive
    }
)

Best params: {'max_depth': 5, 'min_samples_leaf': 6, 'n_estimators': 100}


Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.879254,0.010841,0.905769
1,accuracy,0.834736,0.018264,0.839054
2,precision,0.832769,0.017996,0.838075


In [20]:
# Less overfit (ROC AUC 0.906 in-sample -> 0.879 mean)
# Only slightly better than the untuned forest (0.879 vs. 0.871)