In [1]:
# Enable IPython's autoreload extension in mode 1: only modules registered
# with %aimport are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 1

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

In [3]:
import sys
# Add the project's source directory to the import path; the relative path is
# resolved against the kernel's current working directory.
sys.path.append('../src')
from data import fetch_model_data
from model import evaluation
# Register both project modules with autoreload so edits to them are picked up
# without restarting the kernel.
%aimport data.fetch_model_data
%aimport model.evaluation

# Logistic Regression

In [7]:
# Scorer names (scikit-learn `scoring` strings) tracked for the models below.
metrics = [
    'roc_auc',
    'accuracy',
    'precision',
]

## Train Test Split

In [4]:
# Host address handed to the project's data-fetching helper.
# NOTE(review): hard-coded IP — consider moving it to configuration or an
# environment variable so the notebook survives a host change.
ip_address = '18.218.116.177'
# Fetch the data set; later cells select the feature columns and the
# `third_reading` target from `raw`.
raw = fetch_model_data.fetch_model_data(ip_address)

In [5]:
# Full (baseline) list of `raw` columns used as model features.
first_feature_set = [
    'author_ideology',
    'author_party',
    'author_is_chair',
    'author_years_sen',
    'author_total_funding',
    'total_sponsors',
    'sponsor_chairs',
    'agg_funding_sponsors',
    'agg_exp_sponsors',
    'total_slips',
    'slips_perc_pro',
    'bipartisan',
    'ideol_range',
    'first_word_approp',
]

In [6]:
# Separate the predictors from the target, then hold out 20% of the rows as a
# test set, stratified on the target so both splits keep its class balance.
X = raw[first_feature_set]
y = raw.third_reading
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=99,  # fixed seed so the split is reproducible
)

## Initial Model

In [9]:
# Baseline: 5-fold cross-validation of an untuned LogisticRegression on the
# training split. Scoring reuses the notebook-level `metrics` list instead of
# repeating the scorer names inline, so the metric set is defined in one place.
scores = cross_validate(
    LogisticRegression(),
    X_train,
    y_train,
    scoring=metrics,
    cv=5,
    return_train_score=True,  # also record the training-fold scores
)
evaluation.report_single_model_metrics(scores)



Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.799254,0.0291,0.79898
1,accuracy,0.601562,0.018437,0.60014
2,precision,0.807826,0.033364,0.806634


In [None]:
# Compared with the models tried so far:
#   - slightly lower ROC AUC score
#   - MUCH lower accuracy score

# With Tuned Parameters

In [14]:
# Search over the regularisation parameter C (0.1 through 1.0 in steps of 0.1)
# and the penalty type, using every feature in `first_feature_set`.
logit_param_grid = {
    'C': [tenth / 10 for tenth in range(1, 11)],
    'penalty': ['l1', 'l2'],
}
evaluation.run_pipeline(
    raw_data=raw,
    features=first_feature_set,
    estimator=LogisticRegression,
    param_grid=logit_param_grid,
)







Best params: {'C': 0.8, 'penalty': 'l1'}




Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.845707,0.033489,0.859915
1,accuracy,0.812565,0.025905,0.815628
2,precision,0.836627,0.023015,0.838744


In [13]:
# Best metrics so far (by a small margin)
# Slightly more variance in the scores (possibly a little more overfit)

In [19]:
# Hyper-parameters reported as best by the grid search on the full feature set.
best_params = dict(C=0.8, penalty='l1')

## Different Feature Sets

In [24]:
# Reduced feature set: `first_feature_set` without author_ideology,
# author_is_chair, agg_funding_sponsors, agg_exp_sponsors, bipartisan and
# first_word_approp (remaining columns keep their original order).
second_feature_set = [
    'author_party',
    'author_years_sen',
    'author_total_funding',
    'total_sponsors',
    'sponsor_chairs',
    'total_slips',
    'slips_perc_pro',
    'ideol_range',
]

# Repeat the same C / penalty grid search, this time restricted to the
# columns in `second_feature_set`.
logit_param_grid = {
    'C': [tenth / 10 for tenth in range(1, 11)],
    'penalty': ['l1', 'l2'],
}
evaluation.run_pipeline(
    raw_data=raw,
    features=second_feature_set,
    estimator=LogisticRegression,
    param_grid=logit_param_grid,
)







Best params: {'C': 1.0, 'penalty': 'l1'}




Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.83949,0.025849,0.845648
1,accuracy,0.798971,0.02554,0.806866
2,precision,0.833113,0.024142,0.8377


In [25]:
# Again, using all features seems to be slightly better
# TODO:
#     engineer the author_ideology feature to be more linear

#### Conclusion: using all features with {'C': 0.8, 'penalty': 'l1'} gives the best results