In [1]:
%load_ext autoreload
%autoreload 1

In [33]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
import sys
sys.path.append('../src')
from data import fetch_model_data
from model import evaluation
%aimport data.fetch_model_data
%aimport model.evaluation

# Naive Bayes

In [11]:
metrics=['roc_auc', 'accuracy', 'precision']

## Train Test Split

In [12]:
ip_address = '18.218.116.177'
raw = fetch_model_data.fetch_model_data(ip_address)

In [13]:
first_feature_set = [
    'author_ideology',
    'author_party',
    'author_is_chair',
    'author_years_sen',
    'author_total_funding',
    'total_sponsors',
    'sponsor_chairs',
    'agg_funding_sponsors',
    'agg_exp_sponsors',
    'total_slips',
    'slips_perc_pro',
    'bipartisan',
    'ideol_range',
    'first_word_approp',
]

In [14]:
X, y = raw[first_feature_set], raw.third_reading
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

## Gaussian

In [15]:
scores = cross_validate(GaussianNB(), X_train, y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.796013,0.028032,0.799573
1,accuracy,0.803282,0.016401,0.807401
2,precision,0.848561,0.004257,0.85267


## Bernoulli

In [17]:
scores = cross_validate(BernoulliNB(), X_train, y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.833535,0.010121,0.833343
1,accuracy,0.808991,0.014163,0.812052
2,precision,0.825905,0.012571,0.826501


In [18]:
# Higher ROC but worse precision
    # Might be able to tune with threshold

## Multinomial

In [None]:
scores = cross_validate(MultinomialNB(), X_train, y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)
# Errors b/c of negative values

## Stacked

In [24]:
cat_features = [
    'author_party',
    'author_is_chair',
    'bipartisan',
    'first_word_approp',
]

count_features = [
    'author_years_sen',
    'total_sponsors',
    'sponsor_chairs',
    'agg_exp_sponsors',
    'total_slips',
]

cont_features = [
    'author_ideology',
    'author_total_funding',
    'agg_funding_sponsors',
    'slips_perc_pro',
    'ideol_range',
]

### Individually

In [51]:
scores = cross_validate(BernoulliNB(), X_train[cat_features], y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)
# Just categoricals

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.672511,0.01431,0.674444
1,accuracy,0.667358,0.018401,0.66738
2,precision,0.807921,0.016595,0.808272


In [None]:
# strange that this did so much worse
# I guess BernoulliNB can get meaningful info from continuous/count data
    # also might be that many of the features have lots of observations at 0
    # So the count data behaves like categorical

In [26]:
scores = cross_validate(GaussianNB(), X_train[cont_features], y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)
# Just continuous

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.80046,0.033529,0.800338
1,accuracy,0.814711,0.018691,0.812053
2,precision,0.854451,0.009675,0.852317


In [50]:
scores = cross_validate(MultinomialNB(), X_train[count_features], y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)
# Just count

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.805835,0.01921,0.805682
1,accuracy,0.656636,0.011113,0.65236
2,precision,0.810848,0.014798,0.804794


### Voting

In [38]:
bern_pipe = Pipeline([
    ('categorical_features', ColumnTransformer([('feature_subset', 'passthrough', cat_features)])),
    ('bern', BernoulliNB())
])
multi_pipe = Pipeline([
    ('count_features', ColumnTransformer([('feature_subset', 'passthrough', count_features)])),
    ('bern', MultinomialNB())
])
gauss_pipe = Pipeline([
    ('cont_features', ColumnTransformer([('feature_subset', 'passthrough', cont_features)])),
    ('bern', BernoulliNB())
])

In [49]:
vc = VotingClassifier(
    estimators=[('bern', bern_pipe), ('multi', multi_pipe), ('gauss', gauss_pipe)],
    voting='soft'
)
scores = cross_validate(vc, X_train, y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)
# Soft voting

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.835331,0.011689,0.83535
1,accuracy,0.708134,0.015508,0.706902
2,precision,0.802021,0.01035,0.800004


In [48]:
vc = VotingClassifier(
    estimators=[('bern', bern_pipe), ('multi', multi_pipe), ('gauss', gauss_pipe)],
    voting='hard'
)
scores = cross_validate(vc, X_train, y_train, return_train_score=True, scoring=['accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores, metrics=['accuracy', 'precision'])
# Hard voting -> looks a little better

Unnamed: 0,metric,mean,std,in_sample
0,accuracy,0.712424,0.015022,0.709047
1,precision,0.806737,0.010437,0.802749


In [52]:
# Gaussian was the best one individually so let's weight that one higher

In [54]:
vc = VotingClassifier(
    estimators=[('bern', bern_pipe), ('multi', multi_pipe), ('gauss', gauss_pipe)],
    voting='hard',
    weights=[1, 1, 5]
)
scores = cross_validate(vc, X_train, y_train, return_train_score=True, scoring=['accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores, metrics=['accuracy', 'precision'])
# Hard voting -> looks a little better

Unnamed: 0,metric,mean,std,in_sample
0,accuracy,0.806129,0.015025,0.80615
1,precision,0.825847,0.014117,0.825339


In [None]:
# Seems like it's just approaching the Gaussian by itself
    # Other things aren't adding much

**Takeaways**

Seems like Bernoulli and Multinomial aren't very good by themselves. Could try stacking but I doubt it will help.

Gaussian individually with just continuous features did just as good as when it had all of them.

Bernoulli did much better with all the data. Seems like it makes everything <=zero a 0 and everything else a 1 by default.

TODO: try mixing Guassian on just continuous, Bernoulii on everything. If promising, could coerce features to binary manually according to some unique threshhold for each column

In [59]:
vc = VotingClassifier(
    estimators=[('bern', BernoulliNB()), ('gauss', gauss_pipe)],
    voting='soft',
)
scores = cross_validate(vc, X_train, y_train, return_train_score=True, scoring=['roc_auc', 'accuracy', 'precision'], cv=5)
evaluation.report_single_model_metrics(scores)
# Hard voting -> looks a little better

Unnamed: 0,metric,mean,std,in_sample
0,roc_auc,0.8312,0.010319,0.830936
1,accuracy,0.806126,0.016046,0.80776
2,precision,0.824793,0.013367,0.825192


In [60]:
# Still didn't do meaningfully better than just straight Bernoulli

#### Conclusion: Bernoulli on everything is best