In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score 
from sklearn.naive_bayes import GaussianNB

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, ClusterCentroids

from notify_run import Notify

plt.style.use('ggplot')

"""
grids have been reduced to best performing models.
"""

In [2]:
notify = Notify()
notify.register()

In [3]:
df = pd.read_csv('../data/df_freddie_mac.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
X_15yr_loan = df[df['original_loan_term'] == 180] 

X_15yr_loan = X_15yr_loan.dropna(subset=['target'])
y = X_15yr_loan.pop('target')
print(X_15yr_loan.shape, y.shape)

(1286120, 32) (1286120,)


In [5]:
## No Preprocessing
cols = ['original_combined_loan-to-value_(cltv)', 'original_debt-to-income_(dti)_ratio'
        , 'original_upb', 'original_loan-to-value_(ltv)', 'credit_score', 'number_of_units'
        , 'original_interest_rate', 'number_of_borrowers']

x_train, x_test, y_train, y_test = train_test_split(X_15yr_loan[cols]
                                                    , y)

x_train.isnull().sum()
y_train.isnull().sum()

precision_scores = []
recall_scores = []
f1_scores = []

model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', RandomForestClassifier(n_jobs = -1))
    ])

# best params from prior runs
params = [{'classification__criterion': ['gini']
          , 'classification__max_depth': [3]
          , 'classification__max_features' : [None]}]


grid = GridSearchCV(model, params, verbose=1)
grid.fit(x_train.sample(50_000), y_train.sample(50_000))

y_hat = grid.predict(x_test)

print(precision_score(y_test, y_hat), recall_score(y_test, y_hat), f1_score(y_test, y_hat))

notify.send('Model is done!')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.1670146386857395 0.2684011733803708 0.20590398305583338


In [6]:
lr_precision_scores = []
lr_recall_scores = []
lr_f1_scores = []

lr_model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', LogisticRegression(n_jobs = -1))
    ])

lr_params = [{'classification__penalty': ['l1', 'l2', 'none']
          , 'classification__C': [.01]}]

lr_grid = GridSearchCV(lr_model, lr_params, verbose=1)
lr_grid.fit(x_train.sample(50_000), y_train.sample(50_000))



Fitting 5 folds for each of 3 candidates, totalling 15 fits


Traceback (most recent call last):
  File "/home/austin/anaconda3/envs/dsienv/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/austin/anaconda3/envs/dsienv/lib/python3.7/site-packages/imblearn/pipeline.py", line 266, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/home/austin/anaconda3/envs/dsienv/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/austin/anaconda3/envs/dsienv/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/austin/anaconda3/envs/dsienv/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 5

GridSearchCV(estimator=Pipeline(steps=[('sampling', ADASYN()),
                                       ('classification',
                                        LogisticRegression(n_jobs=-1))]),
             param_grid=[{'classification__C': [0.01],
                          'classification__penalty': ['l1', 'l2', 'none']}],
             verbose=1)

In [7]:
lr_y_hat = lr_grid.predict(x_test)
print(precision_score(y_test, lr_y_hat), recall_score(y_test, lr_y_hat), f1_score(y_test, lr_y_hat))

0.1638210761706956 0.9466802155031923 0.27930840397218


In [9]:
nb_precision_scores = []
nb_recall_scores = []
nb_f1_scores = []

nb_model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', GaussianNB())
    ])


nb_grid = GridSearchCV(nb_model
                       , param_grid = [{'classification__var_smoothing': [.0000000001, .000000001]}]
                       , verbose=1)
nb_grid.fit(x_train.sample(50_000), y_train.sample(50_000))

nb_y_hat = nb_grid.predict(x_test)

print(precision_score(y_test, nb_y_hat), recall_score(y_test, nb_y_hat), f1_score(y_test, nb_y_hat))

notify.send('Model is done!')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
0.16630606761543668 0.7678164004831566 0.2733956854178045


In [10]:
nb_model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'sampling', 'classification', 'sampling__n_jobs', 'sampling__n_neighbors', 'sampling__random_state', 'sampling__sampling_strategy', 'classification__priors', 'classification__var_smoothing'])

In [11]:
nb2_precision_scores = []
nb2_recall_scores = []
nb2_f1_scores = []

nb2_model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', GaussianNB())
    ])


nb2_grid = GridSearchCV(nb2_model
                       , param_grid = [{'classification__var_smoothing': [.0000000001]}]
                       , verbose=1)
nb2_grid.fit(x_train.sample(800_000), y_train.sample(800_000))

nb2_y_hat = nb2_grid.predict(x_test)

print(precision_score(y_test, nb2_y_hat), recall_score(y_test, nb2_y_hat), f1_score(y_test, nb2_y_hat))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.16326642535921904 0.9703203788561459 0.2795035000759385


In [12]:
nb2_y_hat_proba = nb2_grid.predict_proba(x_test)

In [13]:
nb2_grid.classes_

array([0., 1.])

In [14]:
nb2_y_hat_proba

array([[0.35522608, 0.64477392],
       [0.44748215, 0.55251785],
       [0.35477638, 0.64522362],
       ...,
       [0.3215896 , 0.6784104 ],
       [0.34465962, 0.65534038],
       [0.32716147, 0.67283853]])

In [30]:
nb3_y_hat = np.where(nb2_y_hat_proba[:, 1] >= .5, 1, 0)

In [31]:
nb3_y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [32]:
print(precision_score(y_test, nb3_y_hat), recall_score(y_test, nb3_y_hat), f1_score(y_test, nb3_y_hat))

0.16326642535921904 0.9703203788561459 0.2795035000759385


In [33]:
nb3_precision_scores = []
nb3_recall_scores = []
nb3_f1_scores = []

nb3_model = Pipeline([
        ('sampling', RandomUnderSampler()),
        ('classification', GaussianNB())
    ])


nb3_grid = GridSearchCV(nb3_model
                       , param_grid = [{'classification__var_smoothing': [.0000000001]}]
                       , verbose=1)
nb3_grid.fit(x_train.sample(800_000), y_train.sample(800_000))

nb3_y_hat = nb3_grid.predict(x_test)

print(precision_score(y_test, nb3_y_hat), recall_score(y_test, nb3_y_hat), f1_score(y_test, nb3_y_hat))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.16405172823658062 0.9261844047778821 0.27873255649574896
