In [54]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score 
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, ComplementNB

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, ClusterCentroids

from notify_run import Notify

plt.style.use('ggplot')

In [2]:
notify = Notify()
notify.register()

In [3]:
df = pd.read_csv('../data/df_preprocessed_freddie_mac.csv')

In [4]:
X = df[df['original_loan_term'] == 180].copy()
y = X.pop('target')

In [5]:
X.shape

(1286120, 51)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [7]:
nb_model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', GaussianNB())
    ])


nb_grid = GridSearchCV(nb_model
                       , param_grid = [{'classification__var_smoothing': [.0000000001]}]
                       , verbose=1
                       , n_jobs=-1)
nb_grid.fit(x_train.sample(800_000), y_train.sample(800_000))

nb_y_hat = nb_grid.predict(x_test)

print(precision_score(y_test, nb_y_hat), recall_score(y_test, nb_y_hat), f1_score(y_test, nb_y_hat))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.1605564543766388 0.7752294460585304 0.26601830852472114


In [77]:
nb_model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', ComplementNB())
    ])


nb_grid = GridSearchCV(nb_model
                       , param_grid = [{'classification__alpha': [10]}]
                       , verbose=2
                       , n_jobs=-1)
nb_grid.fit(x_train.sample(500_000), y_train.sample(500_000))

nb_y_hat = nb_grid.predict(x_test)

print(precision_score(y_test, nb_y_hat), recall_score(y_test, nb_y_hat), f1_score(y_test, nb_y_hat))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.15808612085119014 0.491851538298732 0.2392687994908179


In [76]:
nb_grid.best_params_

{'classification__alpha': 10}

In [38]:
precision_scores = []
recall_scores = []
f1_scores = []

model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', RandomForestClassifier(n_jobs = -1))
    ])

# best params from prior runs
params = [{'classification__criterion': ['gini']
          , 'classification__max_depth': [5,10,15,None]
          , 'classification__max_features': ['sqrt','log2',None]
          , 'classification__n_estimators': [10] }]


grid = GridSearchCV(model, params, verbose=1, n_jobs = -1)
grid.fit(x_train.sample(100_000), y_train.sample(100_000))

y_hat = grid.predict(x_test)

print(precision_score(y_test, y_hat), recall_score(y_test, y_hat), f1_score(y_test, y_hat))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
0.12811818842334705 0.02035672368345102 0.03513141073533562


In [39]:
grid.best_params_

{'classification__criterion': 'gini',
 'classification__max_depth': 10,
 'classification__max_features': None,
 'classification__n_estimators': 10}

In [41]:
model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', RandomForestClassifier(n_jobs = -1))
    ])

# best params from prior runs
params = [{'classification__criterion': ['gini']
          , 'classification__max_depth': [10]
          , 'classification__max_features': [None]
          , 'classification__n_estimators': [100] }]


grid = GridSearchCV(model, params, verbose=1, n_jobs = -1)
grid.fit(x_train, y_train)

y_hat = grid.predict(x_test)

print(precision_score(y_test, y_hat), recall_score(y_test, y_hat), f1_score(y_test, y_hat))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.3564044284771611 0.3741173301521944 0.3650461376714322


In [45]:
filename = 'finalized_rf_adasyn_model.sav'
pickle.dump(grid, open(filename, 'wb'))

In [18]:
lr_precision_scores = []
lr_recall_scores = []
lr_f1_scores = []

lr_model = Pipeline([
        ('sampling', ADASYN()),
        ('classification', LogisticRegression(n_jobs = -1))
    ])

lr_params = [{'classification__penalty': ['none']}]
          #, 'classification__C': [200]}]

lr_grid = GridSearchCV(lr_model, lr_params, verbose=1, n_jobs=-1)
lr_grid.fit(x_train.sample(100_000), y_train.sample(100_000))

lr_y_hat = lr_grid.predict(x_test)

print(precision_score(y_test, lr_y_hat), recall_score(y_test, lr_y_hat), f1_score(y_test, lr_y_hat))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.16164277050353 1.0 0.27830030816352214


In [10]:
lr_y_hat.sum()

321530.0

In [11]:
y_test.sum()

51973.0

In [12]:
lr_grid.best_estimator_

Pipeline(steps=[('sampling', ADASYN()),
                ('classification',
                 LogisticRegression(C=200, n_jobs=-1, penalty='none'))])

In [13]:
lr_grid.best_estimator_


Pipeline(steps=[('sampling', ADASYN()),
                ('classification',
                 LogisticRegression(C=200, n_jobs=-1, penalty='none'))])

# predicting 2005 data


In [27]:
df_05 = pd.read_csv('../data/df_preprocessed_freddie_mac_2005.csv')

In [28]:
X_05 = df_05[df_05['original_loan_term'] == 180].copy()
y_05 = X_05.pop('target')

In [95]:
from scipy.stats import binom
print(y_train.sum()/y_train.count())
print(y_test.shape[0])
#base_line_model = binom.rvs(1, p=y_train.sum()/y_train.count(), size=y_test.shape[0])
baseline_05 = binom.rvs(1, p=.16222125462631792, size=y_05.shape[0])
f1_score(y_05, baseline_05)

0.162676370271307
321530


0.10585442301425978

In [31]:
y_hat_05 = lr_grid.predict(X_05)
print(precision_score(y_05, y_hat_05), recall_score(y_05, y_hat_05), f1_score(y_05, y_hat_05))

0.07897229324757879 1.0 0.14638428390015754


In [36]:
y_05.sum()/y_05.shape[0]

0.07897229324757879

In [37]:
y_hat_05.sum()/y_hat_05.shape[0]

0.6812619094860343

In [32]:
y_hat_05 = nb_grid.predict(X_05)
print(precision_score(y_05, y_hat_05), recall_score(y_05, y_hat_05), f1_score(y_05, y_hat_05))

0.07377265383727703 0.6364067314529509 0.13221846331122306


In [46]:
y_hat_rf_adasyn = grid.predict(X_05)

print(precision_score(y_05, y_hat_rf_adasyn)
      , recall_score(y_05, y_hat_rf_adasyn)
      , f1_score(y_05, y_hat_rf_adasyn))

0.07513330101793504 0.41147191277553924 0.12706498905747934
