In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, SCORERS
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [2]:
home = 'C:\\Users\\iksri\\Documents\\Data Science\\Projects\\Credit Risk\\Home Loan Level Data - Freddie Mac\\Datasets'

In [3]:
y2010 = pd.DataFrame()
y2011 = pd.DataFrame()
y2012 = pd.DataFrame()
y2013 = pd.DataFrame()
y2014 = pd.DataFrame()
y2015 = pd.DataFrame()
y2016 = pd.DataFrame()
y2017 = pd.DataFrame()

data = [y2010,
        y2011,
        y2012,
        y2013,
        y2014,
        y2015,
        y2016,
        y2017]

In [4]:
i = 0
for yr in range(2010, 2017):
    data[i] = pd.read_csv(home + f"\\sample_{yr}.csv", index_col = 'Loan Sequence Number')
    i += 1

In [5]:
data[-1] = pd.read_csv(home + f"\\full_{yr}.csv", index_col = 'Loan Sequence Number')

If column not in test df, then remove column from train df

In [6]:
df = pd.concat(data).sample(n = 200000)

In [7]:
# some seller names (banks) have NULL values since that feature is not defined for some data partitions
# for instance, if bank X did not issue any loans in 2010, that feature is not present in y2010
# so that row has a NULL value under the feature bank X in the concatenated dataframe
# by substituting 0 for NULL values, we indicate the same thing; i.e. bank X did not issue a loan for a particular row
df.fillna(0, inplace = True)

## Model Training

In [8]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df.drop('Delinquent', axis = 1))

In [9]:
X = scaled_features
y = df['Delinquent'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [10]:
clf = LazyClassifier(verbose = 2,ignore_warnings = False, custom_metric = None)

In [11]:
# used smaller subsets of data for quick overview results
models, predictions = clf.fit(X_train[:10000], X_test[:2000], y_train[:10000], y_test[:2000])

  3%|▎         | 1/29 [00:00<00:21,  1.32it/s]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.8865, 'Balanced Accuracy': 0.501377151799687, 'ROC AUC': 0.501377151799687, 'F1 Score': 0.8350583747028286, 'Time taken': 0.7573685646057129}


  7%|▋         | 2/29 [00:01<00:22,  1.19it/s]

{'Model': 'BaggingClassifier', 'Accuracy': 0.88, 'Balanced Accuracy': 0.509358372456964, 'ROC AUC': 0.509358372456964, 'F1 Score': 0.8368401360401225, 'Time taken': 0.9032979011535645}
{'Model': 'BernoulliNB', 'Accuracy': 0.88, 'Balanced Accuracy': 0.5015962441314554, 'ROC AUC': 0.5015962441314554, 'F1 Score': 0.833504424548774, 'Time taken': 0.06116986274719238}


 21%|██        | 6/29 [00:11<00:41,  1.81s/it]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.887, 'Balanced Accuracy': 0.49971830985915494, 'ROC AUC': 0.49971830985915494, 'F1 Score': 0.834353471118177, 'Time taken': 9.591317892074585}
CategoricalNB model failed to execute
Negative values in data passed to CategoricalNB (input X)
{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.8005, 'Balanced Accuracy': 0.530547730829421, 'ROC AUC': 0.530547730829421, 'F1 Score': 0.8060500644260234, 'Time taken': 0.15801310539245605}
{'Model': 'DummyClassifier', 'Accuracy': 0.8875, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.8346026490066225, 'Time taken': 0.031951189041137695}
{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.8025, 'Balanced Accuracy': 0.5142097026604069, 'ROC AUC': 0.5142097026604069, 'F1 Score': 0.804185286088454, 'Time taken': 0.06470012664794922}


 31%|███       | 9/29 [00:12<00:22,  1.12s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.8855, 'Balanced Accuracy': 0.502754303599374, 'ROC AUC': 0.502754303599374, 'F1 Score': 0.8354792465857213, 'Time taken': 1.2471654415130615}
{'Model': 'GaussianNB', 'Accuracy': 0.1665, 'Balanced Accuracy': 0.5148982785602504, 'ROC AUC': 0.5148982785602504, 'F1 Score': 0.13166442093950095, 'Time taken': 0.04600930213928223}


 38%|███▊      | 11/29 [00:13<00:14,  1.26it/s]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.8785, 'Balanced Accuracy': 0.5104538341158059, 'ROC AUC': 0.510453834115806, 'F1 Score': 0.836801450895276, 'Time taken': 0.17528605461120605}


 41%|████▏     | 12/29 [00:19<00:31,  1.82s/it]

{'Model': 'LabelPropagation', 'Accuracy': 0.812, 'Balanced Accuracy': 0.517621283255086, 'ROC AUC': 0.517621283255086, 'F1 Score': 0.8097413335589517, 'Time taken': 6.27386474609375}


 48%|████▊     | 14/29 [00:26<00:34,  2.28s/it]

{'Model': 'LabelSpreading', 'Accuracy': 0.812, 'Balanced Accuracy': 0.517621283255086, 'ROC AUC': 0.517621283255086, 'F1 Score': 0.8097413335589517, 'Time taken': 6.933230876922607}
{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.8845, 'Balanced Accuracy': 0.5002503912363068, 'ROC AUC': 0.5002503912363068, 'F1 Score': 0.8340423841104874, 'Time taken': 0.1972970962524414}


 55%|█████▌    | 16/29 [00:29<00:23,  1.82s/it]

{'Model': 'LinearSVC', 'Accuracy': 0.8855, 'Balanced Accuracy': 0.5008137715179969, 'ROC AUC': 0.5008137715179969, 'F1 Score': 0.8345505714768118, 'Time taken': 2.858207941055298}
{'Model': 'LogisticRegression', 'Accuracy': 0.8875, 'Balanced Accuracy': 0.5019405320813771, 'ROC AUC': 0.5019405320813771, 'F1 Score': 0.8355657984917177, 'Time taken': 0.1419200897216797}


 69%|██████▉   | 20/29 [00:29<00:06,  1.45it/s]

{'Model': 'NearestCentroid', 'Accuracy': 0.6195, 'Balanced Accuracy': 0.6090453834115805, 'ROC AUC': 0.6090453834115805, 'F1 Score': 0.6894736338541764, 'Time taken': 0.09484219551086426}
NuSVC model failed to execute
specified nu is infeasible
{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.8505, 'Balanced Accuracy': 0.5063223787167449, 'ROC AUC': 0.5063223787167449, 'F1 Score': 0.8248862265428105, 'Time taken': 0.07036375999450684}
{'Model': 'Perceptron', 'Accuracy': 0.8415, 'Balanced Accuracy': 0.5090140845070422, 'ROC AUC': 0.5090140845070423, 'F1 Score': 0.8218309161707671, 'Time taken': 0.04770922660827637}
{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.164, 'Balanced Accuracy': 0.5076682316118936, 'ROC AUC': 0.5076682316118937, 'F1 Score': 0.12942857142857145, 'Time taken': 0.09448695182800293}


 83%|████████▎ | 24/29 [00:31<00:02,  2.08it/s]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.8875, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.8346026490066225, 'Time taken': 1.189483642578125}
{'Model': 'RidgeClassifier', 'Accuracy': 0.887, 'Balanced Accuracy': 0.49971830985915494, 'ROC AUC': 0.49971830985915494, 'F1 Score': 0.834353471118177, 'Time taken': 0.051892757415771484}
{'Model': 'RidgeClassifierCV', 'Accuracy': 0.887, 'Balanced Accuracy': 0.49971830985915494, 'ROC AUC': 0.49971830985915494, 'F1 Score': 0.834353471118177, 'Time taken': 0.1537470817565918}


 86%|████████▌ | 25/29 [00:31<00:01,  2.34it/s]

{'Model': 'SGDClassifier', 'Accuracy': 0.877, 'Balanced Accuracy': 0.49990610328638496, 'ROC AUC': 0.499906103286385, 'F1 Score': 0.8319275041927123, 'Time taken': 0.20046114921569824}


 90%|████████▉ | 26/29 [00:34<00:03,  1.06s/it]

{'Model': 'SVC', 'Accuracy': 0.8875, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.8346026490066225, 'Time taken': 3.3347270488739014}
StackingClassifier model failed to execute
__init__() missing 1 required positional argument: 'estimators'


 97%|█████████▋| 28/29 [00:36<00:00,  1.11it/s]

{'Model': 'XGBClassifier', 'Accuracy': 0.886, 'Balanced Accuracy': 0.5321439749608764, 'ROC AUC': 0.5321439749608764, 'F1 Score': 0.847966010316984, 'Time taken': 1.287501335144043}


100%|██████████| 29/29 [00:36<00:00,  1.25s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.89, 'Balanced Accuracy': 0.5188732394366197, 'ROC AUC': 0.5188732394366197, 'F1 Score': 0.8441078185660229, 'Time taken': 0.2133009433746338}





## Balancing Target Classes

It is clear from Balanced Accuracy scores of around 0.5 that none of these algorithms are able to overcome the class imbalance of the target features to make accurate predictions. Almost all models are overpredicting values of 0(no delinquency). We will use SMOTE to oversample the 1(delinquent) class and retest.

In [12]:
oversample = SMOTE()
print("BEFORE:", Counter(y))
X, y = oversample.fit_resample(X, y)
print("AFTER:", Counter(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

BEFORE: Counter({0: 176248, 1: 23752})
AFTER: Counter({0: 176248, 1: 176248})


### Benchmark - Logistic Regression

In [13]:
reg = LogisticRegression(max_iter = 5000)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_labels = (y_pred > 0.5).astype(int)
print(confusion_matrix(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels))

[[23172 12008]
 [11348 23972]]
              precision    recall  f1-score   support

           0       0.67      0.66      0.66     35180
           1       0.67      0.68      0.67     35320

    accuracy                           0.67     70500
   macro avg       0.67      0.67      0.67     70500
weighted avg       0.67      0.67      0.67     70500



### Other Algorithms

In [14]:
clf = LazyClassifier(verbose = 2, ignore_warnings = False, custom_metric = None)

In [15]:
# used smaller subsets of data for quick overview results
models, predictions = clf.fit(X_train[:10000], X_test[:2000], y_train[:10000], y_test[:2000])

  3%|▎         | 1/29 [00:00<00:25,  1.10it/s]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.7675, 'Balanced Accuracy': 0.7674424069816628, 'ROC AUC': 0.7674424069816629, 'F1 Score': 0.7674074803577954, 'Time taken': 0.9126555919647217}


  7%|▋         | 2/29 [00:01<00:26,  1.03it/s]

{'Model': 'BaggingClassifier', 'Accuracy': 0.818, 'Balanced Accuracy': 0.818259364334279, 'ROC AUC': 0.8182593643342791, 'F1 Score': 0.8166604924346186, 'Time taken': 0.995002031326294}
{'Model': 'BernoulliNB', 'Accuracy': 0.6305, 'Balanced Accuracy': 0.6304261738355645, 'ROC AUC': 0.6304261738355644, 'F1 Score': 0.6302697321694158, 'Time taken': 0.06310081481933594}


 14%|█▍        | 4/29 [00:11<01:24,  3.36s/it]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.6625, 'Balanced Accuracy': 0.6623964615681541, 'ROC AUC': 0.6623964615681541, 'F1 Score': 0.6620867713511325, 'Time taken': 9.513684272766113}
CategoricalNB model failed to execute
Negative values in data passed to CategoricalNB (input X)


 21%|██        | 6/29 [00:11<00:41,  1.82s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.744, 'Balanced Accuracy': 0.7439796958172624, 'ROC AUC': 0.7439796958172623, 'F1 Score': 0.7439861748801652, 'Time taken': 0.16091513633728027}
{'Model': 'DummyClassifier', 'Accuracy': 0.5015, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.3350013320013319, 'Time taken': 0.044727325439453125}
{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.6225, 'Balanced Accuracy': 0.6224771022939206, 'ROC AUC': 0.6224771022939206, 'F1 Score': 0.6224766872710261, 'Time taken': 0.047269344329833984}


 31%|███       | 9/29 [00:13<00:23,  1.17s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.7635, 'Balanced Accuracy': 0.7634933714403429, 'ROC AUC': 0.763493371440343, 'F1 Score': 0.7634984035926727, 'Time taken': 1.4868156909942627}
{'Model': 'GaussianNB', 'Accuracy': 0.5325, 'Balanced Accuracy': 0.53123428110853, 'ROC AUC': 0.53123428110853, 'F1 Score': 0.4310518230873112, 'Time taken': 0.049071550369262695}


 38%|███▊      | 11/29 [00:13<00:14,  1.21it/s]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.604, 'Balanced Accuracy': 0.6037084333759004, 'ROC AUC': 0.6037084333759004, 'F1 Score': 0.600199164006427, 'Time taken': 0.1663525104522705}


 41%|████▏     | 12/29 [00:19<00:31,  1.85s/it]

{'Model': 'LabelPropagation', 'Accuracy': 0.616, 'Balanced Accuracy': 0.615876542888886, 'ROC AUC': 0.615876542888886, 'F1 Score': 0.6153382965102473, 'Time taken': 6.277955532073975}


 45%|████▍     | 13/29 [00:26<00:47,  2.95s/it]

{'Model': 'LabelSpreading', 'Accuracy': 0.616, 'Balanced Accuracy': 0.615876542888886, 'ROC AUC': 0.615876542888886, 'F1 Score': 0.6153382965102473, 'Time taken': 7.020371437072754}


 48%|████▊     | 14/29 [00:27<00:34,  2.31s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.662, 'Balanced Accuracy': 0.6618919570276133, 'ROC AUC': 0.6618919570276133, 'F1 Score': 0.6615498099255324, 'Time taken': 0.2070324420928955}


 59%|█████▊    | 17/29 [00:30<00:17,  1.49s/it]

{'Model': 'LinearSVC', 'Accuracy': 0.6575, 'Balanced Accuracy': 0.6574114167027503, 'ROC AUC': 0.6574114167027503, 'F1 Score': 0.6571922152255577, 'Time taken': 2.972757577896118}
{'Model': 'LogisticRegression', 'Accuracy': 0.6645, 'Balanced Accuracy': 0.6643904795143156, 'ROC AUC': 0.6643904795143156, 'F1 Score': 0.6640407746882817, 'Time taken': 0.09753012657165527}
{'Model': 'NearestCentroid', 'Accuracy': 0.64, 'Balanced Accuracy': 0.6398497586478278, 'ROC AUC': 0.6398497586478278, 'F1 Score': 0.6390802329498966, 'Time taken': 0.09695625305175781}


 69%|██████▉   | 20/29 [00:37<00:14,  1.66s/it]

{'Model': 'NuSVC', 'Accuracy': 0.6825, 'Balanced Accuracy': 0.6824146417317756, 'ROC AUC': 0.6824146417317756, 'F1 Score': 0.6822334489158539, 'Time taken': 6.818224906921387}
{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.591, 'Balanced Accuracy': 0.5907743169688526, 'ROC AUC': 0.5907743169688526, 'F1 Score': 0.5886556181600711, 'Time taken': 0.06315016746520996}
{'Model': 'Perceptron', 'Accuracy': 0.5845, 'Balanced Accuracy': 0.5844107596968373, 'ROC AUC': 0.5844107596968373, 'F1 Score': 0.5841266143831216, 'Time taken': 0.06304740905761719}
{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.519, 'Balanced Accuracy': 0.5178856609709488, 'ROC AUC': 0.5178856609709487, 'F1 Score': 0.4418887835691991, 'Time taken': 0.08397507667541504}


 76%|███████▌  | 22/29 [00:38<00:09,  1.38s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.8105, 'Balanced Accuracy': 0.8106257956321608, 'ROC AUC': 0.8106257956321606, 'F1 Score': 0.8101813797477514, 'Time taken': 1.697622537612915}
{'Model': 'RidgeClassifier', 'Accuracy': 0.6605, 'Balanced Accuracy': 0.6603904435139916, 'ROC AUC': 0.6603904435139917, 'F1 Score': 0.6600352995727917, 'Time taken': 0.051517486572265625}


 83%|████████▎ | 24/29 [00:39<00:04,  1.06it/s]

{'Model': 'RidgeClassifierCV', 'Accuracy': 0.6605, 'Balanced Accuracy': 0.6603904435139916, 'ROC AUC': 0.6603904435139917, 'F1 Score': 0.6600352995727917, 'Time taken': 0.1540372371673584}


 86%|████████▌ | 25/29 [00:39<00:03,  1.24it/s]

{'Model': 'SGDClassifier', 'Accuracy': 0.643, 'Balanced Accuracy': 0.6428467856210707, 'ROC AUC': 0.6428467856210706, 'F1 Score': 0.6420512850597327, 'Time taken': 0.23663330078125}


 90%|████████▉ | 26/29 [00:44<00:05,  1.83s/it]

{'Model': 'SVC', 'Accuracy': 0.686, 'Balanced Accuracy': 0.6859131732185589, 'ROC AUC': 0.6859131732185589, 'F1 Score': 0.6857271858255783, 'Time taken': 5.473434209823608}
StackingClassifier model failed to execute
__init__() missing 1 required positional argument: 'estimators'


 97%|█████████▋| 28/29 [00:45<00:01,  1.25s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.9135, 'Balanced Accuracy': 0.9136657229915068, 'ROC AUC': 0.913665722991507, 'F1 Score': 0.9132472640214591, 'Time taken': 0.6353204250335693}


100%|██████████| 29/29 [00:45<00:00,  1.58s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.908, 'Balanced Accuracy': 0.9081821736395628, 'ROC AUC': 0.9081821736395628, 'F1 Score': 0.9076734053355487, 'Time taken': 0.2473006248474121}





### XGBoost

Sample Predictions

In [16]:
dmatrix = xgb.DMatrix(X_train, label=y_train)
params = {"objective": "reg:logistic", "max_depth": 10}
xgb_model = xgb.train(params, dmatrix)
y_pred = xgb_model.predict(xgb.DMatrix(X_test))
y_pred_labels = (y_pred > 0.5).astype(int)
print(confusion_matrix(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels))

[[33785  1395]
 [ 5711 29609]]
              precision    recall  f1-score   support

           0       0.86      0.96      0.90     35180
           1       0.96      0.84      0.89     35320

    accuracy                           0.90     70500
   macro avg       0.91      0.90      0.90     70500
weighted avg       0.91      0.90      0.90     70500



Cross Validation For More Relibale Assessment of Model Prediction

In [17]:
cv_results = xgb.cv(params, dmatrix, 10, nfold = 3, metrics = ["logloss", "auc"], as_pandas = True, seed = 123)
cv_results

Unnamed: 0,train-logloss-mean,train-logloss-std,train-auc-mean,train-auc-std,test-logloss-mean,test-logloss-std,test-auc-mean,test-auc-std
0,0.61,0.0,0.83,0.0,0.61,0.0,0.83,0.0
1,0.55,0.0,0.88,0.0,0.55,0.0,0.87,0.0
2,0.49,0.01,0.91,0.01,0.5,0.01,0.9,0.0
3,0.45,0.01,0.92,0.0,0.45,0.01,0.92,0.0
4,0.41,0.01,0.93,0.0,0.42,0.01,0.93,0.0
5,0.38,0.01,0.94,0.0,0.39,0.01,0.93,0.0
6,0.36,0.01,0.95,0.0,0.37,0.01,0.94,0.0
7,0.34,0.0,0.95,0.0,0.35,0.0,0.94,0.0
8,0.32,0.0,0.95,0.0,0.33,0.0,0.95,0.0
9,0.3,0.0,0.96,0.0,0.32,0.0,0.95,0.0


### LGBM

Sample Predictions

In [18]:
train_data = lgb.Dataset(X_train, label = y_train)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbose': 0
}
model = lgb.train(params, train_data, num_boost_round = 100)
y_pred = model.predict(X_test)
y_pred_labels = (y_pred > 0.5).astype(int)
print(confusion_matrix(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[[35022   158]
 [ 5010 30310]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93     35180
           1       0.99      0.86      0.92     35320

    accuracy                           0.93     70500
   macro avg       0.93      0.93      0.93     70500
weighted avg       0.93      0.93      0.93     70500



Cross Validation

In [19]:
cv_results = lgb.cv(params, train_data, num_boost_round = 100, nfold = 5, stratified = True, seed = 42)
cv_results

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


{'auc-mean': [0.7867609966344674,
  0.8164456295930067,
  0.8284801082102644,
  0.8406737579583019,
  0.8510652391623893,
  0.8610338010391135,
  0.8700888537447033,
  0.880242972846007,
  0.8895046557485047,
  0.8964469372042088,
  0.9032657978661321,
  0.9120911659040052,
  0.9156435859382988,
  0.9213939974405374,
  0.9252405882889215,
  0.9288720710279577,
  0.9328242316895402,
  0.9353369179491372,
  0.9381244675169593,
  0.9400979169676196,
  0.9415653423832022,
  0.9427461933677236,
  0.9441348080526458,
  0.944828525007457,
  0.9461257028971211,
  0.9472033529959146,
  0.9479994144927026,
  0.9487477622169574,
  0.9493121854935203,
  0.949834335638338,
  0.9503268834435852,
  0.9508624230320561,
  0.951340300321796,
  0.9516544170918291,
  0.9519899726038451,
  0.9522194646591522,
  0.9525053753742772,
  0.9527520221267283,
  0.9530011027542196,
  0.9531952632214618,
  0.9534115418874866,
  0.9536201049617731,
  0.9538167707188798,
  0.9539523203223957,
  0.9541542765415564,
  

### Fine Tuning LGBM With Grid Search

In [20]:
train_data = lgb.Dataset(X_train, label = y_train)
param_grid = {
    'objective': ['binary'],
    'metric': ['auc'],
    'max_depth': [2, 4, 8],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'bagging_freq': [0, 1] #,
    # 'l1': [0.01, 0.05, 0.1],
    # 'l2': [0.01, 0.05, 0.1],
}
model = lgb.LGBMClassifier()

In [21]:
early_stopping_rounds = 10
eval_set = [(X_test, y_test)]
grid_search = GridSearchCV(model, param_grid, scoring = 'roc_auc', cv = 5, verbose = 2, n_jobs = -1)
grid_search.fit(X_train, y_train, early_stopping_rounds = early_stopping_rounds, eval_set = eval_set, eval_metric = 'auc')
best_params = grid_search.best_params_
best_score = grid_search.best_score_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[1]	valid_0's auc: 0.778971
[2]	valid_0's auc: 0.787959
[3]	valid_0's auc: 0.818138
[4]	valid_0's auc: 0.832438
[5]	valid_0's auc: 0.84124
[6]	valid_0's auc: 0.852437
[7]	valid_0's auc: 0.857025
[8]	valid_0's auc: 0.865944
[9]	valid_0's auc: 0.872054
[10]	valid_0's auc: 0.882018
[11]	valid_0's auc: 0.886603
[12]	valid_0's auc: 0.893237
[13]	valid_0's auc: 0.901037
[14]	valid_0's auc: 0.905297
[15]	valid_0's auc: 0.908146
[16]	valid_0's auc: 0.912275
[17]	valid_0's auc: 0.91465
[18]	valid_0's auc: 0.920103
[19]	valid_0's auc: 0.922778
[20]	valid_0's auc: 0.924262
[21]	valid_0's auc: 0.926008
[22]	valid_0's auc: 0.927369
[23]	valid_0's auc: 0.928819
[24]	valid_0's auc: 0.931022
[25]	valid_0's auc: 0.933373
[26]	valid_0's auc: 0.935579
[27]	valid_0's auc: 0.936875
[28]	valid_0's auc: 0.938132
[29]	valid_0's auc: 0.939058
[30]	valid_0's auc: 0.940236
[31]	valid_0's auc: 0.941051
[32]	valid_0's auc: 0.941725
[33]	valid_0's auc: 0

In [22]:
best_params

{'bagging_freq': 0,
 'learning_rate': 0.1,
 'max_depth': 8,
 'metric': 'auc',
 'n_estimators': 200,
 'objective': 'binary'}