## 0. Data Load

In [7]:
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb

from IPython.display import display

from sklearn import metrics
from sklearn.model_selection import (train_test_split, GridSearchCV)

pd.set_option('display.max_columns', 200)

In [9]:
df = pd.read_csv('/content/drive/MyDrive/LendingClub_ym/lgb_selected_default.csv')
df = df.loc[df['loan_status'] !=' Current']
train, test = train_test_split(df,test_size = 0.2, random_state = 2020)
y_train = train.pop('loan_status')
y_test = test.pop('loan_status')
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

## Grid Search

### (1) Initiate a model

In [13]:
params = {
    'application': 'binary', 
#     'num_class' : 1, # used for multi-classes
    'boosting': 'gbdt', 
    'num_iterations': 100, 
    'learning_rate': 0.05,
    'num_leaves': 62,
    'max_depth': -1,
    'max_bin': 510, 
    'lambda_l1': 5, 
    'lambda_l2': 10, 
    'metric' : 'binary_error',
    'subsample_for_bin': 200,
    'subsample': 1,
    'colsample_bytree': 0.8, 
    'min_split_gain': 0.5, 
    'min_child_weight': 1, 
    'min_child_samples': 5
}

# Initiate classifier to use
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', 
          objective = 'binary', 
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'])

mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'max_bin'])

### (2) Grid Search

In [19]:
gridParams = {
    'bagging_fraction': [0.6, 0.8],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [125, 255],
    'max_depth': [10,20],
    'reg_alpha' : [0.5,1]
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)

grid.fit(train, y_train)

print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 32 candidates, totalling 128 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed: 110.3min finished


{'bagging_fraction': 0.6, 'learning_rate': 0.1, 'max_depth': 20, 'num_leaves': 255, 'reg_alpha': 1}
0.9918762929763447


In [14]:
gridParams = {
    'bagging_fraction': [0.6, 0.8],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [125, 255],
    'max_depth': [10,20],
    'reg_alpha' : [0.5,1]
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)

grid.fit(train, y_train)

print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 32 candidates, totalling 128 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed: 108.6min finished


{'bagging_fraction': 0.6, 'learning_rate': 0.1, 'max_depth': 20, 'num_leaves': 255, 'reg_alpha': 1}
0.9918762929763447


In [15]:
params['bagging_fraction'] = grid.best_params_['bagging_fraction']
params['learning_rate'] = grid.best_params_['learning_rate']
params['num_leaves'] = grid.best_params_['num_leaves']
params['max_depth'] = grid.best_params_['max_depth']
params['reg_alpha'] = grid.best_params_['reg_alpha']

### (3) Prediction

In [11]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

In [12]:
def make_lgb_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=5):
    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) 
        return 'f1', f1_score(y_true, y_hat, average='binary'), True
    
    skf = KFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            categorical_feature=categorical_features,
            verbose_eval=200,
            early_stopping_rounds=100,
            feval=lgb_f1_score
        )

        feature_importance[f'fold_{fold+1}'] = clf.feature_importance()

        y_pred_val = clf.predict(x_val)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [10]:
remove_features = ['id','loan_status']
features = [col for col in list(train) if col not in remove_features]

In [16]:
y_oof_lgb, y_preds_lgb, fi_lgb = make_lgb_prediction(train, y_train, test, features, model_params=params)

Fold: 1
(1446827, 41) (361707, 41)
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_error: 0.00726072	training's f1: 0.995839	valid_1's binary_error: 0.00855665	valid_1's f1: 0.995101
Fold 1 | F1 Score: 0.991443350557219
Fold: 2
(1446827, 41) (361707, 41)
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_error: 0.00743835	training's f1: 0.995738	valid_1's binary_error: 0.0085373	valid_1's f1: 0.995113
Fold 2 | F1 Score: 0.9914627032377035
Fold: 3
(1446827, 41) (361707, 41)
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_error: 0.00733467	training's f1: 0.995797	valid_1's binary_error: 0.00870041	valid_1's f1: 0.995018
Fold 3 | F1 Score: 0.9912995877879057
Fold: 4
(1446827, 41) (361707, 41)
Training until validation scores don't impr