### 0. Data Load

In [4]:
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold , GroupKFold

import lightgbm as lgb

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import display

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV


pd.set_option('display.max_columns', 200)

In [5]:
df = pd.read_pickle('/home/work/toy-project/data/lgb_selected.pkl')

#### Train (Before Year 2018) & Test (Year 2018) for Prediction

In [6]:
df['issue_d'] = pd.to_datetime(df['issue_d'])
df['issue_year'] = df['issue_d'].dt.year

In [7]:
train = df[df['issue_year'] != 2018]
test = df[df['issue_year'] == 2018]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

answer = test['loan_status']

In [8]:
# Load Data with selected features
X = train.copy()
# Labels
y = train['loan_status']

# Remove Labels from Dataframe
X.drop(['loan_status'], axis = 1, inplace = True)
# Final Data Shapes
print(X.shape)
print(y.shape)

(1314290, 63)
(1314290,)


#### Train(Before year 2017) & Test(Year 2018) for Optimization

In [9]:
ttrain = train[train['issue_year'] != 2017]
ttest = train[train['issue_year'] == 2017]

In [10]:
X_train = ttrain.copy()
y_train = ttrain['loan_status']
X_train.drop(['loan_status'], axis = 1, inplace = True)

X_test = ttest.copy()
y_test = ttest['loan_status']
X_test.drop(['loan_status'], axis = 1, inplace = True)

In [11]:
remove_features = ['earliest_cr_line', 'issue_d', 'last_credit_pull_d', 'last_pymnt_d', 'next_pymnt_d',
                  'initial_list_status','out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','total_rec_int','total_rec_late_fee',
                   'recoveries','collection_recovery_fee','last_pymnt_amnt','last_fico_range_high','last_fico_range_low' , 'id', 'loan_status' , 'issue_year' ]
features  = [col for col in list(train) if col not in remove_features]

### Grid Search

#### (1) Initiate a model

In [12]:
params = {
    'application': 'binary',
    'boosting': 'gbdt', 
    'num_iterations': 100, 
    'learning_rate': 0.05,
    'num_leaves': 62,
    'max_depth': -1,
    'max_bin': 510, 
    'lambda_l1': 5, 
    'lambda_l2': 10, 
    'metric' : 'binary_error',
    'subsample_for_bin': 200,
    'subsample': 1,
    'colsample_bytree': 0.8, 
    'min_split_gain': 0.5, 
    'min_child_weight': 1, 
    'min_child_samples': 5
}

mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', 
          objective = 'binary', 
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'])

mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'max_bin'])

In [13]:
gridParams = {
    'bagging_fraction': [0.6, 0.8],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [125, 255],
    'max_depth': [10,20],
    'reg_alpha' : [0.5,1]
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)

grid.fit(X_train[features], y_train, early_stopping_rounds=100, eval_metric='auc',
         eval_set=[(X_train[features], y_train), (X_test[features], y_test)])

print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 32 candidates, totalling 128 fits
[1]	valid_0's auc: 0.750068	valid_0's binary_logloss: 0.485556	valid_1's auc: 0.710997	valid_1's binary_logloss: 0.575477
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.752822	valid_0's binary_logloss: 0.47316	valid_1's auc: 0.71495	valid_1's binary_logloss: 0.562778
[3]	valid_0's auc: 0.754143	valid_0's binary_logloss: 0.463625	valid_1's auc: 0.717379	valid_1's binary_logloss: 0.553078
[4]	valid_0's auc: 0.755177	valid_0's binary_logloss: 0.455969	valid_1's auc: 0.718774	valid_1's binary_logloss: 0.545397
[5]	valid_0's auc: 0.755885	valid_0's binary_logloss: 0.449683	valid_1's auc: 0.719624	valid_1's binary_logloss: 0.539243
[6]	valid_0's auc: 0.756593	valid_0's binary_logloss: 0.44442	valid_1's auc: 0.720341	valid_1's binary_logloss: 0.534156
[7]	valid_0's auc: 0.757202	valid_0's binary_logloss: 0.439985	valid_1's auc: 0.720878	valid_1's binary_logloss: 0.529819
[8]	valid_0's auc: 0.7576

[67]	valid_0's auc: 0.77963	valid_0's binary_logloss: 0.401693	valid_1's auc: 0.741223	valid_1's binary_logloss: 0.496617
[68]	valid_0's auc: 0.779872	valid_0's binary_logloss: 0.401553	valid_1's auc: 0.741308	valid_1's binary_logloss: 0.496551
[69]	valid_0's auc: 0.780125	valid_0's binary_logloss: 0.401403	valid_1's auc: 0.741468	valid_1's binary_logloss: 0.49644
[70]	valid_0's auc: 0.780436	valid_0's binary_logloss: 0.401227	valid_1's auc: 0.741577	valid_1's binary_logloss: 0.496376
[71]	valid_0's auc: 0.78069	valid_0's binary_logloss: 0.401076	valid_1's auc: 0.741736	valid_1's binary_logloss: 0.496285
[72]	valid_0's auc: 0.780891	valid_0's binary_logloss: 0.400947	valid_1's auc: 0.741802	valid_1's binary_logloss: 0.496248
[73]	valid_0's auc: 0.781131	valid_0's binary_logloss: 0.40081	valid_1's auc: 0.741864	valid_1's binary_logloss: 0.496207
[74]	valid_0's auc: 0.78135	valid_0's binary_logloss: 0.400681	valid_1's auc: 0.74197	valid_1's binary_logloss: 0.496118
[75]	valid_0's auc: 0.

In [14]:
params['bagging_fraction'] = grid.best_params_['bagging_fraction']
params['learning_rate'] = grid.best_params_['learning_rate']
params['num_leaves'] = grid.best_params_['num_leaves']
params['max_depth'] = grid.best_params_['max_depth']
params['reg_alpha'] = grid.best_params_['reg_alpha']

### Prediction

In [15]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

In [17]:
def make_lgb_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=5):
    
    skf = KFold(n_splits=folds, random_state=SEED, shuffle=True)
    folds = 5
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0
    split_groups = train['issue_year']
    

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            categorical_feature=categorical_features,
            verbose_eval=200,
            early_stopping_rounds=100
        )
        
        y_pred_val = clf.predict(x_val)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | AUC Score: {roc_auc_score(y_val, y_pred_val)}")

        score += roc_auc_score(y_val, y_pred_val) / folds
        y_preds += clf.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC score = {score}")
    print(f"OOF AUC score = {roc_auc_score(y, y_oof)}")
    
    return y_oof, y_preds, clf

In [18]:
y_oof_lgb, y_preds_lgb, mdl = make_lgb_prediction(X, y, test, features, model_params=params)

Fold: 1
(1051432, 60) (262858, 60)
[LightGBM] [Info] Number of positive: 222028, number of negative: 829404
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1824
[LightGBM] [Info] Number of data points in the train set: 1051432, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211167 -> initscore=-1.317904
[LightGBM] [Info] Start training from score -1.317904
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_error: 0.177421	valid_1's binary_error: 0.180839
Fold 1 | AUC Score: 0.7675472548211888
Fold: 2
(1051432, 60) (262858, 60)
[LightGBM] [Info] Number of positive: 221880, number of negative: 829552
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 1051432, number of used fea

In [23]:
import joblib
joblib.dump(mdl, '/home/work/toy-project/model/lgbm_grid.pkl')

['/home/work/toy-project/model/lgbm_grid.pkl']