In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load and preprocess the data

In [4]:
# Fixed seed so the shuffled K-fold partition is the same on every run.
seed = 42

# Shuffled 5-way splitter, reused below to carve the training set into folds.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# --- training table: `loan_status` is the label, every other column is a feature
df_train = pd.read_csv('ML_chapter7_dataset/final/train_final.csv')
Y_train = df_train['loan_status'].values.astype(int)
X_train = df_train.drop(columns=['loan_status']).values

# --- held-out test table: same feature / label separation
df_test = pd.read_csv('ML_chapter7_dataset/final/test_final.csv')
Y_test = df_test['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values

In [5]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
(X_train.shape, Y_train.shape)

((50000, 145), (50000,))

In [6]:
# Materialise the K-fold partitions once, so every parameter set is trained and
# validated on exactly the same splits.
# Each entry has the layout [(x_train, y_train), (x_eval, y_eval)].
five_fold_data = [
    [(X_train[fit_rows], Y_train[fit_rows]), (X_train[held_rows], Y_train[held_rows])]
    for fit_rows, held_rows in kf.split(X_train)
]

# Algorithm

In [17]:
def get_model(param):
    """Train one LightGBM booster per fold stored in ``five_fold_data``.

    Parameters
    ----------
    param : dict
        LightGBM parameters, passed unchanged to ``lgb.train``.

    Returns
    -------
    list
        The trained boosters, in the same order as the folds.
    """
    boosters = []
    for fold_no, (fit_part, held_part) in enumerate(five_fold_data):
        print('{}-th model is training:'.format(fold_no))

        fit_x, fit_y = fit_part
        held_x, held_y = held_part

        # The held-out part of the fold is the only validation set passed to lgb.train.
        fit_set = lgb.Dataset(fit_x, label=fit_y)
        held_set = lgb.Dataset(held_x, label=held_y)

        boosters.append(lgb.train(param, fit_set, valid_sets=[held_set]))
    return boosters

# Train

In [25]:
# Baseline configuration: only the leaf count, objective, metric and round count are set.
param_base = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': 'binary',
    'num_round': 1000,
}

# Tuned configuration: more leaves, an explicit learning rate, and feature / bagging
# fractions below 1.
param_fine_tuning = {
    'num_thread': 8,
    'num_leaves': 128,
    'metric': 'binary',
    'objective': 'binary',
    'num_round': 1000,
    'learning_rate': 3e-3,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
}

In [26]:
# One booster per fold, trained with the baseline parameter set.
param_base_model = get_model(param=param_base)

# Same folds, trained with the tuned parameter set.
param_fine_tuning_model = get_model(param=param_fine_tuning)

0-th model is training:
[LightGBM] [Info] Number of positive: 31851, number of negative: 8149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2583
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 141
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796275 -> initscore=1.363174
[LightGBM] [Info] Start training from score 1.363174
[1]	valid_0's binary_logloss: 0.448173
[2]	valid_0's binary_logloss: 0.405734
[3]	valid_0's binary_logloss: 0.372961
[4]	valid_0's binary_logloss: 0.34683
[5]	valid_0's binary_logloss: 0.325461
[6]	valid_0's binary_logloss: 0.307594
[7]	valid_0's binary_logloss: 0.292607
[8]	valid_0's binary_logloss: 0.279725
[9]	valid_0's binary_logloss: 0.268803
[10]	valid_0's binary_logloss: 0.259461
[11]	valid_0's binary_logloss: 0.251464
[12]	valid_0's binary_logloss: 0.244436
[13]	valid_0's binary_logloss: 0.23844
[14]	valid_0's 

# Test

In [27]:
def test_model(model_list):
    """Score an ensemble of boosters on the held-out test set.

    Every model in ``model_list`` predicts on the module-level ``X_test``.
    The per-model outputs are averaged, thresholded at 0.5 and compared
    against ``Y_test``.

    Parameters
    ----------
    model_list : iterable
        Trained models exposing ``predict(data, num_iteration=...)`` and a
        ``best_iteration`` attribute. Any number of models is accepted.

    Returns
    -------
    The accuracy reported by ``accuracy_score`` for the averaged prediction.

    Raises
    ------
    ValueError
        If ``model_list`` is empty (there is nothing to average).
    """
    models = list(model_list)
    if not models:
        raise ValueError('model_list must contain at least one trained model')

    # Size the buffer from the actual number of models. A hard-coded 5 leaves
    # all-zero rows when fewer models are given (dragging the mean towards 0)
    # and raises IndexError when more are given.
    per_model_pred = np.zeros((len(models), len(X_test)))
    for i, bst in enumerate(models):
        # NOTE(review): get_model trains without early stopping, so
        # best_iteration is presumably unset and all iterations are used — confirm.
        per_model_pred[i] = bst.predict(X_test, num_iteration=bst.best_iteration)

    # Average across models (axis 0), then turn the mean score into a 0/1 label.
    ypred_mean = (per_model_pred.mean(axis=0) > 0.5).astype(int)

    # accuracy_score is documented as (y_true, y_pred).
    return accuracy_score(Y_test, ypred_mean)

In [28]:
# Test-set accuracy of each five-model ensemble.
base_score = test_model(model_list=param_base_model)
fine_tuning_score = test_model(model_list=param_fine_tuning_model)

report = 'base: {}, fine tuning: {}'.format(base_score, fine_tuning_score)
print(report)

base: 0.91552, fine tuning: 0.91756
