In [5]:
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import xgboost as xgb

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session

import optuna
# Show Optuna log records only at WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Suppress every UserWarning for the rest of the session.
# NOTE(review): the filter is process-wide, so any library notice issued as a
# UserWarning is hidden as well - confirm that is intended.
warnings.simplefilter('ignore', category = UserWarning)

In [6]:
# Read in Data
# Columns removed from every frame to build the feature matrix (the target
# `loan_status` is part of the list, so it never leaks into the features).
to_drop = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade']

# Raw frames, one per split.
train_balanced = pd.read_csv('/Users/vinh/FS/thesis/data/train_balanced.csv')
val_final = pd.read_csv('/Users/vinh/FS/thesis/data/val_final.csv')
test_final = pd.read_csv('/Users/vinh/FS/thesis/data/test_final.csv')

# Features / target for the balanced training set.
x_train_balanced = train_balanced.drop(columns = to_drop)
y_train_balanced = train_balanced[['loan_status']]

# Features / target for the validation set, then a stratified 50/50 split of it:
# one half drives early stopping, the other half is used for scoring.
x_val = val_final.drop(columns = to_drop)
y_val = val_final[['loan_status']]
(x_val_early_stop, x_val_scoring,
 y_val_early_stop, y_val_scoring) = train_test_split(
    x_val, y_val,
    test_size = 0.50,
    stratify = y_val,
    random_state = 1337,
)

# Features / target for the test set.
x_test = test_final.drop(columns = to_drop)
y_test = test_final[['loan_status']]

----
# XGBoost - Balanced Train

In [7]:
# ---- Optuna objective function ----
def xgb_balanced_objective(trial):
    """Optuna objective: fit XGBoost on the balanced training set and return its validation F1.

    The frames used here (`x_train_balanced`, `y_train_balanced`,
    `x_val_early_stop`, `y_val_early_stop`, `x_val_scoring`, `y_val_scoring`)
    are the ones built once in the data-loading cell. Reading both CSVs and
    repeating the seeded split inside every trial produced the same frames each
    time and only added file I/O to each trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        F1 score of the fitted classifier on the scoring half of the validation set.
    """
    # NOTE(review): this resets Keras' global state; no Keras model is built in
    # this objective, so the call is kept only to preserve existing behavior.
    clear_session()

    # Optuna hyperparameter suggestions
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log = True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log = True),    # L1 regularization weight.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log = True),  # L2 regularization weight.
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),                 # sampling ratio for training data.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),   # sampling according to each tree.
    }

    # Build model
    # Early stopping and its metric are estimator settings: passing them to
    # fit() has been deprecated since XGBoost 1.6 and is rejected by later
    # releases, so they are configured on the constructor instead.
    xgb_clf = xgb.XGBClassifier(
        **params,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1,
        eval_metric = 'auc',
        early_stopping_rounds = 10,
    )

    # Fit model; the early-stop half of the validation set is monitored for AUC.
    xgb_clf.fit(
        x_train_balanced, y_train_balanced,
        eval_set = [(x_val_early_stop, y_val_early_stop)],
        verbose = False,
    )

    # Evaluate F1 score on the held-out scoring half of the validation set
    pred = xgb_clf.predict(x_val_scoring)
    return f1_score(y_val_scoring, pred)

In [None]:
# ---- Optuna study ----
# Open the SQLite-backed study: created on the first run, reattached on later
# runs because of load_if_exists. The objective value is maximized.
xgb_balanced_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'xgb_balanced',
    storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
    load_if_exists = True,
)

# Every execution of this cell runs another 1000 trials on top of whatever the
# stored study already contains.
xgb_balanced_study.optimize(func = xgb_balanced_objective, n_trials = 1000)

In [9]:
# Reattach to the persisted study and display its recorded trials as a table.
load_xgb_balanced_study = optuna.load_study(
    storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
    study_name = 'xgb_balanced',
)
load_xgb_balanced_study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_gamma,params_max_depth,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.415506,2023-06-21 01:00:52.685773,2023-06-21 01:01:03.379036,0 days 00:00:10.693263,0.922459,1.057223e-06,9,87,1.711417e-08,2.344761e-07,0.308071,COMPLETE
1,1,0.422219,2023-06-21 01:01:03.385669,2023-06-21 01:01:16.164613,0 days 00:00:12.778944,0.771706,0.3585277,5,87,6.061983e-05,1.115195e-05,0.309576,COMPLETE
2,2,,2023-06-21 01:01:16.170426,2023-06-21 01:01:27.486283,0 days 00:00:11.315857,0.981597,0.0132322,6,69,8.19847e-07,0.001325499,0.693011,FAIL
3,3,0.419267,2023-06-21 01:05:12.869387,2023-06-21 01:05:23.415647,0 days 00:00:10.546260,0.562652,0.9164846,6,90,6.856409e-05,0.05271152,0.306637,COMPLETE
4,4,0.418716,2023-06-21 01:05:23.423168,2023-06-21 01:05:35.838848,0 days 00:00:12.415680,0.673458,1.324744e-08,8,52,1.842834e-07,2.554234e-05,0.473234,COMPLETE
5,5,0.42303,2023-06-21 01:05:35.845329,2023-06-21 01:05:47.402103,0 days 00:00:11.556774,0.346432,1.795744e-06,6,98,0.03497662,4.683323e-07,0.635107,COMPLETE
6,6,0.420709,2023-06-21 01:05:47.408336,2023-06-21 01:05:58.690267,0 days 00:00:11.281931,0.473713,0.001048222,7,75,4.238265e-05,2.129569e-06,0.376021,COMPLETE
7,7,0.419066,2023-06-21 01:05:58.696066,2023-06-21 01:06:09.692787,0 days 00:00:10.996721,0.63905,0.007476099,9,69,8.855071e-08,0.1413731,0.572756,COMPLETE
8,8,0.421326,2023-06-21 01:06:09.699966,2023-06-21 01:06:17.343785,0 days 00:00:07.643819,0.307315,0.2479722,6,99,6.041687e-08,3.170051e-07,0.272443,COMPLETE
9,9,0.422009,2023-06-21 01:06:17.351399,2023-06-21 01:06:28.295888,0 days 00:00:10.944489,0.300144,6.464594e-07,4,83,5.766459e-06,0.0001561818,0.4485,COMPLETE
