In [28]:
import functools
import warnings

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import xgboost as xgb

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore', category = UserWarning)

In [14]:
# Load the train / validation / test splits and separate features from target.
# Columns listed in to_drop are removed from every feature matrix.
to_drop = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade']

# Balanced training set
train_balanced = pd.read_csv('/Users/vinh/FS/thesis/data/train_balanced.csv')
y_train_balanced = train_balanced[['loan_status']]
x_train_balanced = train_balanced.drop(columns = to_drop)

# Validation set, additionally cut into two stratified halves:
# one half for early stopping, the other half for scoring.
val_final = pd.read_csv('/Users/vinh/FS/thesis/data/val_final.csv')
y_val = val_final[['loan_status']]
x_val = val_final.drop(columns = to_drop)
(x_val_early_stop, x_val_scoring,
 y_val_early_stop, y_val_scoring) = train_test_split(
    x_val, y_val,
    test_size = 0.50,
    random_state = 1337,
    stratify = y_val
)

# Test set
test_final = pd.read_csv('/Users/vinh/FS/thesis/data/test_final.csv')
y_test = test_final[['loan_status']]
x_test = test_final.drop(columns = to_drop)

In [30]:
def create_final_results_df(target_true_values, model_prediction_dict):
    '''
    Creates dataframe that organizes result metrics across all models.

    Arguments:
        target_true_values: pd.DataFrame
            True target values.

        model_prediction_dict: dict
            Dictionary mapping a model name to that model's predictions for
            the same rows as target_true_values. May be empty.

    Returns:
        Dataframe with one column per model (in dictionary order) and one row
        per metric: 'Accuracy', 'F1-Score', 'Precision', 'Recall'. An empty
        dictionary yields a frame with the four metric rows and no columns.
    '''
    metric_names = ['Accuracy', 'F1-Score', 'Precision', 'Recall']

    # Collect every model's scores first and build the frame in one go,
    # instead of growing it with pd.concat on each loop iteration.
    # The list order must match metric_names.
    scores_by_model = {
        model_name: [accuracy_score(target_true_values, predictions),
                     f1_score(target_true_values, predictions),
                     precision_score(target_true_values, predictions),
                     recall_score(target_true_values, predictions)]
        for model_name, predictions in model_prediction_dict.items()
    }

    return pd.DataFrame(scores_by_model, index = pd.Index(metric_names))

In [29]:
# Registry of test-set predictions, keyed by model name
tuned_model_predictions = dict()

----
# XGBoost Balanced Train Hyperparameter Tuning

In [16]:
# ---- Optuna objective function ----
@functools.lru_cache(maxsize = 1)
def _load_xgb_balanced_splits():
    '''
    Reads the balanced training set and the validation set from disk and
    splits the validation set into two stratified halves.

    The result is memoized so the CSV files are parsed once per kernel rather
    than once per Optuna trial. Call _load_xgb_balanced_splits.cache_clear()
    to force a re-read after the files change on disk.

    Returns:
        Tuple (x_train_balanced, y_train_balanced,
               x_val_early_stop, y_val_early_stop,
               x_val_scoring, y_val_scoring).
    '''
    to_drop = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade']

    train_balanced = pd.read_csv('/Users/vinh/FS/thesis/data/train_balanced.csv')
    x_train_balanced = train_balanced.drop(to_drop, axis = 1)
    y_train_balanced = train_balanced[['loan_status']]

    val_final = pd.read_csv('/Users/vinh/FS/thesis/data/val_final.csv')
    x_val = val_final.drop(to_drop, axis = 1)
    y_val = val_final[['loan_status']]
    # Fixed random_state keeps the two halves identical across trials and kernels.
    x_val_early_stop, x_val_scoring, y_val_early_stop, y_val_scoring = train_test_split(
        x_val, y_val, test_size = 0.50, random_state = 1337, stratify = y_val
    )

    return (x_train_balanced, y_train_balanced,
            x_val_early_stop, y_val_early_stop,
            x_val_scoring, y_val_scoring)


def xgb_balanced_objective(trial):
    '''
    Optuna objective: fits an XGBoost classifier on the balanced training set
    with the hyperparameters suggested for this trial and returns its F1 score.

    Early stopping monitors AUC on one half of the validation set; the F1
    score is computed on the other half, so the value being optimized is not
    measured on the rows that decided when boosting stopped.

    Arguments:
        trial: optuna.trial.Trial
            Trial object used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model on the scoring half of the validation set.
    '''
    # Kept from the original cell: clears the Keras backend session.
    # XGBoost itself does not depend on it.
    clear_session()

    # Data is loaded once and reused by every trial (see helper above).
    (x_train_balanced, y_train_balanced,
     x_val_early_stop, y_val_early_stop,
     x_val_scoring, y_val_scoring) = _load_xgb_balanced_splits()

    # Optuna hyperparameter suggestions
    n_estimators = trial.suggest_int('n_estimators', 50, 100)
    max_depth = trial.suggest_int('max_depth', 3, 9)
    gamma = trial.suggest_float('gamma', 1e-8, 1.0, log = True)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-8, 1.0, log = True) # L1 regularization weight.
    reg_lambda = trial.suggest_float('reg_lambda', 1e-8, 1.0, log = True) # L2 regularization weight.
    subsample = trial.suggest_float('subsample', 0.2, 1.0) # fraction of training rows sampled.
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 1.0) # fraction of columns sampled per tree.

    # Build model
    xgb_clf = xgb.XGBClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        gamma = gamma,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1
    )

    # Fit model
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds and
    # eval_metric on the XGBClassifier constructor rather than on fit() --
    # confirm against the installed version before upgrading.
    xgb_clf.fit(x_train_balanced, y_train_balanced,
                early_stopping_rounds = 10,
                eval_metric = 'auc',
                eval_set = [(x_val_early_stop, y_val_early_stop)],
                verbose = False
    )

    # Evaluate F1 score on the half of the validation set not used for early stopping
    pred = xgb_clf.predict(x_val_scoring)
    score = f1_score(y_val_scoring, pred)

    return score

In [None]:
# ---- Optuna study ----
# Create the SQLite-backed study, or resume it if it already exists,
# then run the search maximizing the objective's return value.
xgb_balanced_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'xgb_balanced',
    storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
    load_if_exists = True
)
xgb_balanced_study.optimize(xgb_balanced_objective, n_trials = 1000)

In [22]:
# Re-open the persisted study from its SQLite storage and list all recorded trials
load_xgb_balanced_study = optuna.load_study(
    storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
    study_name = 'xgb_balanced'
)
load_xgb_balanced_study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_gamma,params_max_depth,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.415506,2023-06-21 01:00:52.685773,2023-06-21 01:01:03.379036,0 days 00:00:10.693263,0.922459,1.057223e-06,9.0,87.0,1.711417e-08,2.344761e-07,0.308071,COMPLETE
1,1,0.422219,2023-06-21 01:01:03.385669,2023-06-21 01:01:16.164613,0 days 00:00:12.778944,0.771706,3.585277e-01,5.0,87.0,6.061983e-05,1.115195e-05,0.309576,COMPLETE
2,2,,2023-06-21 01:01:16.170426,2023-06-21 01:01:27.486283,0 days 00:00:11.315857,0.981597,1.323220e-02,6.0,69.0,8.198470e-07,1.325499e-03,0.693011,FAIL
3,3,0.419267,2023-06-21 01:05:12.869387,2023-06-21 01:05:23.415647,0 days 00:00:10.546260,0.562652,9.164846e-01,6.0,90.0,6.856409e-05,5.271152e-02,0.306637,COMPLETE
4,4,0.418716,2023-06-21 01:05:23.423168,2023-06-21 01:05:35.838848,0 days 00:00:12.415680,0.673458,1.324744e-08,8.0,52.0,1.842834e-07,2.554234e-05,0.473234,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,948,0.424697,2023-06-21 10:00:01.168150,2023-06-21 10:00:17.198878,0 days 00:00:16.030728,0.542010,1.697726e-04,5.0,99.0,2.729680e-02,2.537032e-05,0.696760,COMPLETE
949,949,0.425018,2023-06-21 10:00:17.207163,2023-06-21 10:00:33.018239,0 days 00:00:15.811076,0.558746,8.498591e-06,5.0,97.0,4.261154e-05,1.813775e-06,0.824079,COMPLETE
950,950,0.425661,2023-06-21 10:00:33.025202,2023-06-21 10:00:53.162964,0 days 00:00:20.137762,0.833319,3.765027e-02,5.0,100.0,3.018136e-05,1.712826e-08,0.950235,COMPLETE
951,951,0.424562,2023-06-21 10:00:53.171216,2023-06-21 10:01:09.094378,0 days 00:00:15.923162,0.704750,1.083044e-02,6.0,98.0,1.905463e-08,4.833549e-02,0.977194,COMPLETE


In [27]:
# Number of boosting rounds chosen by the best trial
load_xgb_balanced_study.best_params['n_estimators']

99

In [36]:
# Full hyperparameter set of the best trial in the study
load_xgb_balanced_study.best_params

{'colsample_bytree': 0.5414369781235469,
 'gamma': 0.00011724653799472996,
 'max_depth': 5,
 'n_estimators': 99,
 'reg_alpha': 0.9735500996403061,
 'reg_lambda': 1.881692086718138e-07,
 'subsample': 0.9550961565694804}

#### ---- Testing the model with optimal parameters on the test set ----

In [32]:
# Build model from the best trial's hyperparameters. The keys of best_params
# (n_estimators, max_depth, gamma, reg_alpha, reg_lambda, subsample,
# colsample_bytree) are the XGBClassifier argument names, so they are unpacked directly.
xgb_clf = xgb.XGBClassifier(
    **load_xgb_balanced_study.best_params,
    verbosity = 0,
    objective = 'binary:logistic',
    booster = 'gbtree',
    random_state = 7,
    n_jobs = -1
)

# Fit model on the balanced training set; early stopping monitors AUC on the
# full validation set.
xgb_clf.fit(
    x_train_balanced,
    y_train_balanced,
    early_stopping_rounds = 10,
    eval_metric = 'auc',
    eval_set = [(x_val, y_val)],
    verbose = False
)

In [33]:
# Predict on the test set and register the result under the model's name
xgb_balanced_test_predictions = xgb_clf.predict(x_test)
tuned_model_predictions['xgboost_balanced_tuned'] = xgb_balanced_test_predictions

----
# Tuned Results

In [35]:
# Accuracy / F1 / precision / recall on the test set for every tuned model registered so far
create_final_results_df(y_test, tuned_model_predictions)

Unnamed: 0,xgboost_balanced_tuned
Accuracy,0.64649
F1-Score,0.426577
Precision,0.312017
Recall,0.674068
