In [1]:
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import xgboost as xgb

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore', category = UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read in Data
# Single place that knows where the prepared splits live; edit here when the data moves.
DATA_DIR = '/Users/vinh/FS/thesis/data'
to_drop = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade']


def load_split(csv_name):
    '''
    Loads one prepared data split and separates the features from the target.

    Arguments:
        csv_name: str
            File name of the split inside DATA_DIR.

    Returns:
        Tuple of (full dataframe, feature dataframe without the `to_drop` columns,
        single-column target dataframe holding `loan_status`).
    '''
    frame = pd.read_csv(f'{DATA_DIR}/{csv_name}')
    return frame, frame.drop(to_drop, axis = 1), frame[['loan_status']]


train_balanced, x_train_balanced, y_train_balanced = load_split('train_balanced.csv')

val_final, x_val, y_val = load_split('val_final.csv')
# Stratified 50/50 split of the validation data: one half drives early stopping during
# tuning, the other half is used to score each trial.
x_val_early_stop, x_val_scoring, y_val_early_stop, y_val_scoring = train_test_split(
    x_val, y_val, test_size = 0.50, random_state = 1337, stratify = y_val
)

test_final, x_test, y_test = load_split('test_final.csv')

In [3]:
def create_final_results_df(target_true_values, model_prediction_dict):
    '''
    Creates dataframe that organizes result metrics across all models.

    Every model becomes one column; the rows hold the accuracy, F1-score,
    precision and recall of that model's predictions against the true targets.

    Arguments:
        target_true_values: pd.DataFrame
            True target values.

        model_prediction_dict: dict
            Dictionary containing predictions from all models, keyed by model name.
            May be empty, in which case the result has the four metric rows and
            no columns.

    Returns:
        Dataframe containing results across all models.
    '''
    metric_names = pd.Index(['Accuracy', 'F1-Score', 'Precision', 'Recall'])

    # Collect all columns first and build the frame once: this avoids re-copying the
    # frame on every iteration and also works when no model has been added yet.
    metric_columns = {
        model_name: [accuracy_score(target_true_values, model_predictions),
                     f1_score(target_true_values, model_predictions),
                     precision_score(target_true_values, model_predictions),
                     recall_score(target_true_values, model_predictions)]
        for model_name, model_predictions in model_prediction_dict.items()
    }

    return pd.DataFrame(metric_columns, index = metric_names)

In [4]:
# Maps a model label to that model's test-set predictions; filled in by the tuning
# sections below and consumed by create_final_results_df at the end.
tuned_model_predictions = {}

----
# XGBoost Balanced Train Hyperparameter Tuning

In [6]:
# ---- Optuna objective function ----
def xgb_balanced_objective(trial):
    '''
    Optuna objective for the XGBoost classifier trained on the balanced training set.

    Uses the module-level `x_train_balanced`, `y_train_balanced`, `x_val_early_stop`,
    `y_val_early_stop`, `x_val_scoring` and `y_val_scoring` prepared in the data-loading
    cell, so that cell must have been run first. The data is identical for every
    trial, so it is not re-read from disk here.

    Arguments:
        trial: optuna.trial.Trial
            Trial object used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model on the scoring half of the validation data.
    '''
    # Kept from the original objective; resets Keras state and does not touch XGBoost.
    clear_session()

    # Optuna hyperparameter suggestions
    n_estimators = trial.suggest_int('n_estimators', 50, 100)
    max_depth = trial.suggest_int('max_depth', 3, 9)
    gamma = trial.suggest_float('gamma', 1e-8, 1.0, log = True)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-8, 1.0, log = True) # L1 regularization weight.
    reg_lambda = trial.suggest_float('reg_lambda', 1e-8, 1.0, log = True) # L2 regularization weight.
    subsample = trial.suggest_float('subsample', 0.2, 1.0) # sampling ratio for training data.
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 1.0) # sampling according to each tree.

    # Build model
    # early_stopping_rounds / eval_metric are set on the estimator: passing them to fit()
    # is deprecated since XGBoost 1.6 and is rejected by newer releases.
    xgb_clf = xgb.XGBClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        gamma = gamma,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1,
        early_stopping_rounds = 10,
        eval_metric = 'auc'
    )

    # Fit model; early stopping is monitored on the early-stop half of the validation data.
    xgb_clf.fit(x_train_balanced, y_train_balanced,
                eval_set = [(x_val_early_stop, y_val_early_stop)],
                verbose = False
    )

    # Evaluate F1 score on the held-out scoring half of the validation data
    pred = xgb_clf.predict(x_val_scoring)
    score = f1_score(y_val_scoring, pred)

    return score

In [7]:
# ---- Optuna study ----
# Trials are persisted in a SQLite file; load_if_exists makes a re-run attach to the
# stored study instead of failing because the name is already taken.
xgb_balanced_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'xgb_balanced',
    storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
    load_if_exists = True,
)

# Each execution of this cell runs up to 1000 further trials on top of the stored ones.
xgb_balanced_study.optimize(func = xgb_balanced_objective, n_trials = 1000)

In [47]:
# Re-open the persisted XGBoost study from its SQLite storage and list every recorded
# trial (the table below includes failed trials, which have no value).
load_xgb_balanced_study = optuna.load_study(study_name = 'xgb_balanced', storage = 'sqlite:///data/optuna_trials/xgb_balanced.db')
load_xgb_balanced_study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_gamma,params_max_depth,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.415506,2023-06-21 01:00:52.685773,2023-06-21 01:01:03.379036,0 days 00:00:10.693263,0.922459,1.057223e-06,9.0,87.0,1.711417e-08,2.344761e-07,0.308071,COMPLETE
1,1,0.422219,2023-06-21 01:01:03.385669,2023-06-21 01:01:16.164613,0 days 00:00:12.778944,0.771706,3.585277e-01,5.0,87.0,6.061983e-05,1.115195e-05,0.309576,COMPLETE
2,2,,2023-06-21 01:01:16.170426,2023-06-21 01:01:27.486283,0 days 00:00:11.315857,0.981597,1.323220e-02,6.0,69.0,8.198470e-07,1.325499e-03,0.693011,FAIL
3,3,0.419267,2023-06-21 01:05:12.869387,2023-06-21 01:05:23.415647,0 days 00:00:10.546260,0.562652,9.164846e-01,6.0,90.0,6.856409e-05,5.271152e-02,0.306637,COMPLETE
4,4,0.418716,2023-06-21 01:05:23.423168,2023-06-21 01:05:35.838848,0 days 00:00:12.415680,0.673458,1.324744e-08,8.0,52.0,1.842834e-07,2.554234e-05,0.473234,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,948,0.424697,2023-06-21 10:00:01.168150,2023-06-21 10:00:17.198878,0 days 00:00:16.030728,0.542010,1.697726e-04,5.0,99.0,2.729680e-02,2.537032e-05,0.696760,COMPLETE
949,949,0.425018,2023-06-21 10:00:17.207163,2023-06-21 10:00:33.018239,0 days 00:00:15.811076,0.558746,8.498591e-06,5.0,97.0,4.261154e-05,1.813775e-06,0.824079,COMPLETE
950,950,0.425661,2023-06-21 10:00:33.025202,2023-06-21 10:00:53.162964,0 days 00:00:20.137762,0.833319,3.765027e-02,5.0,100.0,3.018136e-05,1.712826e-08,0.950235,COMPLETE
951,951,0.424562,2023-06-21 10:00:53.171216,2023-06-21 10:01:09.094378,0 days 00:00:15.923162,0.704750,1.083044e-02,6.0,98.0,1.905463e-08,4.833549e-02,0.977194,COMPLETE


In [36]:
# Hyperparameters of the best trial (highest F1, since the study maximizes its objective).
load_xgb_balanced_study.best_params

{'colsample_bytree': 0.5414369781235469,
 'gamma': 0.00011724653799472996,
 'max_depth': 5,
 'n_estimators': 99,
 'reg_alpha': 0.9735500996403061,
 'reg_lambda': 1.881692086718138e-07,
 'subsample': 0.9550961565694804}

#### ---- Testing the model with the optimal parameters on the test set ----

In [32]:
# Build model
# Read the tuned hyperparameters once instead of looking them up on the study for
# every single constructor argument.
xgb_best_params = load_xgb_balanced_study.best_params

# early_stopping_rounds / eval_metric are set on the estimator: passing them to fit()
# is deprecated since XGBoost 1.6 and is rejected by newer releases.
xgb_clf = xgb.XGBClassifier(
    n_estimators = xgb_best_params['n_estimators'],
    max_depth = xgb_best_params['max_depth'],
    gamma = xgb_best_params['gamma'],
    reg_alpha = xgb_best_params['reg_alpha'],
    reg_lambda = xgb_best_params['reg_lambda'],
    subsample = xgb_best_params['subsample'],
    colsample_bytree = xgb_best_params['colsample_bytree'],
    verbosity = 0,
    objective = 'binary:logistic',
    booster = 'gbtree',
    random_state = 7,
    n_jobs = -1,
    early_stopping_rounds = 10,
    eval_metric = 'auc'
)

# Fit model; the final model uses the full validation set for early stopping.
xgb_clf.fit(x_train_balanced, y_train_balanced,
            eval_set = [(x_val, y_val)],
            verbose = False
)

In [33]:
# Store the tuned XGBoost model's test-set predictions for the final comparison table.
tuned_model_predictions['xgboost_balanced_tuned'] = xgb_clf.predict(x_test)

----
# ANN Balanced Train Hyperparameter Tuning

In [41]:
# ---- Optuna objective function ----
def ann_balanced_objective(trial):
    '''
    Optuna objective for the feed-forward network trained on the balanced training set.

    Uses the module-level `x_train_balanced`, `y_train_balanced`, `x_val_early_stop`,
    `y_val_early_stop`, `x_val_scoring` and `y_val_scoring` prepared in the data-loading
    cell, so that cell must have been run first. The data is identical for every
    trial, so it is not re-read from disk here.

    Arguments:
        trial: optuna.trial.Trial
            Trial object used to sample the hyperparameters.

    Returns:
        F1 score of the fitted network on the scoring half of the validation data.
    '''
    # Reset Keras state so models from earlier trials do not pile up.
    clear_session()

    # Optuna hyperparameter suggestions
    hidden_layer_size = trial.suggest_int('hidden_layer_size', 10, 30)
    hidden_layer_amount = trial.suggest_int('hidden_layer_amount', 2, 4)
    hidden_layers = hidden_layer_amount * [hidden_layer_size] # same width for every hidden layer.
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.4)
    batch_size = trial.suggest_int('batch_size', 16, 48)
    epochs = trial.suggest_int('epochs', 5, 20)

    # Build model: a stack of Dense(relu) + Dropout blocks followed by a sigmoid output unit
    input_layer = Input(shape = (x_train_balanced.shape[1], ))
    pointer_last_layer = input_layer

    for layer in hidden_layers:
        pointer_last_layer = Dense(layer, activation = 'relu')(pointer_last_layer)
        pointer_last_layer = Dropout(dropout_rate)(pointer_last_layer)

    predictions = Dense(1, activation = 'sigmoid')(pointer_last_layer)

    # Only one model is built; the original compiled a second, unused copy over the same graph.
    ann_balanced = Model(inputs = input_layer, outputs = predictions)
    ann_balanced.compile(optimizer = 'adam', loss = 'binary_crossentropy')

    # Fit model; validation loss on the early-stop half is monitored with a patience of 5 epochs.
    callback = EarlyStopping(monitor = 'val_loss', patience = 5)
    ann_balanced.fit(x = x_train_balanced, y = y_train_balanced,
                     validation_data = (x_val_early_stop, y_val_early_stop),
                     epochs = epochs,
                     batch_size = batch_size,
                     callbacks = [callback])

    # Evaluate F1 score on the held-out scoring half; sigmoid outputs are thresholded at 0.5
    pred = ann_balanced.predict(x_val_scoring)
    pred = np.where(pred >= 0.5, 1, 0)
    score = f1_score(y_val_scoring, pred)

    return score

In [None]:
# ---- Optuna study ----
# Trials are persisted in a SQLite file; load_if_exists makes a re-run attach to the
# stored study instead of failing because the name is already taken.
ann_balanced_study = optuna.create_study(
    direction = 'maximize',
    study_name = 'ann_balanced',
    storage = 'sqlite:///data/optuna_trials/ann_balanced.db',
    load_if_exists = True,
)

# Each execution of this cell runs up to 1000 further trials on top of the stored ones.
ann_balanced_study.optimize(func = ann_balanced_objective, n_trials = 1000)

In [49]:
# Re-open the persisted ANN study from its SQLite storage and list every recorded trial.
# NOTE(review): in the output captured below, only the XGBoost parameter columns
# (colsample_bytree, gamma, max_depth, ...) are populated and the ANN columns are empty,
# which suggests the stored 'ann_balanced' study was filled by a different objective.
# Confirm, and re-run the ANN study into a clean storage if so.
load_ann_balanced_study = optuna.load_study(study_name = 'ann_balanced', storage = 'sqlite:///data/optuna_trials/ann_balanced.db')
load_ann_balanced_study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_batch_size,params_colsample_bytree,params_dropout_rate,params_epochs,params_gamma,params_hidden_layer_amount,params_hidden_layer_size,params_max_depth,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.422517,2023-06-26 23:26:29.480458,2023-06-26 23:26:40.281855,0 days 00:00:10.801397,,0.397768,,,2.637136e-03,,,7.0,70.0,5.102076e-05,6.752427e-01,0.468065,COMPLETE
1,1,0.420365,2023-06-26 23:26:40.291002,2023-06-26 23:26:53.009498,0 days 00:00:12.718496,,0.966543,,,4.886697e-07,,,9.0,62.0,2.868458e-06,5.988732e-07,0.731960,COMPLETE
2,2,0.420348,2023-06-26 23:26:53.015521,2023-06-26 23:27:03.076372,0 days 00:00:10.060851,,0.461291,,,3.418758e-03,,,7.0,98.0,4.362291e-05,2.812500e-01,0.458387,COMPLETE
3,3,0.423157,2023-06-26 23:27:03.082144,2023-06-26 23:27:16.649203,0 days 00:00:13.567059,,0.838163,,,1.257466e-03,,,6.0,57.0,1.505717e-08,3.281809e-02,0.961629,COMPLETE
4,4,0.422395,2023-06-26 23:27:16.655356,2023-06-26 23:27:28.474517,0 days 00:00:11.819161,,0.397016,,,5.129503e-08,,,4.0,91.0,2.194333e-05,4.790404e-08,0.384413,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,639,0.424329,2023-06-27 02:20:29.914578,2023-06-27 02:20:48.244393,0 days 00:00:18.329815,,0.826670,,,1.684750e-04,,,5.0,98.0,5.229108e-04,2.519125e-05,0.897476,COMPLETE
640,640,0.424981,2023-06-27 02:20:48.254643,2023-06-27 02:21:04.552602,0 days 00:00:16.297959,,0.801354,,,1.388558e-01,,,5.0,88.0,9.696928e-01,3.607269e-07,0.828518,COMPLETE
641,641,0.425139,2023-06-27 02:21:04.564547,2023-06-27 02:21:23.131103,0 days 00:00:18.566556,,0.917057,,,3.906803e-02,,,5.0,95.0,1.519966e-05,1.349996e-06,0.844118,COMPLETE
642,642,,2023-06-27 02:21:23.141177,2023-06-27 02:21:30.588495,0 days 00:00:07.447318,,0.863348,,,4.590368e-01,,,5.0,95.0,4.934122e-02,1.116327e-07,0.730411,FAIL


In [44]:
# Hyperparameters of the best trial of the stored 'ann_balanced' study.
# NOTE(review): the captured output lists XGBoost parameters, not the ANN ones
# (hidden_layer_size, dropout_rate, ...) that the test cell below expects - verify the study.
load_ann_balanced_study.best_params

{'colsample_bytree': 0.8976649315878238,
 'gamma': 0.000735620843836894,
 'max_depth': 5,
 'n_estimators': 89,
 'reg_alpha': 0.011417088381971122,
 'reg_lambda': 8.367074328002095e-07,
 'subsample': 0.9248379489369646}

#### ---- Testing the model with the optimal parameters on the test set ----

In [None]:
# Build model
# Read the tuned hyperparameters once and make sure they really are ANN hyperparameters:
# a study that was filled by another objective would otherwise fail further down with a
# bare KeyError that does not say what went wrong.
ann_best_params = load_ann_balanced_study.best_params
required_ann_params = ['hidden_layer_size', 'hidden_layer_amount', 'dropout_rate', 'batch_size', 'epochs']
missing_ann_params = [name for name in required_ann_params if name not in ann_best_params]
if missing_ann_params:
    raise KeyError(
        f"Study 'ann_balanced' has no values for {missing_ann_params}; "
        f"its best trial only holds {sorted(ann_best_params)}. "
        "Re-run the ANN Optuna study before evaluating on the test set."
    )

hidden_layer_size = ann_best_params['hidden_layer_size']
hidden_layer_amount = ann_best_params['hidden_layer_amount']
hidden_layers = hidden_layer_amount * [hidden_layer_size] # same width for every hidden layer.
dropout_rate = ann_best_params['dropout_rate']
batch_size = ann_best_params['batch_size']
epochs = ann_best_params['epochs']

# A stack of Dense(relu) + Dropout blocks followed by a sigmoid output unit
input_layer = Input(shape = (x_train_balanced.shape[1], ))
pointer_last_layer = input_layer

for layer in hidden_layers:
    pointer_last_layer = Dense(layer, activation = 'relu')(pointer_last_layer)
    pointer_last_layer = Dropout(dropout_rate)(pointer_last_layer)

predictions = Dense(1, activation = 'sigmoid')(pointer_last_layer)

# NOTE(review): `ann` wraps the same layers as `ann_balanced` and is not used in the
# visible cells; kept so the notebook's global names stay unchanged - consider removing.
ann = Model(inputs = input_layer, outputs = predictions)
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy')

ann_balanced = Model(inputs = input_layer, outputs = predictions)
ann_balanced.compile(optimizer = 'adam', loss = 'binary_crossentropy')


# Fit model; the final model monitors validation loss on the full validation set.
callback = EarlyStopping(monitor = 'val_loss', patience = 5)
ann_balanced_history = ann_balanced.fit(x = x_train_balanced, y = y_train_balanced,
                                        validation_data = (x_val, y_val),
                                        epochs = epochs,
                                        batch_size = batch_size,
                                        callbacks = [callback])

In [None]:
# Predict on the test set with the fitted network (reached through the History object
# returned by fit), threshold the sigmoid outputs at 0.5 to get 0/1 labels, and store
# them for the final comparison table.
ann_balanced_tuned_pred = ann_balanced_history.model.predict(x_test)
ann_balanced_tuned_pred = np.where(ann_balanced_tuned_pred >= 0.5, 1, 0)
tuned_model_predictions['ann_balanced_tuned'] = ann_balanced_tuned_pred

----
# Tuned Results

In [None]:
# Compare all tuned models on the test set: one column per entry in tuned_model_predictions.
create_final_results_df(y_test, tuned_model_predictions)

Unnamed: 0,xgboost_balanced_tuned
Accuracy,0.64649
F1-Score,0.426577
Precision,0.312017
Recall,0.674068
