In [2]:
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import xgboost as xgb

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore', category = UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read in Data
# Single source of truth for the data location and the column schema, so the
# path and column lists are not repeated for every split.
DATA_DIR = '/Users/vinh/FS/thesis/data'
TARGET_COL = 'loan_status'
to_drop = [TARGET_COL, 'id', 'issue_d', 'year', 'grade', 'sub_grade']


def load_split(file_name, data_dir = DATA_DIR):
    '''
    Loads one CSV split and separates the features from the target.

    Arguments:
        file_name: str
            Name of the CSV file inside data_dir.

        data_dir: str
            Directory that holds the CSV splits.

    Returns:
        Tuple of (full dataframe, feature dataframe without the to_drop
        columns, single-column target dataframe).
    '''
    full_df = pd.read_csv(f'{data_dir}/{file_name}')
    return full_df, full_df.drop(to_drop, axis = 1), full_df[[TARGET_COL]]


train_balanced, x_train_balanced, y_train_balanced = load_split('train_balanced.csv')

val_final, x_val, y_val = load_split('val_final.csv')
# Stratified 50/50 split of the validation data: one half is used as the
# early-stopping eval set, the other half for scoring the tuning trials.
x_val_early_stop, x_val_scoring, y_val_early_stop, y_val_scoring = train_test_split(
    x_val, y_val, test_size = 0.50, random_state = 1337, stratify = y_val
)

test_final, x_test, y_test = load_split('test_final.csv')

In [4]:
def create_final_results_df(target_true_values, model_prediction_dict):
    '''
    Creates dataframe that organizes result metrics across all models.

    Arguments:
        target_true_values: pd.DataFrame
            True target values.

        model_prediction_dict: dict
            Dictionary mapping a model name to that model's predictions.
            May be empty.

    Returns:
        Dataframe with one column per model and the rows
        'Accuracy', 'F1-Score', 'Precision' and 'Recall'. For an empty
        dictionary the dataframe has those four rows and no columns.
    '''
    # Row label -> metric function; the insertion order defines the row order.
    metric_functions = {
        'Accuracy': accuracy_score,
        'F1-Score': f1_score,
        'Precision': precision_score,
        'Recall': recall_score,
    }

    # Compute every model's metric column first and build the frame once,
    # instead of growing it with pd.concat inside the loop.
    metric_columns = {
        model_name: [metric(target_true_values, predictions)
                     for metric in metric_functions.values()]
        for model_name, predictions in model_prediction_dict.items()
    }

    # Passing the index to the constructor also covers the empty-dict case,
    # where setting a 4-row index afterwards would raise a length mismatch.
    return pd.DataFrame(metric_columns, index = pd.Index(list(metric_functions)))

In [5]:
# Maps a model name to that model's test-set predictions; filled in by the
# XGBoost and ANN sections below and consumed by create_final_results_df.
tuned_model_predictions = {}

----
# XGBoost Balanced Train Hyperparameter Tuning

In [6]:
# ---- Optuna objective function ----
def xgb_balanced_objective(trial):
    '''
    Optuna objective for tuning XGBoost on the balanced training set.

    Uses the notebook-level x_train_balanced, y_train_balanced,
    x_val_early_stop, y_val_early_stop, x_val_scoring and y_val_scoring from
    the data-loading cell, so the CSV files are not re-read and re-split on
    every trial.

    Arguments:
        trial: optuna.trial.Trial
            Trial used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model on the scoring half of the validation set.
    '''
    # Optuna hyperparameter suggestions
    n_estimators = trial.suggest_int('n_estimators', 50, 100)
    max_depth = trial.suggest_int('max_depth', 3, 9)
    gamma = trial.suggest_float('gamma', 1e-8, 1.0, log = True)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-8, 1.0, log = True) # L1 regularization weight.
    reg_lambda = trial.suggest_float('reg_lambda', 1e-8, 1.0, log = True) # L2 regularization weight.
    subsample = trial.suggest_float('subsample', 0.2, 1.0) # sampling ratio for training data.
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 1.0) # sampling according to each tree.

    # Build model
    # eval_metric and early_stopping_rounds are estimator parameters; passing
    # them to fit() is deprecated and no longer accepted by newer XGBoost releases.
    xgb_clf = xgb.XGBClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        gamma = gamma,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        eval_metric = 'auc',
        early_stopping_rounds = 10,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1
    )

    # Fit model; early stopping is monitored on one half of the validation data.
    xgb_clf.fit(x_train_balanced, y_train_balanced,
                eval_set = [(x_val_early_stop, y_val_early_stop)],
                verbose = False
    )

    # Evaluate F1 score on the other validation half, which was not used for
    # early stopping.
    pred = xgb_clf.predict(x_val_scoring)
    score = f1_score(y_val_scoring, pred)

    return score

In [7]:
# ---- Optuna study ----
# Total number of completed trials the study should contain.
XGB_BALANCED_TRIAL_BUDGET = 1000

xgb_balanced_study = optuna.create_study(study_name = 'xgb_balanced',
                                         storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
                                         load_if_exists = True,
                                         direction = 'maximize'
)

# load_if_exists re-opens the stored study, so re-running this cell would stack
# another full batch of trials on top of it. Only run what is still missing.
xgb_completed_trials = xgb_balanced_study.get_trials(deepcopy = False,
                                                     states = (optuna.trial.TrialState.COMPLETE, ))
xgb_remaining_trials = max(0, XGB_BALANCED_TRIAL_BUDGET - len(xgb_completed_trials))

if xgb_remaining_trials > 0:
    xgb_balanced_study.optimize(xgb_balanced_objective, n_trials = xgb_remaining_trials)

In [7]:
# Re-open the persisted study so its trials can be inspected without
# re-running the search cell above.
load_xgb_balanced_study = optuna.load_study(
    study_name = 'xgb_balanced',
    storage = 'sqlite:///data/optuna_trials/xgb_balanced.db',
)
load_xgb_balanced_study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_gamma,params_max_depth,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.426464,2023-06-27 04:09:29.684552,2023-06-27 04:09:39.900140,0 days 00:00:10.215588,0.272047,2.090141e-03,8,87,5.118262e-08,4.296916e-05,0.751606,COMPLETE
1,1,0.428545,2023-06-27 04:09:39.906675,2023-06-27 04:09:50.387722,0 days 00:00:10.481047,0.643190,3.103960e-01,6,50,7.452729e-03,4.145269e-07,0.957249,COMPLETE
2,2,0.428531,2023-06-27 04:09:50.394458,2023-06-27 04:10:00.113134,0 days 00:00:09.718676,0.882833,5.663288e-06,4,54,1.699173e-01,1.646163e-02,0.862509,COMPLETE
3,3,0.426839,2023-06-27 04:10:00.120800,2023-06-27 04:10:12.365392,0 days 00:00:12.244592,0.410891,6.292376e-05,9,51,9.188732e-02,1.323189e-08,0.821752,COMPLETE
4,4,0.427723,2023-06-27 04:10:12.372362,2023-06-27 04:10:24.016908,0 days 00:00:11.644546,0.381242,5.812479e-03,7,93,2.710437e-06,5.715710e-01,0.855407,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,0.429901,2023-06-27 07:51:55.775718,2023-06-27 07:52:09.219124,0 days 00:00:13.443406,0.570572,6.996957e-05,5,84,1.672402e-07,7.834340e-06,0.788932,COMPLETE
996,996,0.430551,2023-06-27 07:52:09.228999,2023-06-27 07:52:23.314543,0 days 00:00:14.085544,0.588980,7.232008e-02,5,95,5.652292e-07,2.262151e-04,0.999631,COMPLETE
997,997,0.431201,2023-06-27 07:52:23.327785,2023-06-27 07:52:36.996926,0 days 00:00:13.669141,0.646159,3.504050e-07,5,87,2.705888e-01,2.584765e-01,0.973407,COMPLETE
998,998,0.430046,2023-06-27 07:52:37.006731,2023-06-27 07:52:50.648925,0 days 00:00:13.642194,0.644777,5.672311e-07,5,88,2.697373e-01,2.783055e-01,0.958214,COMPLETE


In [8]:
# Display the hyperparameters of the best trial in the stored study.
load_xgb_balanced_study.best_params

{'colsample_bytree': 0.672719740846669,
 'gamma': 1.6323381093108538e-06,
 'max_depth': 6,
 'n_estimators': 91,
 'reg_alpha': 0.16790562748458127,
 'reg_lambda': 1.50865612415149e-06,
 'subsample': 0.9988846052233982}

#### ---- Evaluating the model with the optimal parameters on the test set ----

In [9]:
# Build model
# Read the best parameters once instead of querying the study for every argument.
xgb_best_params = load_xgb_balanced_study.best_params

# eval_metric and early_stopping_rounds are estimator parameters; passing them
# to fit() is deprecated and no longer accepted by newer XGBoost releases.
xgb_clf = xgb.XGBClassifier(
    n_estimators = xgb_best_params['n_estimators'],
    max_depth = xgb_best_params['max_depth'],
    gamma = xgb_best_params['gamma'],
    reg_alpha = xgb_best_params['reg_alpha'],
    reg_lambda = xgb_best_params['reg_lambda'],
    subsample = xgb_best_params['subsample'],
    colsample_bytree = xgb_best_params['colsample_bytree'],
    eval_metric = 'auc',
    early_stopping_rounds = 10,
    verbosity = 0,
    objective = 'binary:logistic',
    booster = 'gbtree',
    random_state = 7,
    n_jobs = -1
)

# Fit model; the full validation set is the early-stopping eval set here,
# the test set is only used for the final predictions.
xgb_clf.fit(x_train_balanced, y_train_balanced,
            eval_set = [(x_val, y_val)],
            verbose = False
)

In [10]:
# Store the tuned XGBoost model's test-set predictions for the final comparison.
tuned_model_predictions['xgboost_balanced_tuned'] = xgb_clf.predict(x_test)

----
# ANN Balanced Train Hyperparameter Tuning

In [11]:
# ---- Optuna objective function ----
def ann_balanced_objective(trial):
    '''
    Optuna objective for tuning the feed-forward network on the balanced
    training set.

    Uses the notebook-level x_train_balanced, y_train_balanced,
    x_val_early_stop, y_val_early_stop, x_val_scoring and y_val_scoring from
    the data-loading cell, so the CSV files are not re-read and re-split on
    every trial.

    Arguments:
        trial: optuna.trial.Trial
            Trial used to sample the hyperparameters.

    Returns:
        F1 score of the fitted network on the scoring half of the validation
        set, with the sigmoid outputs thresholded at 0.5.
    '''
    # Reset Keras' global state before building this trial's model.
    clear_session()

    # Optuna hyperparameter suggestions
    hidden_layer_size = trial.suggest_int('hidden_layer_size', 10, 30)
    hidden_layer_amount = trial.suggest_int('hidden_layer_amount', 2, 4)
    hidden_layers = hidden_layer_amount * [hidden_layer_size]
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.4)
    batch_size = trial.suggest_int('batch_size', 16, 48)
    epochs = trial.suggest_int('epochs', 5, 20)

    # Build model: identical Dense + Dropout blocks followed by a sigmoid unit.
    input_layer = Input(shape = (x_train_balanced.shape[1], ))
    pointer_last_layer = input_layer

    for layer in hidden_layers:
        pointer_last_layer = Dense(layer, activation = 'relu')(pointer_last_layer)
        pointer_last_layer = Dropout(dropout_rate)(pointer_last_layer)

    predictions = Dense(1, activation = 'sigmoid')(pointer_last_layer)

    # Only one model is built and compiled; the former second, never-used
    # Model over the same layers was redundant.
    ann_balanced = Model(inputs = input_layer, outputs = predictions)
    ann_balanced.compile(optimizer = 'adam', loss = 'binary_crossentropy')

    # Fit model; verbose = 0 keeps per-epoch progress of every trial out of
    # the notebook output.
    callback = EarlyStopping(monitor = 'val_loss', patience = 5)
    ann_balanced.fit(x = x_train_balanced, y = y_train_balanced,
                     validation_data = (x_val_early_stop, y_val_early_stop),
                     epochs = epochs,
                     batch_size = batch_size,
                     callbacks = [callback],
                     verbose = 0)

    # Evaluate F1 score on the validation half not used for early stopping.
    pred = ann_balanced.predict(x_val_scoring, verbose = 0)
    pred = np.where(pred >= 0.5, 1, 0)
    score = f1_score(y_val_scoring, pred)

    return score

In [None]:
# ---- Optuna study ----
# Total number of completed trials the study should contain.
ANN_BALANCED_TRIAL_BUDGET = 1000

ann_balanced_study = optuna.create_study(study_name = 'ann_balanced',
                                         storage = 'sqlite:///data/optuna_trials/ann_balanced.db',
                                         load_if_exists = True,
                                         direction = 'maximize'
)

# load_if_exists re-opens the stored study, so an interrupted search can be
# resumed here: only the trials still missing from the budget are run, instead
# of adding a full new batch on every execution of this cell.
ann_completed_trials = ann_balanced_study.get_trials(deepcopy = False,
                                                     states = (optuna.trial.TrialState.COMPLETE, ))
ann_remaining_trials = max(0, ANN_BALANCED_TRIAL_BUDGET - len(ann_completed_trials))

if ann_remaining_trials > 0:
    ann_balanced_study.optimize(ann_balanced_objective, n_trials = ann_remaining_trials)

In [17]:
# Re-open the persisted study so its trials can be inspected without
# re-running the search cell above.
load_ann_balanced_study = optuna.load_study(
    study_name = 'ann_balanced',
    storage = 'sqlite:///data/optuna_trials/ann_balanced.db',
)
load_ann_balanced_study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_batch_size,params_dropout_rate,params_epochs,params_hidden_layer_amount,params_hidden_layer_size,state
0,0,0.411380,2023-06-27 11:40:41.612611,2023-06-27 11:52:39.623516,0 days 00:11:58.010905,48,0.258143,18,4,25,COMPLETE
1,1,0.425964,2023-06-27 11:52:39.630649,2023-06-27 12:05:57.566694,0 days 00:13:17.936045,40,0.113585,15,4,19,COMPLETE
2,2,0.088188,2023-06-27 12:05:57.573054,2023-06-27 12:22:03.551325,0 days 00:16:05.978271,31,0.346610,15,4,20,COMPLETE
3,3,0.417593,2023-06-27 12:22:03.557224,2023-06-27 12:38:36.081288,0 days 00:16:32.524064,22,0.352721,7,2,30,COMPLETE
4,4,0.423651,2023-06-27 12:38:36.087220,2023-06-27 12:55:57.056763,0 days 00:17:20.969543,18,0.125680,12,2,17,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...
66,66,0.422386,2023-06-28 09:08:09.379520,2023-06-28 09:24:44.879551,0 days 00:16:35.500031,27,0.146679,12,2,25,COMPLETE
67,67,0.426143,2023-06-28 09:24:44.886288,2023-06-28 09:35:37.605932,0 days 00:10:52.719644,36,0.124966,19,2,22,COMPLETE
68,68,0.425068,2023-06-28 09:35:37.612597,2023-06-28 10:05:12.608579,0 days 00:29:34.995982,36,0.141802,19,2,23,COMPLETE
69,69,0.423984,2023-06-28 10:05:12.615005,2023-06-28 10:21:51.542977,0 days 00:16:38.927972,34,0.172678,18,2,25,COMPLETE


In [13]:
# Display the hyperparameters of the best trial in the stored study.
load_ann_balanced_study.best_params

{'batch_size': 42,
 'dropout_rate': 0.13143473371119396,
 'epochs': 19,
 'hidden_layer_amount': 2,
 'hidden_layer_size': 24}

#### ---- Evaluating the model with the optimal parameters on the test set ----

In [14]:
# Build model
# Read the best parameters once instead of querying the study for every value.
ann_best_params = load_ann_balanced_study.best_params
hidden_layer_size = ann_best_params['hidden_layer_size']
hidden_layer_amount = ann_best_params['hidden_layer_amount']
hidden_layers = hidden_layer_amount * [hidden_layer_size]
dropout_rate = ann_best_params['dropout_rate']
batch_size = ann_best_params['batch_size']
epochs = ann_best_params['epochs']

# Identical Dense + Dropout blocks followed by a single sigmoid output unit.
input_layer = Input(shape = (x_train_balanced.shape[1], ))
pointer_last_layer = input_layer

for layer in hidden_layers:
    pointer_last_layer = Dense(layer, activation = 'relu')(pointer_last_layer)
    pointer_last_layer = Dropout(dropout_rate)(pointer_last_layer)

predictions = Dense(1, activation = 'sigmoid')(pointer_last_layer)

# Build and compile a single model. A second Model over the same layers was
# previously created and compiled as `ann` but never trained or used.
ann_balanced = Model(inputs = input_layer, outputs = predictions)
ann_balanced.compile(optimizer = 'adam', loss = 'binary_crossentropy')

# Keep the old global name available in case other cells still refer to it.
ann = ann_balanced

# Fit model; the full validation set is monitored for early stopping here.
callback = EarlyStopping(monitor = 'val_loss', patience = 5)
ann_balanced_history = ann_balanced.fit(x = x_train_balanced, y = y_train_balanced,
                                        validation_data = (x_val, y_val),
                                        epochs = epochs,
                                        batch_size = batch_size,
                                        callbacks = [callback])

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Epoch 1/19


2023-06-28 10:35:42.941308: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19


In [15]:
# Threshold the network's sigmoid outputs at 0.5 to get hard 0/1 labels for
# the test set, then register them for the final comparison.
ann_balanced_tuned_pred = (ann_balanced_history.model.predict(x_test) >= 0.5).astype(int)
tuned_model_predictions['ann_balanced_tuned'] = ann_balanced_tuned_pred



----
# Tuned Results

In [19]:
# Test-set metrics of all tuned models; transposed so that each row is a model.
create_final_results_df(y_test, tuned_model_predictions).transpose()

Unnamed: 0,Accuracy,F1-Score,Precision,Recall
xgboost_balanced_tuned,0.645038,0.42801,0.314057,0.671747
ann_balanced_tuned,0.660241,0.424305,0.319022,0.633309
