In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from copy import deepcopy
from skopt import BayesSearchCV
import time
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer



## PSS MLM 

In [2]:
# Load the processed training dataset

train = pd.read_csv( "../data/processed/pss_train.csv",  index_col=0)

# Split the dataset into features (X) and target variable (y)
y_train_original = train['Is_shunt']
X_train_original = train.drop(['Is_shunt'], axis=1)



# define KNN imputer
imputer = KNNImputer()
# fit on the X_train
imputer.fit(X_train_original)
# transform the X_train
X_train_scaled = imputer.transform(X_train_original)

# Split the unscaled data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_original, y_train_original, test_size=0.10, shuffle=True, random_state= 123)

# Split the scaled data into training and validation sets
X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled = train_test_split(X_train_scaled, y_train_original, test_size=0.10, shuffle=True, random_state= 123)







## Hyperparameter tuning and training

This code optimizes an XGBoost Classifier using 10-fold Bayesian search, seeking the best combination of hyperparameters for optimal performance. The model is trained and evaluated using accuracy (ACC), F1 score, and ROC AUC.

### Bayesian search: 

Unlike random or grid search methods, Bayesian optimization keeps track of past evaluation results to construct a probabilistic model that maps hyperparameters to the probability of achieving a certain score on the objective function. In other words, it offers a more thorough exploration of the hyperparameter space compared to random search, and it is faster than grid search while being less likely to overfit.


$ arg \, min_{θ} (E[objective(θ)∣data])$

### XGBoost Model Training with Unscaled Data

In [6]:
# Initializing XGBoost Classifier with default parameters


xgbc = XGBClassifier(objective='binary:logistic',
                          booster='gbtree',
                          eval_metric='auc',
                          tree_method='hist',
                          grow_policy='lossguide')

# Training the XGBoost Classifier

xgbc.fit(X_train , y_train.values.ravel())

# Defining default parameters and the parameter grid for Bayesian search

default_params = {}
gparams = xgbc.get_params()

for key in gparams.keys():
    gp = gparams[key]
    default_params[key] = [gp]

# Deep copy the default parameters for potential modifications
params = deepcopy(default_params)

# Define the hyperparameter grid for hyperparameter tuning

param_grid = {'gamma': [2**i for i in range(-2, 8)],
              'learning_rate': [0.01,0.02, 0.025, 0.03, 0.04, 0.05, 0.06, 0.7,0.08],
              'max_depth': [2,5,6,7,8,9,10],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [2**i for i in range(-2, 8)],
              'reg_lambda': [2**i for i in range(-2, 8)]}

In [7]:
%%time

#start time
t0 = time.time()

#No. of jobs
bcvj = int(np.cumsum([len(x) for x in param_grid.values()])[-1])


# Initializing XGBoost Classifier with default parameters before Bayesian search
default_params_xgb = {}

for key in default_params.keys():
    default_params_xgb[key] = default_params[key][0]

#providing default parameters to xgbc model, before bayesian search cross-validation
xgbc = XGBClassifier(**default_params_xgb)


# Initializing Bayesian SearchCV with XGBoost Classifier
clf = BayesSearchCV(estimator=xgbc, search_spaces=param_grid, n_iter=bcvj, scoring='accuracy', cv=10, return_train_score=True, verbose=3)
clf.fit(X_train, y_train.values.ravel())

# Storing results in a DataFrame
df = pd.DataFrame(clf.cv_results_)
    
# Generating predictions for train and validation sets
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
    
# Calculating confusion matrices
cfm_train = confusion_matrix(y_train, train_predictions)
cfm_val = confusion_matrix(y_val, val_predictions)
    
# Calculating accuracy scores
accs_train = accuracy_score(y_train, train_predictions)
accs_val = accuracy_score(y_val, val_predictions)

    
# Calculating F1 scores
f1s_train_p1 = f1_score(y_train, train_predictions, pos_label=1)
f1s_train_p0 = f1_score(y_train, train_predictions, pos_label=0)
f1s_val_p1 = f1_score(y_val, val_predictions, pos_label=1)
f1s_val_p0 = f1_score(y_val, val_predictions, pos_label=0)

    
# Calculating Area Under the Receiver Operating Characteristic Curve
val_ras = roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])
    
#best parameters
bp = clf.best_params_
results_dict = {}
    
# Storing computed values in a results dictionary
results_dict['xgbc_bcv'] = {'classifier': deepcopy(clf),
                            'cv_results': df.copy(),
                            'cfm_train': cfm_train,
                            'cfm_test': cfm_val,
                            'train_accuracy': accs_train,
                            'val_accuracy': accs_val,
                            'train F1-score label 1': f1s_train_p1,
                            'train F1-score label 0': f1s_train_p0,
                            'val F1-score label 1': f1s_val_p1,
                            'val F1-score label 0': f1s_val_p0,
                            'val roc auc score': val_ras,
                            'best_params': bp}

#stop time
t1 = time.time()

# Calculating elapsed time
bcvt = t1 - t0

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END gamma=0.1, learning_rate=0.03, max_depth=5, n_estimators=65, reg_alpha=0.2, reg_lambda=102.4;, score=(train=0.920, test=0.949) total time=   0.0s
[CV 2/10] END gamma=0.1, learning_rate=0.03, max_depth=5, n_estimators=65, reg_alpha=0.2, reg_lambda=102.4;, score=(train=0.927, test=0.846) total time=   0.0s
[CV 3/10] END gamma=0.1, learning_rate=0.03, max_depth=5, n_estimators=65, reg_alpha=0.2, reg_lambda=102.4;, score=(train=0.925, test=0.846) total time=   0.0s
[CV 4/10] END gamma=0.1, learning_rate=0.03, max_depth=5, n_estimators=65, reg_alpha=0.2, reg_lambda=102.4;, score=(train=0.929, test=0.821) total time=   0.0s
[CV 5/10] END gamma=0.1, learning_rate=0.03, max_depth=5, n_estimators=65, reg_alpha=0.2, reg_lambda=102.4;, score=(train=0.913, test=0.949) total time=   0.0s
[CV 6/10] END gamma=0.1, learning_rate=0.03, max_depth=5, n_estimators=65, reg_alpha=0.2, reg_lambda=102.4;, score=(train=0.913, test=0.885

In [27]:
# Print out accuracy, F1 score, and roc auc score
for key, value in list(results_dict['xgbc_bcv'].items())[4:-1]:
    if key != 'classifier':
        print(f"{key}: {np.round(value,4)}")

train_accuracy: 0.9885
val_accuracy: 0.931
train F1-score label 1: 0.9889
train F1-score label 0: 0.988
val F1-score label 1: 0.9333
val F1-score label 0: 0.9286
val roc auc score: 0.9752


In [28]:
# Save the best model

model = clf.best_estimator_

pickle.dump(model, open('../trained_models/pss_model.pkl', 'wb'))

### XGBoost Model Training with Scaled Data

In [3]:
# Initializing XGBoost Classifier with default parameters


xgbc = XGBClassifier(objective='binary:logistic',
                          booster='gbtree',
                          eval_metric='auc',
                          tree_method='hist',
                          grow_policy='lossguide')

# Training the XGBoost Classifier

xgbc.fit(X_train_scaled , y_train_scaled.values.ravel())

# Defining default parameters and the parameter grid for Bayesian search

default_params = {}
gparams = xgbc.get_params()

for key in gparams.keys():
    gp = gparams[key]
    default_params[key] = [gp]

# Deep copy the default parameters for potential modifications
params = deepcopy(default_params)

# Define the hyperparameter grid for hyperparameter tuning

param_grid = {'gamma': [2**i for i in range(-2, 8)],
              'learning_rate': [0.01,0.02, 0.025, 0.03, 0.04, 0.05, 0.06, 0.7,0.08],
              'max_depth': [2,5,6,7,8,9,10],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [2**i for i in range(-2, 8)],
              'reg_lambda': [2**i for i in range(-2, 8)]}

In [4]:
%%time

#start time
t0 = time.time()

#No. of jobs
bcvj = int(np.cumsum([len(x) for x in param_grid.values()])[-1])


# Initializing XGBoost Classifier with default parameters before Bayesian search
default_params_xgb = {}

for key in default_params.keys():
    default_params_xgb[key] = default_params[key][0]

#providing default parameters to xgbc model, before bayesian search cross-validation
xgbc = XGBClassifier(**default_params_xgb)

np.int = int
# Initializing Bayesian SearchCV with XGBoost Classifier
clf_scaled = BayesSearchCV(estimator=xgbc, search_spaces=param_grid, n_iter=bcvj, scoring='accuracy', cv=10, return_train_score=True, verbose=3)
clf_scaled.fit(X_train_scaled, y_train_scaled.values.ravel())

# Storing results in a DataFrame
df = pd.DataFrame(clf_scaled.cv_results_)
    
# Generating predictions for train and validation sets
train_predictions = clf_scaled.predict(X_train_scaled)
val_predictions = clf_scaled.predict(X_val_scaled)
    
# Calculating confusion matrices
cfm_train = confusion_matrix(y_train_scaled, train_predictions)
cfm_val = confusion_matrix(y_val_scaled, val_predictions)
    
# Calculating accuracy scores
accs_train = accuracy_score(y_train_scaled, train_predictions)
accs_val = accuracy_score(y_val_scaled, val_predictions)

    
# Calculating F1 scores
f1s_train_p1 = f1_score(y_train_scaled, train_predictions, pos_label=1)
f1s_train_p0 = f1_score(y_train_scaled, train_predictions, pos_label=0)
f1s_val_p1 = f1_score(y_val_scaled, val_predictions, pos_label=1)
f1s_val_p0 = f1_score(y_val_scaled, val_predictions, pos_label=0)

    
# Calculating Area Under the Receiver Operating Characteristic Curve
val_ras = roc_auc_score(y_val_scaled, clf_scaled.predict_proba(X_val)[:,1])
    
#best parameters
bp = clf_scaled.best_params_
results_dict = {}
    
# Storing computed values in a results dictionary
results_dict['xgbc_bcv'] = {'classifier': deepcopy(clf_scaled),
                            'cv_results': df.copy(),
                            'cfm_train': cfm_train,
                            'cfm_test': cfm_val,
                            'train_accuracy': accs_train,
                            'val_accuracy': accs_val,
                            'train F1-score label 1': f1s_train_p1,
                            'train F1-score label 0': f1s_train_p0,
                            'val F1-score label 1': f1s_val_p1,
                            'val F1-score label 0': f1s_val_p0,
                            'val roc auc score': val_ras,
                            'best_params': bp}

#stop time
t1 = time.time()

# Calculating elapsed time
bcvt = t1 - t0

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END gamma=16.0, learning_rate=0.05, max_depth=7, n_estimators=65, reg_alpha=8.0, reg_lambda=4.0;, score=(train=0.903, test=0.949) total time=   0.0s
[CV 2/10] END gamma=16.0, learning_rate=0.05, max_depth=7, n_estimators=65, reg_alpha=8.0, reg_lambda=4.0;, score=(train=0.899, test=0.859) total time=   0.0s
[CV 3/10] END gamma=16.0, learning_rate=0.05, max_depth=7, n_estimators=65, reg_alpha=8.0, reg_lambda=4.0;, score=(train=0.915, test=0.872) total time=   0.0s
[CV 4/10] END gamma=16.0, learning_rate=0.05, max_depth=7, n_estimators=65, reg_alpha=8.0, reg_lambda=4.0;, score=(train=0.912, test=0.808) total time=   0.0s
[CV 5/10] END gamma=16.0, learning_rate=0.05, max_depth=7, n_estimators=65, reg_alpha=8.0, reg_lambda=4.0;, score=(train=0.906, test=0.910) total time=   0.0s
[CV 6/10] END gamma=16.0, learning_rate=0.05, max_depth=7, n_estimators=65, reg_alpha=8.0, reg_lambda=4.0;, score=(train=0.898, test=0.897) tota

In [5]:
# Print out accuracy, F1 score, and roc auc score
for key, value in list(results_dict['xgbc_bcv'].items())[4:-1]:
    if key != 'classifier':
        print(f"{key}: {np.round(value,4)}")

train_accuracy: 0.991
val_accuracy: 0.908
train F1-score label 1: 0.9914
train F1-score label 0: 0.9907
val F1-score label 1: 0.9091
val F1-score label 0: 0.907
val roc auc score: 0.9709


In [6]:
# Save the best model

model = clf_scaled.best_estimator_

pickle.dump(model, open('../trained_models/pss_model_scaled.pkl', 'wb'))

## PSS SubCat MLM 

In [4]:
# Load the processed training dataset

train = pd.read_csv( "../data/processed/subcat_train.csv",  index_col=0)

# Split the dataset into features (X) and target variable (y)

y_train = train['Shunt_type']
X_train = train.drop(['Shunt_type'], axis=1)


# define KNN imputer
imputer = KNNImputer()
# fit on the X_train
imputer.fit(X_train)
# transform the X_train
X_train = imputer.transform(X_train)


# Split the data into training and validation sets

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, shuffle=True, random_state= 123)

# Apply Synthetic Minority Over-sampling Technique (SMOTE) to balance the training dataset
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)


In [6]:
# Initializing XGBoost Classifier with default parameters

xgbc = XGBClassifier(objective='multi:softmax',
                          booster='gbtree',
                          num_class=4,
                          eval_metric='mlogloss',
                          tree_method='hist',
                          grow_policy='lossguide'
                          )

# Training the XGBoost Classifier

xgbc.fit(X_train , y_train.values.ravel())

# Defining default parameters and the parameter grid for Bayesian search

default_params = {}
gparams = xgbc.get_params()

for key in gparams.keys():
    gp = gparams[key]
    default_params[key] = [gp]

# Deep copy the default parameters for potential modifications

params = deepcopy(default_params)

# Define the hyperparameter grid for hyperparameter tuning

param_grid = {'gamma': [2**i for i in range(-2, 8)],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [2,5,6,7,8,9,10,11,12],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [2**i for i in range(-2, 8)],
              'reg_lambda': [2**i for i in range(-2, 8)]}

In [7]:
%%time

#start time
t0 = time.time()

#No. of jobs
bcvj = int(np.cumsum([len(x) for x in param_grid.values()])[-1])


# Initializing XGBoost Classifier with default parameters before Bayesian search
default_params_xgb = {}

for key in default_params.keys():
    default_params_xgb[key] = default_params[key][0]

#providing default parameters to xgbc model, before randomized search cross-validation
xgbc = XGBClassifier(**default_params_xgb)

# Initializing Bayesian SearchCV with XGBoost Classifier

clf = BayesSearchCV(estimator=xgbc, search_spaces=param_grid, n_iter=bcvj, scoring='f1_weighted', cv=10, return_train_score=True, verbose=3)
clf.fit(X_train, y_train.values.ravel())

# Storing results in a DataFrame
df = pd.DataFrame(clf.cv_results_)
    
# Generating predictions for train and validation sets
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
    
# Calculating confusion matrices
cfm_train = confusion_matrix(y_train, train_predictions)
cfm_val = confusion_matrix(y_val, val_predictions)
    
# Calculating accuracy scores
accs_train = accuracy_score(y_train, train_predictions)
accs_val = accuracy_score(y_val, val_predictions)

    
# Calculating F1 scores
f1_train = f1_score(y_train, train_predictions, average='weighted')
f1_val = f1_score(y_val, val_predictions, average='weighted')

    
# Calculating Area Under the Receiver Operating Characteristic Curve
val_ras = roc_auc_score(y_val, clf.predict_proba(X_val), multi_class='ovo')
    
#best parameters
bp = clf.best_params_
results_dict = {}
    
# Storing computed values in a results dictionary
results_dict['xgbc_bcv'] = {'classifier': deepcopy(clf),
                            'cv_results': df.copy(),
                            'cfm_train': cfm_train,
                            'cfm_test': cfm_val,
                            'train_accuracy': accs_train,
                            'val_accuracy': accs_val,
                            'train F1-score': f1_train,
                            'val F1-score': f1_val,
                            'val roc auc score': val_ras,
                            'best_params': bp}

#stop time
t1 = time.time()

# Calculating elapsed time
bcvt = t1 - t0

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END gamma=0.8, learning_rate=0.300000012, max_depth=11, n_estimators=80, reg_alpha=102.4, reg_lambda=3.2;, score=(train=0.866, test=0.750) total time=   0.0s
[CV 2/10] END gamma=0.8, learning_rate=0.300000012, max_depth=11, n_estimators=80, reg_alpha=102.4, reg_lambda=3.2;, score=(train=0.845, test=0.771) total time=   0.0s
[CV 3/10] END gamma=0.8, learning_rate=0.300000012, max_depth=11, n_estimators=80, reg_alpha=102.4, reg_lambda=3.2;, score=(train=0.842, test=0.765) total time=   0.0s
[CV 4/10] END gamma=0.8, learning_rate=0.300000012, max_depth=11, n_estimators=80, reg_alpha=102.4, reg_lambda=3.2;, score=(train=0.846, test=0.809) total time=   0.0s
[CV 5/10] END gamma=0.8, learning_rate=0.300000012, max_depth=11, n_estimators=80, reg_alpha=102.4, reg_lambda=3.2;, score=(train=0.820, test=0.833) total time=   0.0s
[CV 6/10] END gamma=0.8, learning_rate=0.300000012, max_depth=11, n_estimators=80, reg_alpha=102.4,

In [8]:
# Print out accuracy, F1 score, and roc auc score
for key, value in list(results_dict['xgbc_bcv'].items())[4:-1]:
    if key != 'classifier':
        print(f"{key}: {np.round(value,4)}")

train_accuracy: 0.9994
val_accuracy: 0.8391
train F1-score: 0.9994
val F1-score: 0.8274
val roc auc score: 0.9396


In [9]:
# Save the best model

model = clf.best_estimator_

pickle.dump(model, open('../trained_models/subcat_model.pkl', 'wb'))