In [16]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [19]:
stroke_df = pd.read_csv('stroke_dataset.csv')
stroke_df.head(10)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,Residence_type_Urban,smoking_status_m
0,67.0,0,1,228.69,36.6,1,1,1,0,1,0,1,1
1,80.0,0,1,105.92,32.5,1,1,1,0,1,0,0,0
2,49.0,0,0,171.23,34.4,1,0,1,0,1,0,1,2
3,79.0,1,0,174.12,24.0,1,0,1,0,0,1,0,0
4,81.0,0,0,186.21,29.0,1,1,1,0,1,0,1,1
5,74.0,1,1,70.09,27.4,1,1,1,0,1,0,0,0
6,69.0,0,0,94.39,22.8,1,0,0,0,1,0,1,0
7,78.0,0,0,58.57,24.2,1,0,1,0,1,0,1,3
8,81.0,1,0,80.43,29.7,1,0,1,0,1,0,0,0
9,61.0,0,1,120.46,36.8,1,0,1,0,0,0,0,2


## Splitting the data

In [22]:
#Splitting the dataset into test, train and validation
# train is 0.8, test is 0.2
# validation set is not needed since we are using StratifiedKFold() later on for Cross Validation

# feature only dataset: X
# target variable only dataset: y
X = stroke_df.drop(columns='stroke')
y = stroke_df['stroke']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=21, shuffle=True, stratify=y)

from sklearn.preprocessing import StandardScaler
#scale the feature data using z-score scaling 
# Fit the scaler only on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform test and validation data based on training data scaler
X_test_scaled = scaler.transform(X_test)

## Decision Tree


In [24]:
from sklearn.tree import DecisionTreeClassifier

# initiate DT clf with a random guess
clf = DecisionTreeClassifier(criterion='gini', 
                            max_depth=4, 
                            min_samples_split=1000,
                            min_samples_leaf=200,
                            class_weight={0: 1, 1: 20},
                            random_state=21)


clf.fit(X_train_scaled,y_train)

In [26]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, log_loss


y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)[:, 1]
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred_proba)

print(f"Decision Tree Classifier (Untuned) accuracy is {acc:.5f}")
print(f"Decision Tree Classifier (Untuned) AUC score is {roc:.5f}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print("The test log-likelihood loss for the DT is", log_loss(y_test, y_pred_proba))

Decision Tree Classifier (Untuned) accuracy is 0.66354
Decision Tree Classifier (Untuned) AUC score is 0.81783
              precision    recall  f1-score   support

           0       0.99      0.65      0.79       919
           1       0.10      0.90      0.19        41

    accuracy                           0.66       960
   macro avg       0.55      0.78      0.49       960
weighted avg       0.96      0.66      0.76       960

[[600 319]
 [  4  37]]
The test log-likelihood loss for the DT is 0.5341309846485566


In [28]:
# cross validation to tune for hyperparameter
from sklearn.model_selection import GridSearchCV,cross_val_score, StratifiedKFold

# Set up Stratified K-Fold cross-validation (better than KFold for imbalanced data)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)


#initiate parameters
param_dt = {
    'max_depth': np.arange(1, 20, 1),  # Max depth of the tree, start from 1 to avoid zero depth
    'min_samples_split': np.arange(2, 2000, 100),  # Min samples required to split an internal node
    'min_samples_leaf': np.arange(1, 1000, 100),  # Min samples required to be at a leaf node
    'criterion': ['gini', 'entropy']  # Function to measure the quality of a split
}

#training tuned DT model again
grid_clf = GridSearchCV(estimator=clf,param_grid=param_dt, scoring='roc_auc', cv=kfold,n_jobs=-1)
grid_clf.fit(X_train_scaled,y_train)

In [29]:
clf_best_model = grid_clf.best_params_
print("Best hyperparamaters for the Decision Tree is ")
print(clf_best_model)

Best hyperparamaters for the Decision Tree is 
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 201, 'min_samples_split': 502}


In [30]:
# Checking generalisation of the tuned model: data_val
clf_tuned = grid_clf.best_estimator_
clf_tuned_cv = cross_val_score(clf_tuned, X_train_scaled,y_train, cv=kfold, scoring='roc_auc')
print("Cross validation AUC scores for 5 folds: ",clf_tuned_cv)
print("Mean cross validation AUC score =", round(np.mean(clf_tuned_cv),3))
print("++++++++++++++++++++++++++++++++++++++")

tuned_y_pred = clf_tuned.predict(X_test_scaled)
tuned_y_pred_proba = clf_tuned.predict_proba(X_test_scaled)[:, 1]

tuned_y_pred_train = clf_tuned.predict(X_train_scaled)
tuned_y_pred_train_proba = clf_tuned.predict_proba(X_train_scaled)[:, 1]

clf_tuned_acc = accuracy_score(y_test, tuned_y_pred)
print(f"Test dataset: Decision Tree Classifier (Tuned) accuracy is {clf_tuned_acc:.5f}")
print("\n")
clf_tuned_auc = roc_auc_score(y_test, tuned_y_pred_proba)
print(f"Test dataset: Decision Tree Classifier (Tuned) AUC is {clf_tuned_auc:.5f}")
clf_tuned_train_auc = roc_auc_score(y_train, tuned_y_pred_train_proba)
print(f"Train dataset: Decision Tree Classifier (Tuned) AUC is {clf_tuned_train_auc:.5f}")
print("======================================")

#Checking performance on test
print("Confusion matrix from the tuned DT is:")
print(confusion_matrix(y_test, tuned_y_pred))
# Calculate precision and recall
print(classification_report(y_test, tuned_y_pred))

print("======================================")
print("The log-likelihood loss for the DT (after tuning hyperparamaters) from test set is", log_loss(y_test, tuned_y_pred_proba))
print("The log-likelihood loss for the DT (after tuning hyperparamaters) from train set is", log_loss(y_train, tuned_y_pred_train_proba))


Cross validation AUC scores for 5 folds:  [0.84634096 0.86293548 0.84470668 0.78451408 0.82992734]
Mean cross validation AUC score = 0.834
++++++++++++++++++++++++++++++++++++++
Test dataset: Decision Tree Classifier (Tuned) accuracy is 0.78125


Test dataset: Decision Tree Classifier (Tuned) AUC is 0.83702
Train dataset: Decision Tree Classifier (Tuned) AUC is 0.85010
Confusion matrix from the tuned DT is:
[[718 201]
 [  9  32]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87       919
           1       0.14      0.78      0.23        41

    accuracy                           0.78       960
   macro avg       0.56      0.78      0.55       960
weighted avg       0.95      0.78      0.85       960

The log-likelihood loss for the DT (after tuning hyperparamaters) from test set is 0.5201818417450778
The log-likelihood loss for the DT (after tuning hyperparamaters) from train set is 0.4615697919015618


### new scoring metric during gridsearchcv: recall, f1-score and log-loss

In [31]:
from sklearn.metrics import make_scorer, fbeta_score, f1_score,  recall_score, f1_score, accuracy_score, roc_auc_score, classification_report, confusion_matrix, log_loss
# Define F2 score as a custom scoring metric
f2_scorer = make_scorer(fbeta_score, beta=2)

# Add 'f2' scoring to the list of metrics
scoring_metrics_DT = ['roc_auc','neg_log_loss', 'f1', 'recall', f2_scorer]

for scoring in scoring_metrics_DT:
    print(f"Model performance using scoring: {scoring}")
    
    # Using GridSearchCV with K-Fold cross-validation to find the best hyperparameters
    grid_dt = GridSearchCV(estimator=clf, param_grid=param_dt, 
                           scoring=scoring, cv=kfold, n_jobs=-1, verbose=1)
    grid_dt.fit(X_train_scaled, y_train)
    
    # Get the best model from the grid search
    tuned_dt = grid_dt.best_estimator_
    
    # Evaluate using cross-validation on the entire dataset
    tuned_dt_cv = cross_val_score(tuned_dt, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    print(f"Cross-validation {scoring} scores for {kfold.get_n_splits()} folds: ", tuned_dt_cv)
    print(f"Mean cross-validation {scoring} score =", round(np.mean(tuned_dt_cv), 3))
    print("\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\")
    
    # Predictions on train and test data
    tuned_dt_train_pred = tuned_dt.predict(X_train_scaled)
    tuned_dt_test_pred = tuned_dt.predict(X_test_scaled)
    
    # Accuracy
    tuned_dt_test_acc = accuracy_score(y_test, tuned_dt_test_pred)
    print(f"Test Accuracy using the best Decision Tree model: {tuned_dt_test_acc:.5f}")
    
    # AUC
    tuned_dt_test_auc = roc_auc_score(y_test, tuned_dt.predict_proba(X_test_scaled)[:, 1])
    print(f"Test AUC using the best Decision Tree model: {tuned_dt_test_auc:.5f}")

    # Generate and print the classification report
    print("Confusion matrix of the Test data:")
    print(confusion_matrix(y_test, tuned_dt_test_pred))
    print(classification_report(y_test, tuned_dt_test_pred))
    print("Confusion matrix of the Train data:")
    print(confusion_matrix(y_train, tuned_dt_train_pred))
    print(classification_report(y_train, tuned_dt_train_pred))
    
    #log-loss 
    print("The log-likelihood loss for the Decision Tree (after tuning hyperparameters) from test set is", log_loss(y_test, tuned_dt.predict_proba(X_test_scaled)))
    print("The log-likelihood loss for the Decision Tree (after tuning hyperparameters) from train set is", log_loss(y_train, tuned_dt.predict_proba(X_train_scaled)))
    
    # Train and test scores for the selected metric
    if scoring == 'roc_auc':
        train_score = roc_auc_score(y_train, tuned_dt.predict_proba(X_train_scaled)[:, 1])
        test_score = roc_auc_score(y_test, tuned_dt.predict_proba(X_test_scaled)[:, 1])
    elif scoring == 'f1':
        train_score = f1_score(y_train, tuned_dt_train_pred)
        test_score = f1_score(y_test, tuned_dt_test_pred)
    elif scoring == 'recall':
        train_score = recall_score(y_train, tuned_dt_train_pred)
        test_score = recall_score(y_test, tuned_dt_test_pred)
    elif scoring == f2_scorer:
        train_score = fbeta_score(y_train, tuned_dt_train_pred, beta=2)
        test_score = fbeta_score(y_test, tuned_dt_test_pred, beta=2)
    elif scoring == 'neg_log_loss':
        train_score = log_loss(y_train, tuned_dt.predict_proba(X_train_scaled))
        test_score = log_loss(y_test, tuned_dt.predict_proba(X_test_scaled))
    else:
        train_score, test_score = "N/A", "N/A"
        
    # Display the train and test scores for the selected scoring metric
    print(f"Train {scoring} score: {train_score:.5f}" if isinstance(train_score, (float, int)) else f"{scoring} score could not be calculated for train set.")
    print(f"Test {scoring} score: {test_score:.5f}" if isinstance(test_score, (float, int)) else f"{scoring} score could not be calculated for test set.")
    
    
    
    print("\n====================================\n")

Model performance using scoring: neg_log_loss
Fitting 5 folds for each of 7600 candidates, totalling 38000 fits
Cross-validation neg_log_loss scores for 5 folds:  [-0.46154643 -0.48289275 -0.46527275 -0.50924015 -0.46451978]
Mean cross-validation neg_log_loss score = -0.477
\\\\\\\\\\\\\\\\\\\\
Test Accuracy using the best Decision Tree model: 0.78125
Test AUC using the best Decision Tree model: 0.84207
Confusion matrix of the Test data:
[[718 201]
 [  9  32]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87       919
           1       0.14      0.78      0.23        41

    accuracy                           0.78       960
   macro avg       0.56      0.78      0.55       960
weighted avg       0.95      0.78      0.85       960

Confusion matrix of the Train data:
[[2893  779]
 [  38  128]]
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      3672
           1       0.14      0.77   

## Random Forest Classifier (regularisation of DT)

In [32]:
#tried GridSearchCV and it was taking too long, replaced with RandomizedSearchCV

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss, fbeta_score, f1_score, recall_score, roc_auc_score
import numpy as np

# Initiate RandomForestClassifier with class weights for imbalanced data
rf = RandomForestClassifier(random_state=21, class_weight={0: 1, 1: 20})

# Define the parameter grid for Random Forest
param_dt_rf = {
    'n_estimators': np.arange(50, 300, 25),  
    'max_depth': np.arange(1, 25, 5),  
    'min_samples_split': [2, 10, 20, 30],  
    'min_samples_leaf': np.arange(1, 20, 5),  
    'criterion': ['gini', 'entropy']  
}

# Define F2 score as a custom scoring metric
f2_scorer = make_scorer(fbeta_score, beta=2)

# List of scoring metrics
scoring_metrics = ['roc_auc', 'neg_log_loss', 'f1', 'recall', f2_scorer]

# Loop through different scoring metrics
for scoring in scoring_metrics:
    print(f"Model performance using scoring: {scoring}")
    
    # Using RandomizedSearchCV to find the best hyperparameters
    grid_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_dt_rf, 
                                 n_iter=100, scoring=scoring, cv=kfold, n_jobs=-1, verbose=1)
    grid_rf.fit(X_train_scaled, y_train)
    
    # Get the best model from the grid search
    tuned_rf = grid_rf.best_estimator_
    
    # Cross-validation scores for scoring metric
    tuned_rf_cv = cross_val_score(tuned_rf, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    print(f"Cross-validation {scoring} scores for {kfold.get_n_splits()} folds: ", tuned_rf_cv)
    print(f"Mean cross-validation {scoring} score =", round(np.mean(tuned_rf_cv), 3))
    print("..............................")

    # Predictions on train and test data
    tuned_rf_train_pred = tuned_rf.predict(X_train_scaled)
    tuned_rf_test_pred = tuned_rf.predict(X_test_scaled)
    
    # Accuracy
    tuned_rf_test_acc = accuracy_score(y_test, tuned_rf_test_pred)
    print(f"Test Accuracy using the best Random Forest model: {tuned_rf_test_acc:.5f}")
    
    # AUC
    tuned_rf_test_auc = roc_auc_score(y_test, tuned_rf.predict_proba(X_test_scaled)[:, 1])
    print(f"Test AUC using the best Random Forest model: {tuned_rf_test_auc:.5f}")

    # Generate and print the classification report
    print("Confusion matrix of the Test data:")
    print(confusion_matrix(y_test, tuned_rf_test_pred))
    print(classification_report(y_test, tuned_rf_test_pred))
    print("Confusion matrix of the Train data:")
    print(confusion_matrix(y_train, tuned_rf_train_pred))
    print(classification_report(y_train, tuned_rf_train_pred))
    
    # Log-loss 
    print("The log-likelihood loss for the Random Forest (after tuning hyperparameters) from test set is", log_loss(y_test, tuned_rf.predict_proba(X_test_scaled)))
    print("The log-likelihood loss for the Random Forest (after tuning hyperparameters) from train set is", log_loss(y_train, tuned_rf.predict_proba(X_train_scaled)))
    
    # Train and test scores for the selected metric
    if scoring == 'roc_auc':
        train_score = roc_auc_score(y_train, tuned_rf.predict_proba(X_train_scaled)[:, 1])
        test_score = roc_auc_score(y_test, tuned_rf.predict_proba(X_test_scaled)[:, 1])
    elif scoring == 'f1':
        train_score = f1_score(y_train, tuned_rf_train_pred)
        test_score = f1_score(y_test, tuned_rf_test_pred)
    elif scoring == 'recall':
        train_score = recall_score(y_train, tuned_rf_train_pred)
        test_score = recall_score(y_test, tuned_rf_test_pred)
    elif scoring == f2_scorer:
        train_score = fbeta_score(y_train, tuned_rf_train_pred, beta=2)
        test_score = fbeta_score(y_test, tuned_rf_test_pred, beta=2)
    elif scoring == 'neg_log_loss':
        train_score = log_loss(y_train, tuned_rf.predict_proba(X_train_scaled))
        test_score = log_loss(y_test, tuned_rf.predict_proba(X_test_scaled))
    else:
        train_score, test_score = "N/A", "N/A"
        
    # Display the train and test scores for the selected scoring metric
    print(f"Train {scoring} score: {train_score:.5f}" if isinstance(train_score, (float, int)) else f"{scoring} score could not be calculated for train set.")
    print(f"Test {scoring} score: {test_score:.5f}" if isinstance(test_score, (float, int)) else f"{scoring} score could not be calculated for test set.")
    
    print("\n====================================\n")


Model performance using scoring: roc_auc
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Cross-validation roc_auc scores for 5 folds:  [0.84028035 0.84836116 0.87542074 0.7897366  0.82614978]
Mean cross-validation roc_auc score = 0.836
..............................
Test Accuracy using the best Random Forest model: 0.80312
Test AUC using the best Random Forest model: 0.85756
Confusion matrix of the Test data:
[[740 179]
 [ 10  31]]
              precision    recall  f1-score   support

           0       0.99      0.81      0.89       919
           1       0.15      0.76      0.25        41

    accuracy                           0.80       960
   macro avg       0.57      0.78      0.57       960
weighted avg       0.95      0.80      0.86       960

Confusion matrix of the Train data:
[[3000  672]
 [  28  138]]
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      3672
           1       0.17      0.83      0.28       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train neg_log_loss score: 0.04375
Test neg_log_loss score: 0.17442


Model performance using scoring: f1
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Cross-validation f1 scores for 5 folds:  [0.24806202 0.32624113 0.29457364 0.20289855 0.23140496]
Mean cross-validation f1 score = 0.261
..............................
Test Accuracy using the best Random Forest model: 0.87187
Test AUC using the best Random Forest model: 0.86430
Confusion matrix of the Test data:
[[810 109]
 [ 14  27]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       919
           1       0.20      0.66      0.31        41

    accuracy                           0.87       960
   macro avg       0.59      0.77      0.62       960
weighted avg       0.95      0.87      0.90       960

Confusion matrix of the Train data:
[[3291  381]
 [  16  150]]
              precision    recall  f1-score   support

           0       1.00      0.90      0.94      3

# Gradient Boosting

In [35]:
# since gradient boosting doesn't have a class_weight variable, compute sample_weight variable 

from sklearn.utils.class_weight import compute_sample_weight

# Calculate sample weights based on class imbalance
sample_weights = compute_sample_weight(class_weight={0: 1, 1: 20}, y=y_train)
sample_weights

array([1., 1., 1., ..., 1., 1., 1.])

In [36]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, roc_auc_score, f1_score, recall_score, fbeta_score
import numpy as np

# Initialize the Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=21)

# Define the parameter grid for Gradient Boosting
param_dist_gb = {
    'n_estimators': [50, 100, 200],  # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1, 0.2],  # Shrinks contribution of each tree
    'max_depth': [3, 5, 10],  # Maximum depth of the trees
    'min_samples_split': [2, 10, 20],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 5, 10]  # Minimum number of samples required at a leaf node
}

# Define custom F2 scoring
f2_scorer = make_scorer(fbeta_score, beta=2)

# List of scoring metrics
scoring_metrics = ['roc_auc', 'neg_log_loss', 'f1', 'recall', f2_scorer]

# Loop through different scoring metrics
for scoring in scoring_metrics:
    print(f"Model performance using scoring: {scoring}")
    
    # Initialize RandomizedSearchCV with cross-validation
    random_search_gb = RandomizedSearchCV(estimator=gb, param_distributions=param_dist_gb, 
                                           n_iter=100, scoring=scoring, cv=kfold, n_jobs=-1, verbose=1, random_state=21)

    # Fit the model to the training data
    random_search_gb.fit(X_train_scaled, y_train, sample_weight=sample_weights)

    # Get the best model from the grid search
    best_gb_model = random_search_gb.best_estimator_
    print("Best Gradient Boosting hyperparameters found: ", random_search_gb.best_params_)
    
    # Perform cross-validation on the best model
    best_gb_model_cv = cross_val_score(best_gb_model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    print(f"Cross-validation {scoring} scores for {kfold.get_n_splits()} folds: ", best_gb_model_cv)
    print(f"Mean cross-validation {scoring} score =", round(np.mean(best_gb_model_cv), 3))
    print("...........................")

    # Predictions on test and train data
    y_test_pred_gb = best_gb_model.predict(X_test_scaled)
    y_test_pred_gb_proba = best_gb_model.predict_proba(X_test_scaled)[:, 1]
    y_train_pred_gb = best_gb_model.predict(X_train_scaled)
    y_train_pred_gb_proba = best_gb_model.predict_proba(X_train_scaled)[:, 1]

    # Accuracy
    gb_accuracy = accuracy_score(y_test, y_test_pred_gb)
    print(f"Test Accuracy using the best Gradient Boosting model: {gb_accuracy:.5f}")
    
    # AUC
    gb_auc = roc_auc_score(y_test, y_test_pred_gb_proba)
    print(f"Test AUC using the best Gradient Boosting model: {gb_auc:.5f}")

    # Generate and print the classification report
    print("Confusion matrix of the Test data:")
    print(confusion_matrix(y_test, y_test_pred_gb))
    print(classification_report(y_test, y_test_pred_gb))
    print("Confusion matrix of the Train data:")
    print(confusion_matrix(y_train, y_train_pred_gb))
    print(classification_report(y_train, y_train_pred_gb))

    # Log-loss
    print("The log-likelihood loss for GB (after tuning hyperparameters) from test set is", log_loss(y_test, y_test_pred_gb_proba))
    print("The log-likelihood loss for GB (after tuning hyperparameters) from train set is", log_loss(y_train, y_train_pred_gb_proba))

    # Train and test scores for the selected metric
    if scoring == 'roc_auc':
        train_score = roc_auc_score(y_train, y_train_pred_gb_proba)
        test_score = roc_auc_score(y_test, y_test_pred_gb_proba)
    elif scoring == 'f1':
        train_score = f1_score(y_train, y_train_pred_gb)
        test_score = f1_score(y_test, y_test_pred_gb)
    elif scoring == 'recall':
        train_score = recall_score(y_train, y_train_pred_gb)
        test_score = recall_score(y_test, y_test_pred_gb)
    elif scoring == f2_scorer:
        train_score = fbeta_score(y_train, y_train_pred_gb, beta=2)
        test_score = fbeta_score(y_test, y_test_pred_gb, beta=2)
    elif scoring == 'neg_log_loss':
        train_score = log_loss(y_train, y_train_pred_gb_proba)
        test_score = log_loss(y_test, y_test_pred_gb_proba)
    else:
        train_score, test_score = "N/A", "N/A"

    # Display the train and test scores for the selected scoring metric
    print(f"Train {scoring} score: {train_score:.5f}" if isinstance(train_score, (float, int)) else f"{scoring} score could not be calculated for train set.")
    print(f"Test {scoring} score: {test_score:.5f}" if isinstance(test_score, (float, int)) else f"{scoring} score could not be calculated for test set.")
    
    print("\n====================================\n")


Model performance using scoring: roc_auc
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Gradient Boosting hyperparameters found:  {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_depth': 3, 'learning_rate': 0.01}
Cross-validation roc_auc scores for 5 folds:  [0.83939394 0.8546073  0.84819282 0.77043597 0.83607877]
Mean cross-validation roc_auc score = 0.83
...........................
Test Accuracy using the best Gradient Boosting model: 0.74062
Test AUC using the best Gradient Boosting model: 0.84485
Confusion matrix of the Test data:
[[677 242]
 [  7  34]]
              precision    recall  f1-score   support

           0       0.99      0.74      0.84       919
           1       0.12      0.83      0.21        41

    accuracy                           0.74       960
   macro avg       0.56      0.78      0.53       960
weighted avg       0.95      0.74      0.82       960

Confusion matrix of the Train data:
[[2758  914]
 [  22  144]]

# XGBoost

In [38]:
import sys
!{sys.executable} -m pip install xgboost
import xgboost as xgb
from xgboost import XGBClassifier



In [39]:
# Initialize the model
xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Train the model
xgb_clf.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = xgb_clf.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
train_y_pred = xgb_clf.predict(X_train_scaled)
print(log_loss(y_train,train_y_pred))
print(log_loss(y_test,y_pred))

Accuracy: 95.52%
0.02699748024615316
1.547052691011334


Parameters: { "use_label_encoder" } are not used.



In [41]:
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, roc_auc_score, f1_score, recall_score, fbeta_score
import numpy as np

# Calculate class weights
class_weights = compute_class_weight(class_weight={0: 1, 1: 20}, classes=np.array([0, 1]), y=y_train)
scale_pos_weight = class_weights[1] / class_weights[0]

# Define parameter grid for XGBoost
param_dist_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Define custom F2 scoring
f2_scorer = make_scorer(fbeta_score, beta=2)

# List of scoring metrics
scoring_metrics = ['neg_log_loss', 'roc_auc', 'f1', 'recall', f2_scorer]

# Loop through different scoring metrics
for scoring in scoring_metrics:
    print(f"Model performance using scoring: {scoring}")

    # Initialize the XGBClassifier with scale_pos_weight
    xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss', 
                             scale_pos_weight=scale_pos_weight)

    # Initialize RandomizedSearchCV
    random_search_xgb = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_dist_xgb,
                                           n_iter=100, scoring=scoring, cv=kfold, n_jobs=-1, verbose=1, random_state=21)

    # Fit the model
    random_search_xgb.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_xgb_model = random_search_xgb.best_estimator_
    print("Best XGBoost hyperparameters found: ", random_search_xgb.best_params_)
    
    # Perform cross-validation on the best model
    best_xgb_model_cv = cross_val_score(best_xgb_model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    print(f"Cross-validation {scoring} scores for {kfold.get_n_splits()} folds: ", best_xgb_model_cv)
    print(f"Mean cross-validation {scoring} score =", round(np.mean(best_xgb_model_cv), 3))
    print("____________________")

    # Predict on the test and training data
    y_test_pred_xgb = best_xgb_model.predict(X_test_scaled)
    y_test_pred_xgb_proba = best_xgb_model.predict_proba(X_test_scaled)[:, 1]
    y_train_pred_xgb = best_xgb_model.predict(X_train_scaled)
    y_train_pred_xgb_proba = best_xgb_model.predict_proba(X_train_scaled)[:, 1]
    
    # Accuracy on test data
    xgb_accuracy = accuracy_score(y_test, y_test_pred_xgb)
    print(f"Test Accuracy using the best XGBoost model: {xgb_accuracy:.5f}")
    
    # AUC on test data
    xgb_auc = roc_auc_score(y_test, y_test_pred_xgb_proba)
    print(f"Test AUC using the best XGBoost model: {xgb_auc:.5f}")

    # Classification report and confusion matrix
    print("Confusion matrix of the Test data:")
    print(confusion_matrix(y_test, y_test_pred_xgb))
    print(classification_report(y_test, y_test_pred_xgb))
    print("Confusion matrix of the Train data:")
    print(confusion_matrix(y_train, y_train_pred_xgb))
    print(classification_report(y_train, y_train_pred_xgb))

    # Log-loss for both test and training data
    print("The log-likelihood loss for XGBoost (after tuning hyperparameters) from test set is", log_loss(y_test, y_test_pred_xgb_proba))
    print("The log-likelihood loss for XGBoost (after tuning hyperparameters) from training set is", log_loss(y_train, y_train_pred_xgb_proba))

    # Calculate train and test scores for the selected scoring metric
    if scoring == 'roc_auc':
        train_score = roc_auc_score(y_train, y_train_pred_xgb_proba)
        test_score = roc_auc_score(y_test, y_test_pred_xgb_proba)
    elif scoring == 'f1':
        train_score = f1_score(y_train, y_train_pred_xgb)
        test_score = f1_score(y_test, y_test_pred_xgb)
    elif scoring == 'recall':
        train_score = recall_score(y_train, y_train_pred_xgb)
        test_score = recall_score(y_test, y_test_pred_xgb)
    elif scoring == f2_scorer:
        train_score = fbeta_score(y_train, y_train_pred_xgb, beta=2)
        test_score = fbeta_score(y_test, y_test_pred_xgb, beta=2)
    elif scoring == 'neg_log_loss':
        train_score = log_loss(y_train, y_train_pred_xgb_proba)
        test_score = log_loss(y_test, y_test_pred_xgb_proba)
    else:
        train_score, test_score = "N/A", "N/A"

    # Display train and test scores for the selected scoring metric
    print(f"Train {scoring} score: {train_score:.5f}" if isinstance(train_score, (float, int)) else f"{scoring} score could not be calculated for train set.")
    print(f"Test {scoring} score: {test_score:.5f}" if isinstance(test_score, (float, int)) else f"{scoring} score could not be calculated for test set.")
    
    print("\n====================================\n")


Model performance using scoring: neg_log_loss
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best XGBoost hyperparameters found:  {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Cross-validation neg_log_loss scores for 5 folds:  [-0.17869777 -0.17333059 -0.18299881 -0.22116433 -0.17331334]
Mean cross-validation neg_log_loss score = -0.186
____________________
Test Accuracy using the best XGBoost model: 0.94479
Test AUC using the best XGBoost model: 0.82505
Confusion matrix of the Test data:
[[901  18]
 [ 35   6]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       919
           1       0.25      0.15      0.18        41

    accuracy                           0.94       960
   macro avg       0.61      0.56      0.58       960
weighted avg       0.93      0.94      0.94       960

Confusion matrix of the Train data:
[[3668    4]
 [   0  166]]
              preci