AdaBoost Model Script

#importing libraries

In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc
)
import matplotlib.pyplot as plt

In [None]:
# Step 1: Load the datasets (they were already split 60/20/20 and saved as CSV files)
train_data = pd.read_csv("train_Boarderline_smote_B_data.csv")  # training data
test_data = pd.read_csv("test_B_data.csv")  # test data
eval_data = pd.read_csv("external_eval_B_data.csv")  # external evaluation data

# Step 2: Separate labels and features.
# In every frame column 1 is taken as the label and columns 2 onward as the
# features; column 0 is not used.
X_train = train_data.iloc[:, 2:]
y_train = train_data.iloc[:, 1]

X_test = test_data.iloc[:, 2:]
y_test = test_data.iloc[:, 1]

X_eval = eval_data.iloc[:, 2:]
y_eval = eval_data.iloc[:, 1]

In [None]:
# Weak learner for the ensemble: a depth-1 decision tree with a fixed seed
base_estimator = DecisionTreeClassifier(random_state=42, max_depth=1)

# AdaBoost model built on that tree; the tree is handed over through the
# `estimator` keyword, and the ensemble gets its own fixed seed.
adaboost_model = AdaBoostClassifier(
    random_state=42,
    estimator=base_estimator,
)

In [None]:
# Hyperparameter search space. Keys prefixed with `estimator__` are forwarded
# to the DecisionTreeClassifier wrapped by the AdaBoost model.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases --
# confirm that the installed version still accepts it.
param_grid = dict(
    n_estimators=[50, 100, 200],                # number of weak learners
    learning_rate=[0.01, 0.1, 1.0],             # contribution of each weak learner
    algorithm=['SAMME.R'],                      # boosting algorithm (single choice)
    estimator__max_depth=[1, 2, 3],             # depth of the base decision tree
    estimator__max_features=['sqrt', 'log2'],   # features considered per split
)

# Grid search over every combination above, using 5-fold cross-validation,
# accuracy as the selection metric, and a single worker process.
grid_search = GridSearchCV(
    adaboost_model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Write the winning combination to a text file, one "name: value" line each
with open('best_model_info_adaboost_bcell.txt', 'w') as f:
    f.write("Best Hyperparameters:\n")
    f.writelines(f"{name}: {setting}\n" for name, setting in best_params.items())

print("Best model information has been saved to 'best_model_info_adaboost_bcell.txt'")

In [None]:
# Step 6: Take the best estimator found by the grid search.
# The search does not override `refit`, so with scikit-learn's default this
# estimator has already been refitted on the data passed to grid_search.fit.
best_adaboost_model = grid_search.best_estimator_

# Persist the selected model to disk with joblib
joblib.dump(value=best_adaboost_model, filename="best_adaboost_bcell_model.pkl")

In [None]:
# Step 8: Fit the selected model on the training data.
# NOTE(review): the estimator taken from the grid search is presumably already
# fitted on this same data, so this refit may be redundant -- confirm before
# removing it.
best_adaboost_model.fit(X_train, y_train)

# Class predictions for the test and external evaluation datasets
y_test_pred = best_adaboost_model.predict(X_test)
y_eval_pred = best_adaboost_model.predict(X_eval)

# Predicted probabilities, keeping only column 1 of predict_proba's output
y_test_prob = best_adaboost_model.predict_proba(X_test)[:, 1]
y_eval_prob = best_adaboost_model.predict_proba(X_eval)[:, 1]

# Accuracy on both datasets, printed as a percentage
test_accuracy = accuracy_score(y_test, y_test_pred)
eval_accuracy = accuracy_score(y_eval, y_eval_pred)
print(f"AdaBoost Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"AdaBoost Evaluation Accuracy: {eval_accuracy * 100:.2f}%")

# Confusion matrices for both datasets
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)
confusion_matrix_eval = confusion_matrix(y_eval, y_eval_pred)
print("Confusion Matrix (Test Data):\n", confusion_matrix_test)
print("Confusion Matrix (Evaluation Data):\n", confusion_matrix_eval)

# Sensitivity and Specificity calculation
def calculate_sensitivity_specificity(conf_matrix):
    """Return (sensitivity, specificity) for a binary confusion matrix.

    Parameters
    ----------
    conf_matrix : array-like
        Four counts that flatten, in row-major order, to
        ``tn, fp, fn, tp`` (i.e. ``[[tn, fp], [fn, tp]]``). NumPy arrays and
        nested lists are both accepted.

    Returns
    -------
    tuple of float
        ``(sensitivity, specificity)`` where sensitivity is ``tp / (tp + fn)``
        and specificity is ``tn / (tn + fp)``. A rate whose denominator is
        zero (no positive or no negative samples) is returned as ``nan``.

    Raises
    ------
    ValueError
        If ``conf_matrix`` does not contain exactly four counts, e.g. when
        only one class was present and the matrix is not 2x2.
    """
    counts = np.asarray(conf_matrix)
    if counts.size != 4:
        # Fail with an explicit message instead of a cryptic unpacking error.
        raise ValueError(
            "conf_matrix must hold exactly four counts (tn, fp, fn, tp); "
            f"got an array of shape {counts.shape}"
        )

    tn, fp, fn, tp = (float(count) for count in counts.ravel())
    actual_positives = tp + fn
    actual_negatives = tn + fp

    # A rate is undefined when its class has no samples; return NaN explicitly
    # rather than relying on a NumPy divide-by-zero warning.
    sensitivity = tp / actual_positives if actual_positives else float("nan")
    specificity = tn / actual_negatives if actual_negatives else float("nan")
    return sensitivity, specificity

# Derive sensitivity and specificity from each confusion matrix
test_rates = calculate_sensitivity_specificity(confusion_matrix_test)
eval_rates = calculate_sensitivity_specificity(confusion_matrix_eval)
sensitivity_test, specificity_test = test_rates
sensitivity_eval, specificity_eval = eval_rates

# Report both rates, test dataset first and evaluation dataset second
print(
    "Test Data - Sensitivity:", sensitivity_test,
    "Specificity:", specificity_test,
)
print(
    "Evaluation Data - Sensitivity:", sensitivity_eval,
    "Specificity:", specificity_eval,
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve points and area under the curve for the test dataset
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_prob)
roc_auc_test = auc(fpr_test, tpr_test)

# ROC curve points and area under the curve for the evaluation dataset
fpr_eval, tpr_eval, thresholds_eval = roc_curve(y_eval, y_eval_prob)
roc_auc_eval = auc(fpr_eval, tpr_eval)

# Print the raw ROC values for the test dataset
print("Test Data ROC AUC:", roc_auc_test)
print("Test Data FPR:", fpr_test)
print("Test Data TPR:", tpr_test)
print("Test Data Thresholds:", thresholds_test)

# Print the raw ROC values for the evaluation dataset
print("Evaluation Data ROC AUC:", roc_auc_eval)
print("Evaluation Data FPR:", fpr_eval)
print("Evaluation Data TPR:", tpr_eval)
print("Evaluation Data Thresholds:", thresholds_eval)

# One figure with two side-by-side panels, one per dataset
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Each entry: (axis, FPR, TPR, AUC, panel title, curve colour)
roc_panels = (
    (axes[0], fpr_test, tpr_test, roc_auc_test, "Test Data - ROC Curve", "blue"),
    (axes[1], fpr_eval, tpr_eval, roc_auc_eval, "External Validation - ROC Curve", "green"),
)
for ax, fpr, tpr, roc_auc, title, color in roc_panels:
    ax.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.2f}", color=color)
    # Diagonal reference line for a random classifier
    ax.plot([0, 1], [0, 1], 'k--', label="Random Classifier (AUC = 0.50)")
    ax.set_title(title)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend(loc="lower right")

# Tidy the layout and write the figure to disk before displaying it
plt.tight_layout()
plt.savefig("adaboost_bcell_roc_auc_curves.png", dpi=500)
print("ROC curves saved as 'adaboost_bcell_roc_auc_curves.png'.")

# Show the plot
plt.show()

In [None]:
from sklearn.metrics import f1_score
# F1 score of the class predictions on the test dataset
f1_test = f1_score(y_test, y_test_pred)
print("F1 Score (Test Data):", f1_test)

# F1 score of the class predictions on the evaluation dataset
f1_eval = f1_score(y_eval, y_eval_pred)
print("F1 Score (Evaluation Data):", f1_eval)

In [None]:
# Build the classification report for the test and evaluation data
report_test = classification_report(y_test, y_test_pred)
report_eval = classification_report(y_eval, y_eval_pred)

# Print each report under its own heading
print("Classification Report (Test Data):\n", report_test)
print("Classification Report (Evaluation Data):\n", report_eval)

In [None]:
# 10-fold cross-validated accuracy of the selected model on the training data.
# NOTE(review): the training file name suggests the data were oversampled with
# Borderline-SMOTE before this step; if so, these fold scores may be
# optimistic -- confirm how the training CSV was produced.
cross_val_scores = cross_val_score(
    best_adaboost_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=10,
)
mean_cv_accuracy = cross_val_scores.mean()
print("10-Fold Cross-Validation Accuracy Scores:", cross_val_scores)
print("Mean 10-Fold CV Accuracy:", mean_cv_accuracy)

# Write one line per fold (numbered from 1) to a text file
with open('cv_accuracies_bcell_adaboost.txt', 'w') as f:
    f.write("Cross-Validation Accuracies for each fold:\n")
    f.writelines(
        f"Fold {fold_number} Accuracy: {score:.4f}\n"
        for fold_number, score in enumerate(cross_val_scores, start=1)
    )

print("Accuracy of each fold has been saved to 'cv_accuracies_bcell_adaboost.txt'")