In [None]:

pip install xgboost 
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc
)
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Step 1: Read the three pre-split CSV files (the author notes a 60/20/20 split made beforehand).
# NOTE(review): the training file name suggests it was oversampled with Borderline-SMOTE - confirm.
train_data = pd.read_csv("train_Boarderline_smote_B_data.csv")  # training split
test_data = pd.read_csv("test_B_data.csv")  # held-out test split
eval_data = pd.read_csv("external_eval_B_data.csv")  # external evaluation split

# Step 2: Column 1 is taken as the label; columns 2 onward are taken as the features.
# Column 0 is not used (presumably an identifier - verify against the CSV schema).
X_train = train_data.iloc[:, 2:]
y_train = train_data.iloc[:, 1]

X_test = test_data.iloc[:, 2:]
y_test = test_data.iloc[:, 1]

X_eval = eval_data.iloc[:, 2:]
y_eval = eval_data.iloc[:, 1]

In [None]:
# Step 3: Base XGBoost classifier plus the hyperparameter grid to search.
# random_state is fixed for reproducibility; log loss is the evaluation metric.
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

param_grid = {
    'n_estimators': [50, 100, 200],      # number of boosting rounds
    'max_depth': [3, 6, 10],             # maximum tree depth
    'learning_rate': [0.01, 0.1, 0.2],   # step-size shrinkage, used to limit overfitting
    'subsample': [0.6, 0.8, 1.0],        # fraction of training rows sampled per tree
    'min_child_weight': [1],             # minimum sum of instance weight (hessian) in a child
}

# Step 4: Exhaustive grid search with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,   # one job at a time; -1 would use all available cores
    verbose=2,  # larger values print more progress messages
)
grid_search.fit(X_train, y_train)

# Step 5: Report the winning parameter combination.
print("Best Hyperparameters:", grid_search.best_params_)

# Step 6: Keep the estimator that GridSearchCV selected.
best_xgb_model = grid_search.best_estimator_

# Step 7: Persist the winning hyperparameters (one "name: value" line each) and the model.
with open('best_model_info_xgb.txt', 'w') as info_file:
    info_file.write("Best Hyperparameters:\n")
    info_file.writelines(
        f"{name}: {setting}\n" for name, setting in grid_search.best_params_.items()
    )

print("Best model information has been saved to 'best_model_info_xgb.txt'")

joblib.dump(best_xgb_model, "best_xgb_model.pkl")

In [None]:
# Step 8: Fit the selected model on the full training set.
best_xgb_model.fit(X_train, y_train)

# Steps 9-10: Hard class predictions for the test and external evaluation sets.
y_test_pred = best_xgb_model.predict(X_test)
y_eval_pred = best_xgb_model.predict(X_eval)

# Probability of class 1 (second column of predict_proba) for the same two sets.
y_test_prob = best_xgb_model.predict_proba(X_test)[:, 1]
y_eval_prob = best_xgb_model.predict_proba(X_eval)[:, 1]

# Step 11a: Accuracy, reported as a percentage.
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"XGBoost Test Accuracy: {test_accuracy * 100:.2f}%")
ext_accuracy = accuracy_score(y_eval, y_eval_pred)
print(f"XGBoost External Validation Accuracy: {ext_accuracy * 100:.2f}%")

# Step 11b: Confusion matrices (true labels first, predictions second).
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix (Test Data):\n", confusion_matrix_test)
confusion_matrix_eval = confusion_matrix(y_eval, y_eval_pred)
print("Confusion Matrix (Evaluation Data):\n", confusion_matrix_eval)

# Step 12: Compute specificity and sensitivity for test and evaluation data
def calculate_sensitivity_specificity(conf_matrix):
    """Return ``(sensitivity, specificity)`` for a binary confusion matrix.

    Parameters
    ----------
    conf_matrix : array-like with exactly four entries
        Read in row-major order as ``tn, fp, fn, tp``, i.e. ``[[tn, fp], [fn, tp]]``.

    Returns
    -------
    tuple of float
        ``sensitivity = tp / (tp + fn)`` and ``specificity = tn / (tn + fp)``.
        A rate whose denominator is zero (that class is absent from the true
        labels) is returned as ``nan`` instead of triggering a division by zero.

    Raises
    ------
    ValueError
        If the matrix does not contain exactly four entries, e.g. because only
        one class was present or the problem is not binary.
    """
    cells = np.asarray(conf_matrix, dtype=float).ravel()
    if cells.size != 4:
        raise ValueError(
            f"Expected a 2x2 binary confusion matrix, got {cells.size} entries"
        )
    tn, fp, fn, tp = cells.tolist()

    positives = tp + fn  # number of actual positives
    negatives = tn + fp  # number of actual negatives
    sensitivity = tp / positives if positives > 0 else float("nan")
    specificity = tn / negatives if negatives > 0 else float("nan")
    return sensitivity, specificity

sensitivity_test, specificity_test = calculate_sensitivity_specificity(confusion_matrix_test)
sensitivity_eval, specificity_eval = calculate_sensitivity_specificity(confusion_matrix_eval)
print("Test Data - Sensitivity:", sensitivity_test, "Specificity:", specificity_test)
print("Evaluation Data - Sensitivity:", sensitivity_eval, "Specificity:", specificity_eval)

# Step 13: ROC and AUC for Test and Evaluation Datasets
# Class-1 probabilities are the second column of predict_proba.
y_test_prob = best_xgb_model.predict_proba(X_test)[:, 1]
y_eval_prob = best_xgb_model.predict_proba(X_eval)[:, 1]

fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
roc_auc_test = auc(fpr_test, tpr_test)

fpr_eval, tpr_eval, _ = roc_curve(y_eval, y_eval_prob)
roc_auc_eval = auc(fpr_eval, tpr_eval)

# Both curves are drawn in ONE figure so that the saved file really contains
# the ROC curves its name promises.
roc_fig, roc_axes = plt.subplots(1, 2, figsize=(14, 6))
roc_panels = (
    (roc_axes[0], fpr_test, tpr_test,
     f"Test ROC curve (AUC = {roc_auc_test:.2f})", "ROC Curve - Test Data"),
    (roc_axes[1], fpr_eval, tpr_eval,
     f"Evaluation ROC curve (AUC = {roc_auc_eval:.2f})", "ROC Curve - Evaluation Data"),
)
for panel_ax, panel_fpr, panel_tpr, curve_label, panel_title in roc_panels:
    panel_ax.plot(panel_fpr, panel_tpr, label=curve_label)
    panel_ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    panel_ax.set_xlabel("False Positive Rate")
    panel_ax.set_ylabel("True Positive Rate")
    panel_ax.set_title(panel_title)
    panel_ax.legend(loc="lower right")
roc_fig.tight_layout()

# Save the figure BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() acts on a new, empty current figure and writes a blank image.
roc_fig.savefig("xgb_roc_auc_curves.png", dpi=500)
roc_fig.savefig("xgb_roc_auc_curves.pdf")
plt.show()

# Save the trained model to a file
model_filename = "best_xgb_model.pkl"
joblib.dump(best_xgb_model, model_filename)
print(f"Model saved as {model_filename}")

In [None]:
# Hand-configured XGBoost classifier that up-weights the positive class (class 1).
xgb_model_1 = xgb.XGBClassifier(
    # learning task and reproducibility
    objective='binary:logistic',
    random_state=42,
    # class weighting: extra weight on class 1 (the author also lists 1.5, 2.0 and 8 as values to try)
    scale_pos_weight=16,
    # ensemble size and step size (the author also lists 200 and 500 rounds as values to try)
    n_estimators=1000,
    learning_rate=0.2,
    # tree shape / regularisation
    max_depth=10,
    min_child_weight=1,
    gamma=0.1,
    # row and column sampling
    subsample=1.0,
    colsample_bytree=0.8,
)

# Train on the training split
xgb_model_1.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Class-1 probabilities from the weighted model
y_test_prob = xgb_model_1.predict_proba(X_test)[:, 1]
y_eval_prob = xgb_model_1.predict_proba(X_eval)[:, 1]

# Binarise at the default 0.5 cut-off
y_test_pred = (y_test_prob >= 0.5).astype(int)
y_eval_pred = (y_eval_prob >= 0.5).astype(int)

# Test-set metrics (class 1 is the positive label)
test_accuracy, test_precision, test_recall, test_f1 = (
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, pos_label=1),
    recall_score(y_test, y_test_pred, pos_label=1),
    f1_score(y_test, y_test_pred, pos_label=1),
)

# External-evaluation-set metrics (stored under the val_* names)
val_accuracy, val_precision, val_recall, val_f1 = (
    accuracy_score(y_eval, y_eval_pred),
    precision_score(y_eval, y_eval_pred, pos_label=1),
    recall_score(y_eval, y_eval_pred, pos_label=1),
    f1_score(y_eval, y_eval_pred, pos_label=1),
)

# Summary lines
print(f"Test Set: Accuracy = {test_accuracy:.2f}, Precision = {test_precision:.2f}, Recall = {test_recall:.2f}, F1 Score = {test_f1:.2f}")
print(f"Validation Set: Accuracy = {val_accuracy:.2f}, Precision = {val_precision:.2f}, Recall = {val_recall:.2f}, F1 Score = {val_f1:.2f}")

In [None]:
# Class-1 probabilities (second predict_proba column) for the test and external evaluation sets
y_test_prob, y_eval_prob = (
    xgb_model_1.predict_proba(X_test)[:, 1],
    xgb_model_1.predict_proba(X_eval)[:, 1],
)

In [None]:
# TUNING THE THRESHOLD
import numpy as np
# precision_score / recall_score are imported here as well so this cell does not
# depend on an earlier cell's import having been executed.
from sklearn.metrics import f1_score, precision_score, recall_score

y_test_prob = xgb_model_1.predict_proba(X_test)[:, 1]  # Probabilities for Class 1
y_eval_prob = xgb_model_1.predict_proba(X_eval)[:, 1]

# NOTE(review): the threshold is selected by maximising F1 on the TEST set and the
# test metrics are then reported at that same threshold, so the test numbers below
# are optimistically biased. Only the external evaluation metrics are untouched by
# the selection. Consider choosing the threshold on a separate validation split.
best_threshold = 0.5
best_f1 = 0

# Scan candidate cut-offs from 0.10 up to (but not including) 0.90 in steps of 0.05
for threshold in np.arange(0.1, 0.9, 0.05):
    y_test_adjusted = (y_test_prob >= threshold).astype(int)
    f1 = f1_score(y_test, y_test_adjusted, pos_label=1)
    if f1 > best_f1:  # strict '>' keeps the lowest threshold on ties
        best_f1 = f1
        best_threshold = threshold

print(f"Best Threshold: {best_threshold}, Best F1 Score: {best_f1:.2f}")

# Use the best threshold
y_test_final = (y_test_prob >= best_threshold).astype(int)
y_eval_final = (y_eval_prob >= best_threshold).astype(int)

# Evaluate final metrics
test_precision = precision_score(y_test, y_test_final, pos_label=1)
test_recall = recall_score(y_test, y_test_final, pos_label=1)
test_f1 = f1_score(y_test, y_test_final, pos_label=1)

eval_precision = precision_score(y_eval, y_eval_final, pos_label=1)
eval_recall = recall_score(y_eval, y_eval_final, pos_label=1)
eval_f1 = f1_score(y_eval, y_eval_final, pos_label=1)

print("\nTest Set Metrics (Optimized Threshold):")
print(f"Precision: {test_precision:.2f}, Recall: {test_recall:.2f}, F1 Score: {test_f1:.2f}")

# Report the eval_* values computed at the optimised threshold (the stale val_*
# variables hold the metrics from the default 0.5 threshold).
print("\nValidation Set Metrics (Optimized Threshold):")
print(f"Precision: {eval_precision:.2f}, Recall: {eval_recall:.2f}, F1 Score: {eval_f1:.2f}")

In [None]:
import pickle

# Serialise the weighted XGBoost model to disk with pickle
with open('tuned_xgb_bcell_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(xgb_model_1))

print("Tuned model saved as 'tuned_xgb_bcell_model.pkl'.")

In [None]:
# Step 8: Fit the weighted model on the full training set.
xgb_model_1.fit(X_train, y_train)

# Steps 9-10: Hard class predictions for the test and external evaluation sets.
y_test_pred = xgb_model_1.predict(X_test)
y_eval_pred = xgb_model_1.predict(X_eval)

# Probability of class 1 (second column of predict_proba) for the same two sets.
y_test_prob = xgb_model_1.predict_proba(X_test)[:, 1]
y_eval_prob = xgb_model_1.predict_proba(X_eval)[:, 1]

# Step 11a: Accuracy, reported as a percentage.
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"XGBoost Test Accuracy: {test_accuracy * 100:.2f}%")
ext_accuracy = accuracy_score(y_eval, y_eval_pred)
print(f"XGBoost External Validation Accuracy: {ext_accuracy * 100:.2f}%")

# Step 11b: Confusion matrices (true labels first, predictions second).
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix (Test Data):\n", confusion_matrix_test)
confusion_matrix_eval = confusion_matrix(y_eval, y_eval_pred)
print("Confusion Matrix (Evaluation Data):\n", confusion_matrix_eval)

# Step 12: Compute specificity and sensitivity for test and evaluation data
# NOTE(review): this function is also defined in an earlier cell of this notebook;
# this later definition shadows it. Consider keeping a single definition.
def calculate_sensitivity_specificity(conf_matrix):
    """Return ``(sensitivity, specificity)`` for a binary confusion matrix.

    Parameters
    ----------
    conf_matrix : array-like with exactly four entries
        Read in row-major order as ``tn, fp, fn, tp``, i.e. ``[[tn, fp], [fn, tp]]``.

    Returns
    -------
    tuple of float
        ``sensitivity = tp / (tp + fn)`` and ``specificity = tn / (tn + fp)``.
        A rate whose denominator is zero (that class is absent from the true
        labels) is returned as ``nan`` instead of triggering a division by zero.

    Raises
    ------
    ValueError
        If the matrix does not contain exactly four entries, e.g. because only
        one class was present or the problem is not binary.
    """
    cells = np.asarray(conf_matrix, dtype=float).ravel()
    if cells.size != 4:
        raise ValueError(
            f"Expected a 2x2 binary confusion matrix, got {cells.size} entries"
        )
    tn, fp, fn, tp = cells.tolist()

    positives = tp + fn  # number of actual positives
    negatives = tn + fp  # number of actual negatives
    sensitivity = tp / positives if positives > 0 else float("nan")
    specificity = tn / negatives if negatives > 0 else float("nan")
    return sensitivity, specificity

sensitivity_test, specificity_test = calculate_sensitivity_specificity(confusion_matrix_test)
sensitivity_eval, specificity_eval = calculate_sensitivity_specificity(confusion_matrix_eval)
print("Test Data - Sensitivity:", sensitivity_test, "Specificity:", specificity_test)
print("Evaluation Data - Sensitivity:", sensitivity_eval, "Specificity:", specificity_eval)

# Step 13: ROC and AUC for Test and Evaluation Datasets
# Class-1 probabilities are the second column of predict_proba.
y_test_prob = xgb_model_1.predict_proba(X_test)[:, 1]
y_eval_prob = xgb_model_1.predict_proba(X_eval)[:, 1]

fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
roc_auc_test = auc(fpr_test, tpr_test)

fpr_eval, tpr_eval, _ = roc_curve(y_eval, y_eval_prob)
roc_auc_eval = auc(fpr_eval, tpr_eval)

# Both curves are drawn in ONE figure so that the saved file really contains
# the ROC curves its name promises.
roc_fig, roc_axes = plt.subplots(1, 2, figsize=(14, 6))
roc_panels = (
    (roc_axes[0], fpr_test, tpr_test,
     f"Test ROC curve (AUC = {roc_auc_test:.2f})", "ROC Curve - Test Data"),
    (roc_axes[1], fpr_eval, tpr_eval,
     f"Evaluation ROC curve (AUC = {roc_auc_eval:.2f})", "ROC Curve - Evaluation Data"),
)
for panel_ax, panel_fpr, panel_tpr, curve_label, panel_title in roc_panels:
    panel_ax.plot(panel_fpr, panel_tpr, label=curve_label)
    panel_ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    panel_ax.set_xlabel("False Positive Rate")
    panel_ax.set_ylabel("True Positive Rate")
    panel_ax.set_title(panel_title)
    panel_ax.legend(loc="lower right")
roc_fig.tight_layout()

# Save the figure BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() acts on a new, empty current figure and writes a blank image.
roc_fig.savefig("tuned_xgb_bcell_roc_auc_curves.png", dpi=500)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC points (including the decision thresholds) for the test and external evaluation sets
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_prob)
fpr_eval, tpr_eval, thresholds_eval = roc_curve(y_eval, y_eval_prob)

# Area under each curve
roc_auc_test = auc(fpr_test, tpr_test)
roc_auc_eval = auc(fpr_eval, tpr_eval)

# Dump the raw curve values
print("Test Data ROC AUC:", roc_auc_test)
print("Test Data FPR:", fpr_test)
print("Test Data TPR:", tpr_test)
print("Test Data Thresholds:", thresholds_test)

print("Evaluation Data ROC AUC:", roc_auc_eval)
print("Evaluation Data FPR:", fpr_eval)
print("Evaluation Data TPR:", tpr_eval)
print("Evaluation Data Thresholds:", thresholds_eval)

# One row, two panels: test set on the left, external evaluation set on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = (
    (axes[0], fpr_test, tpr_test, roc_auc_test, "blue", "Test Data - ROC Curve"),
    (axes[1], fpr_eval, tpr_eval, roc_auc_eval, "green", "External Validation - ROC Curve"),
)
for panel_ax, panel_fpr, panel_tpr, panel_auc, line_color, panel_title in panels:
    panel_ax.plot(panel_fpr, panel_tpr, label=f"ROC AUC = {panel_auc:.2f}", color=line_color)
    panel_ax.plot([0, 1], [0, 1], 'k--', label="Random Classifier (AUC = 0.50)")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("False Positive Rate")
    panel_ax.set_ylabel("True Positive Rate")
    panel_ax.legend(loc="lower right")

# Tidy the spacing between panels.
# Saving this figure is left disabled; to enable it, call
# fig.savefig("tuned_xgb_bcell_roc_auc_curves_1.png", dpi=500) before plt.show().
plt.tight_layout()

# Render the figure
plt.show()


In [None]:
# Per-class precision / recall / F1 tables for the test and external evaluation sets
report_test = classification_report(y_test, y_test_pred)
print("Classification Report (Test Data):\n", report_test)
report_eval = classification_report(y_eval, y_eval_pred)
print("Classification Report (Evaluation Data):\n", report_eval)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the current hard predictions on each set
f1_test, f1_eval = f1_score(y_test, y_test_pred), f1_score(y_eval, y_eval_pred)

print("F1 Score (Test Data):", f1_test)
print("F1 Score (Evaluation Data):", f1_eval)

In [None]:
# Perform 10-fold cross-validation (accuracy) on the weighted model, xgb_model_1.
# cross_val_score fits clones of the estimator, so xgb_model_1 itself is left unchanged.
cross_val_scores = cross_val_score(xgb_model_1, X_train, y_train, cv=10, scoring='accuracy')
print("10-Fold Cross-Validation Accuracy Scores:", cross_val_scores)
print("Mean 10-Fold CV Accuracy:", cross_val_scores.mean())

# Save the accuracy of each fold to a text file. The path is held in one variable
# so the confirmation message always names the file that was actually written.
cv_accuracy_path = 'cv_accuracies_xgb_tuned.txt'
with open(cv_accuracy_path, 'w') as f:
    f.write("Cross-Validation Accuracies for each fold:\n")
    for i, score in enumerate(cross_val_scores):
        f.write(f"Fold {i+1} Accuracy: {score:.4f}\n")

print(f"Accuracy of each fold has been saved to '{cv_accuracy_path}'")


In [None]:
# Plot the ROC curve of every cross-validation fold together with the mean ROC curve,
# and print the mean accuracy and the mean per-fold AUC.
mean_accuracy = cross_val_scores.mean() * 100
print(f"Mean Accuracy: {mean_accuracy:.2f}")

cv = StratifiedKFold(n_splits=10)
tprs = []  # per-fold TPR interpolated onto the common FPR grid
aucs = []  # per-fold AUC
mean_fpr = np.linspace(0, 1, 100)

plt.figure()  # start from a fresh figure so earlier plots cannot leak into this one
for i, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    # Fit an unfitted copy per fold: refitting xgb_model_1 in place would leave it
    # trained on only nine of the ten folds after this cell has run.
    fold_model = clone(xgb_model_1)
    fold_model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = fold_model.predict_proba(X_train.iloc[test_idx])[:, 1]

    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_prob)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0  # every ROC curve starts at the origin
    tprs.append(interp_tpr)

    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label=f'Fold {i+1} (AUC = {roc_auc:.2f})')

# Mean of the held-out fold AUCs. (Scoring the fully fitted model on X_train
# would give a resubstitution AUC, not a cross-validated mean.)
mean_fold_auc = np.mean(aucs)
print(f"Mean AUC: {mean_fold_auc:.4f}")

plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance diagonal
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # every ROC curve ends at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)  # area under the averaged curve
plt.plot(mean_fpr, mean_tpr, label=f'Mean ROC (AUC = {mean_auc:.2f})', lw=2)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.savefig("cv_roc_curve_xgb_tuned_bcell.png", dpi=500)  # save before show
plt.show()