In [None]:
!pip install --user scikeras==0.13.0
import sklearn
import scikeras
print("scikit-learn version:", sklearn.__version__)
print("scikeras version:", scikeras.__version__)

import pandas as pd
import numpy as np
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, auc
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [None]:
# Step 0: Seed the random number generators so that weight initialisation, dropout,
# batch shuffling and the validation split repeat on "Restart & Run All".
from tensorflow.keras.utils import set_random_seed

SEED = 42
set_random_seed(SEED)

# Step 1: Load the training, test, and external validation datasets
train_data = pd.read_csv("train_Boarderline_smote_B_data.csv")
test_data = pd.read_csv("test_B_data.csv")
eval_data = pd.read_csv("external_eval_B_data.csv")

# Step 2: Separate labels and features
# Column 1 is read as the class label and columns 2.. as the features; column 0 is
# not used (presumably an identifier -- confirm against the CSV schema).
y_train, X_train = train_data.iloc[:, 1], train_data.iloc[:, 2:]
y_test, X_test = test_data.iloc[:, 1], test_data.iloc[:, 2:]
y_eval, X_eval = eval_data.iloc[:, 1], eval_data.iloc[:, 2:]

# Convert labels to one-hot vectors. The width is fixed explicitly: without
# num_classes, to_categorical infers it from the largest label present, so a split
# that happens to contain only class 0 would get a single column and no longer
# match the 2-unit softmax output.
NUM_CLASSES = 2
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)
y_eval = to_categorical(y_eval, num_classes=NUM_CLASSES)

# Step 3: Define the Neural Network
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')  # one output node per class
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Hold out a stratified slice of the training data for early stopping. Monitoring
# the test set instead would let it choose the stopping epoch and the restored
# weights, which biases every test metric reported below.
# NOTE(review): the file name suggests the training set was oversampled with
# Borderline-SMOTE; if so, synthetic points may land on both sides of this split.
# Splitting before oversampling would be cleaner -- confirm with the data pipeline.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=np.argmax(y_train, axis=1),
    random_state=SEED,
)

# Step 4: Train the model (X_train / y_train stay intact for later cells)
history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Step 5: Evaluate the model on test and external validation data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
eval_loss, eval_accuracy = model.evaluate(X_eval, y_eval, verbose=0)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"External Validation Accuracy: {eval_accuracy * 100:.2f}%")

# Step 6: Classification Reports and Confusion Matrices
# Collapse one-hot targets and softmax outputs to integer class ids once.
y_test_labels = np.argmax(y_test, axis=1)
y_eval_labels = np.argmax(y_eval, axis=1)
y_test_pred = np.argmax(model.predict(X_test), axis=1)
y_eval_pred = np.argmax(model.predict(X_eval), axis=1)

print("\nTest Data - Classification Report:\n", classification_report(y_test_labels, y_test_pred))
print("\nExternal Validation - Classification Report:\n", classification_report(y_eval_labels, y_eval_pred))

# labels=[0, 1] keeps the matrix 2x2 even if a class is absent from a split.
print("\nTest Data - Confusion Matrix:\n", confusion_matrix(y_test_labels, y_test_pred, labels=[0, 1]))
print("\nExternal Validation - Confusion Matrix:\n", confusion_matrix(y_eval_labels, y_eval_pred, labels=[0, 1]))

In [None]:
# Step 7: ROC curves for the test set and the external validation set.
# Column 1 of the model output is used as the score for class 1.
y_test_prob = model.predict(X_test)[:, 1]
y_eval_prob = model.predict(X_eval)[:, 1]

fpr_test, tpr_test, _ = roc_curve(np.argmax(y_test, axis=1), y_test_prob)
roc_auc_test = auc(fpr_test, tpr_test)

fpr_eval, tpr_eval, _ = roc_curve(np.argmax(y_eval, axis=1), y_eval_prob)
roc_auc_eval = auc(fpr_eval, tpr_eval)


def _draw_roc_panel(ax, false_pos, true_pos, area, colour, heading):
    """Draw one ROC curve plus the chance diagonal on the given axes."""
    ax.plot(false_pos, true_pos, color=colour, lw=2, label=f'ROC AUC = {area:.2f}')
    ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
    ax.set_title(heading)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc='lower right')


# Side-by-side panels: test data on the left, external validation on the right.
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(10, 5))
_draw_roc_panel(ax_left, fpr_test, tpr_test, roc_auc_test, 'blue', 'Test Data - ROC Curve')
_draw_roc_panel(ax_right, fpr_eval, tpr_eval, roc_auc_eval, 'green', 'External Validation - ROC Curve')
fig.tight_layout()

# Write the figure to disk before showing it.
fig.savefig("nn_roc_auc_bcell_curves.png", dpi=500)
plt.show()

# Step 8: Persist the trained network in two forms.
# (a) Keras HDF5 file.
model.save("nn_epitope_bcell_classifier.h5")
print("Model saved as 'nn_epitope_bcell_classifier.h5'")

# (b) joblib pickle holding an (architecture JSON, weight list) tuple.
from joblib import dump

model_file = "nn_epitope_bcell_classifier.pkl"
architecture_and_weights = (model.to_json(), model.get_weights())
dump(architecture_and_weights, model_file)

print(f"Model saved as '{model_file}'")

In [None]:
from sklearn.metrics import roc_curve, auc


def _print_roc_summary(heading, true_pos_rate, false_pos_rate, cutoffs, area):
    """Print the ROC operating points and the AUC under the given heading."""
    print(heading)
    print(f"True Positive Rate (TPR): {true_pos_rate}")
    print(f"False Positive Rate (FPR): {false_pos_rate}")
    print(f"Thresholds: {cutoffs}")
    print(f"ROC AUC: {area:.2f}")


# Scores for class 1 (column 1 of the model output) on both evaluation sets.
y_test_prob = model.predict(X_test)[:, 1]
y_eval_prob = model.predict(X_eval)[:, 1]

# ROC operating points (FPR, TPR, thresholds) and the area under each curve.
fpr_test, tpr_test, thresholds_test = roc_curve(np.argmax(y_test, axis=1), y_test_prob)
roc_auc_test = auc(fpr_test, tpr_test)

fpr_eval, tpr_eval, thresholds_eval = roc_curve(np.argmax(y_eval, axis=1), y_eval_prob)
roc_auc_eval = auc(fpr_eval, tpr_eval)

# Report both sets in the same layout.
_print_roc_summary("Test Data Metrics:", tpr_test, fpr_test, thresholds_test, roc_auc_test)
_print_roc_summary("\nExternal Validation Data Metrics:", tpr_eval, fpr_eval, thresholds_eval, roc_auc_eval)


In [None]:
from sklearn.metrics import confusion_matrix


def _safe_rate(hits, total):
    """Return ``hits / total``, or NaN when ``total`` is zero (the rate is undefined)."""
    return hits / total if total else float("nan")


# Convert one-hot encoded true labels and model outputs to single class labels
y_test_labels = np.argmax(y_test, axis=1)
y_eval_labels = np.argmax(y_eval, axis=1)
y_test_pred = np.argmax(model.predict(X_test), axis=1)
y_eval_pred = np.argmax(model.predict(X_eval), axis=1)

# labels=[0, 1] pins the layout to [[TN, FP], [FN, TP]]. Without it the matrix
# shrinks to 1x1 whenever only one class occurs among the true and predicted labels,
# and sensitivity/specificity could not be computed at all.

# Confusion Matrix for Test Data
cm_test = confusion_matrix(y_test_labels, y_test_pred, labels=[0, 1])
print("Confusion Matrix (Test Data):\n", cm_test)

# Confusion Matrix for External Validation Data
cm_eval = confusion_matrix(y_eval_labels, y_eval_pred, labels=[0, 1])
print("Confusion Matrix (External Validation Data):\n", cm_eval)

# For Test Data
tn_test, fp_test, fn_test, tp_test = cm_test.ravel()
# Sensitivity (recall of class 1); NaN if the set holds no class-1 samples.
test_sensitivity = _safe_rate(tp_test, tp_test + fn_test)
# Specificity (recall of class 0); NaN if the set holds no class-0 samples.
test_specificity = _safe_rate(tn_test, tn_test + fp_test)

print(f"Test Sensitivity: {test_sensitivity:.2f}")
print(f"Test Specificity: {test_specificity:.2f}")

# For External Validation Data
tn_eval, fp_eval, fn_eval, tp_eval = cm_eval.ravel()
eval_sensitivity = _safe_rate(tp_eval, tp_eval + fn_eval)
eval_specificity = _safe_rate(tn_eval, tn_eval + fp_eval)

print(f"External Validation Sensitivity: {eval_sensitivity:.2f}")
print(f"External Validation Specificity: {eval_specificity:.2f}")


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import numpy as np
import matplotlib.pyplot as plt
import joblib

from tensorflow.keras.models import clone_model, load_model

# Load the saved model. Its weights were already fitted on the full training set,
# so in the loop below it serves only as an architecture template.
loaded_model = load_model("nn_epitope_bcell_classifier.h5")

# Recompile the model with a fresh optimizer
loaded_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Define StratifiedKFold for 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Variables to store fold metrics
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid for averaging the fold curves
accuracies = []

# StratifiedKFold needs integer class ids, not one-hot rows.
y_train_labels = np.argmax(y_train, axis=1)

# Perform 10-fold cross-validation
for i, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train_labels)):
    # Split data for this fold
    X_fold_train, X_fold_test = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_fold_train, y_fold_test = y_train[train_idx], y_train[test_idx]

    # Start every fold from an untrained copy of the architecture. Re-fitting
    # `loaded_model` itself would score each fold with weights that have already
    # seen the held-out rows (in the original fit and in earlier folds), which
    # inflates the cross-validation metrics.
    fold_model = clone_model(loaded_model)
    fold_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Fit the model
    fold_model.fit(X_fold_train, y_fold_train, epochs=50, batch_size=32, verbose=0)

    # Predict probabilities for class 1
    y_prob = fold_model.predict(X_fold_test)[:, 1]

    # Calculate ROC curve and AUC for this fold
    fpr, tpr, _ = roc_curve(y_train_labels[test_idx], y_prob)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Plot this fold's ROC curve
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label=f'Fold {i+1} (AUC = {roc_auc:.2f})')

    # Calculate accuracy for this fold (index 1 = the 'accuracy' metric)
    accuracy = fold_model.evaluate(X_fold_test, y_fold_test, verbose=0)[1]
    accuracies.append(accuracy)

# Plot mean ROC curve
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, label=f'Mean ROC (AUC = {mean_auc:.2f})')

plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.tight_layout()

# Save the ROC curve plot
plt.savefig("cv_roc_curve_bcell_nn.png", dpi=500)
plt.show()

# Save accuracies and mean accuracy to a text file
with open('cv_accuracies_bcell_nn.txt', 'w') as f:
    f.write("Cross-Validation Accuracies for each fold:\n")
    for i, accuracy in enumerate(accuracies):
        f.write(f"Fold {i+1} Accuracy: {accuracy:.4f}\n")
    f.write(f"\nMean 10-Fold CV Accuracy: {np.mean(accuracies):.4f}\n")
    f.write(f"Standard Deviation of Accuracy: {np.std(accuracies):.4f}\n")

print("Accuracies and ROC curve saved successfully!")

In [None]:
# Echo the per-fold accuracies collected in `accuracies`, then summarise them.
print("Cross-Validation Accuracies for each fold:")
for fold_no, fold_acc in enumerate(accuracies, start=1):
    print(f"Fold {fold_no} Accuracy: {fold_acc:.4f}")

# Mean and standard deviation across folds (np.std with its default ddof).
mean_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)

print(f"\nMean 10-Fold CV Accuracy: {mean_accuracy:.4f}")
print(f"Standard Deviation of Accuracy: {std_accuracy:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Convert one-hot encoded true labels to single class labels
y_test_labels = np.argmax(y_test, axis=1)
y_eval_labels = np.argmax(y_eval, axis=1)

# Predicted class = index of the largest model output per row
y_test_pred = np.argmax(model.predict(X_test), axis=1)
y_eval_pred = np.argmax(model.predict(X_eval), axis=1)

# labels=[0, 1] fixes the layout to [[TN, FP], [FN, TP]]. Without it the matrix
# collapses to 1x1 whenever only one class occurs among the true and predicted
# labels, and a binary breakdown is then impossible.

# Confusion Matrix for Test Data
cm_test = confusion_matrix(y_test_labels, y_test_pred, labels=[0, 1])
print("Confusion Matrix (Test Data):\n", cm_test)

# Confusion Matrix for External Validation Data
cm_eval = confusion_matrix(y_eval_labels, y_eval_pred, labels=[0, 1])
print("Confusion Matrix (External Validation Data):\n", cm_eval)

In [None]:
# Sensitivity and specificity from the confusion matrices `cm_test` / `cm_eval`.
# A 2x2 matrix is unpacked row by row as [[TN, FP], [FN, TP]]; any other shape is
# reported as non-binary and skipped.

# --- Test data ---
if cm_test.shape != (2, 2):
    print("Confusion matrix is not binary. Cannot compute sensitivity/specificity.")
else:
    (tn_test, fp_test), (fn_test, tp_test) = cm_test

    # Sensitivity: share of true class-1 samples that were predicted as class 1.
    test_sensitivity = tp_test / (tp_test + fn_test)
    # Specificity: share of true class-0 samples that were predicted as class 0.
    test_specificity = tn_test / (tn_test + fp_test)

    print(f"Test Sensitivity: {test_sensitivity:.2f}")
    print(f"Test Specificity: {test_specificity:.2f}")

# --- External validation data ---
if cm_eval.shape != (2, 2):
    print("Confusion matrix is not binary. Cannot compute sensitivity/specificity.")
else:
    (tn_eval, fp_eval), (fn_eval, tp_eval) = cm_eval

    eval_sensitivity = tp_eval / (tp_eval + fn_eval)
    eval_specificity = tn_eval / (tn_eval + fp_eval)

    print(f"External Validation Sensitivity: {eval_sensitivity:.2f}")
    print(f"External Validation Specificity: {eval_specificity:.2f}")