In [None]:
!pip install keras==3.2.0 tensorflow-intel==2.17.0 scikeras==0.13.0
!pip show keras tensorflow-intel scikeras
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(keras.__version__)

In [None]:
!pip show keras
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")

In [None]:
import pandas as pd

# Load the training table from CSV. Later cells read column 0 as the epitope
# sequence, column 1 as the label and columns 2 onward as numeric features.
train_data = pd.read_csv("train_Boarderline_smote_B_data.csv")

# Print the first rows as a quick sanity check of the parsed columns.
print(train_data.head())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Column 0 holds the epitope sequences; pull them out as an array.
sequences = train_data.iloc[:, 0].values

# Learn one integer id per distinct sequence, then map every row to its id.
label_encoder = LabelEncoder().fit(sequences)
X_seq = label_encoder.transform(sequences)

# Sanity-check the encoding: first few ids and the number of distinct sequences.
print(f"First 10 Encoded Sequences: {X_seq[:10]}")
print(f"Number of Unique Sequences: {len(label_encoder.classes_)}")

In [None]:
import numpy as np

# Turn the vector of sequence ids into a single-column matrix so it can be
# joined column-wise with the numeric features.
X_seq = X_seq.reshape(-1, 1)

# Numeric features: every column from positional index 2 (the third column) on.
X_numeric = train_data.iloc[:, 2:].values

# Sequence-id column first, numeric feature columns after it.
X_combined = np.concatenate((X_seq, X_numeric), axis=1)

# Report the shape of the merged feature matrix.
print(f"Combined feature shape: {X_combined.shape}")

In [None]:
import pandas as pd

# Read the three splits in order: training, test, external evaluation.
train_data, test_data, eval_data = map(
    pd.read_csv,
    (
        "train_Boarderline_smote_B_data.csv",
        "test_B_data.csv",
        "external_eval_B_data.csv",
    ),
)

# Report (rows, columns) for each split.
print(f"Train Data Shape: {train_data.shape}")
print(f"Test Data Shape: {test_data.shape}")
print(f"Evaluation Data Shape: {eval_data.shape}")

In [None]:
# Step 2: Preprocess the data. Encode the epitope sequences (first column of each
# dataset) as integers with LabelEncoder, then join them with the numeric features.
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Pool the sequences of all three splits so that every sequence, including those
# that occur only in the test/evaluation data, gets an id; LabelEncoder.transform
# raises on labels it has not seen during fit.
all_sequences = pd.concat([train_data.iloc[:, 0], test_data.iloc[:, 0], eval_data.iloc[:, 0]])

# Fit a single encoder on the pooled sequences.
label_encoder = LabelEncoder()
label_encoder.fit(all_sequences)

# Encode each split and shape the ids as one column for column-wise stacking.
X_train_seq = label_encoder.transform(train_data.iloc[:, 0]).reshape(-1, 1)  # Train sequences
X_test_seq = label_encoder.transform(test_data.iloc[:, 0]).reshape(-1, 1)    # Test sequences
X_eval_seq = label_encoder.transform(eval_data.iloc[:, 0]).reshape(-1, 1)    # Eval sequences

# Verify encoding
print(f"Number of unique sequences (vocabulary size): {len(label_encoder.classes_)}")

# Numeric features: every column from positional index 2 (the third column) on.
X_train_numeric = train_data.iloc[:, 2:].values
X_test_numeric = test_data.iloc[:, 2:].values
X_eval_numeric = eval_data.iloc[:, 2:].values

# Combine: sequence-id column first, numeric feature columns after it.
X_train = np.hstack((X_train_seq, X_train_numeric))
X_test = np.hstack((X_test_seq, X_test_numeric))
X_eval = np.hstack((X_eval_seq, X_eval_numeric))

# All splits must expose the same number of feature columns to share one model.
assert X_train.shape[1] == X_test.shape[1] == X_eval.shape[1], (
    "train/test/eval feature matrices have different column counts"
)

print(f"Train Feature Shape: {X_train.shape}")
print(f"Test Feature Shape: {X_test.shape}")
print(f"Eval Feature Shape: {X_eval.shape}")

In [None]:
##Step 1: Define the Model Function for Hyperparameter Tuning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def create_model(learning_rate=0.001, filters=32, kernel_size=3, dense_units=128, dropout_rate=0.3):
    """Build and compile an Embedding + Conv1D classifier with a 2-unit softmax head.

    The embedding vocabulary size is read from the module-level name
    ``vocab_size``, which must be defined before this function is called.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        filters: Number of filters in the Conv1D layer.
        kernel_size: Kernel width of the Conv1D layer.
        dense_units: Units in the first Dense layer.
        dropout_rate: Rate of the Dropout layer that follows the first Dense layer.

    Returns:
        The compiled ``Sequential`` model.

    NOTE(review): the training matrix built elsewhere in this notebook stacks the
    numeric feature columns next to the sequence id, and an Embedding layer treats
    every input column as an integer index. Confirm that passing the numeric
    columns through the embedding is intended.
    """
    model = Sequential([
        # `input_length` is deprecated in Keras 3, so it is omitted here and the
        # sequence length comes from the data the model is called on.
        Embedding(input_dim=vocab_size, output_dim=50),  # Embedding Layer
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dense(2, activation='softmax')  # Two-class output
    ])

    # The targets used with this model are integer class labels (0 or 1), not
    # one-hot rows, so the sparse variant of categorical cross-entropy is needed
    # to pair them with the 2-unit softmax output.
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Prerequisite for the grid search: rebuild the training matrix with the
# sequence-id column first and the numeric feature columns after it.
X_train = np.concatenate(
    (X_train_seq.reshape(-1, 1), train_data.iloc[:, 2:].values),
    axis=1,
)

# Targets stay as integer class labels (0 or 1) taken from the second column;
# they are not one-hot encoded.
y_train = train_data.iloc[:, 1].values

In [None]:
# Revised create_model: vocab_size is passed in explicitly instead of being read
# from a module-level variable.
def create_model(vocab_size, learning_rate=0.001, filters=32, kernel_size=3, dense_units=128, dropout_rate=0.3):
    """Build and compile an Embedding + Conv1D classifier with a 2-unit softmax head.

    Args:
        vocab_size: Number of distinct integer ids the Embedding layer accepts
            (used as its ``input_dim``).
        learning_rate: Learning rate passed to the Adam optimizer.
        filters: Number of filters in the Conv1D layer.
        kernel_size: Kernel width of the Conv1D layer.
        dense_units: Units in the first Dense layer.
        dropout_rate: Rate of the Dropout layer that follows the first Dense layer.

    Returns:
        The compiled ``Sequential`` model.

    NOTE(review): the training matrix built elsewhere in this notebook stacks the
    numeric feature columns next to the sequence id, and an Embedding layer treats
    every input column as an integer index. Confirm that passing the numeric
    columns through the embedding is intended.
    """
    model = Sequential([
        # `input_length` is deprecated in Keras 3, so it is omitted here and the
        # sequence length comes from the data the model is called on.
        Embedding(input_dim=vocab_size, output_dim=50),  # Embedding Layer
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dense(2, activation='softmax')  # Two-class output
    ])

    # The targets used with this model are integer class labels (0 or 1), not
    # one-hot rows, so the sparse variant of categorical cross-entropy is needed
    # to pair them with the 2-unit softmax output.
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
from scikeras.wrappers import KerasClassifier
#Step 2: Pass vocab_size in KerasClassifier
# Vocabulary size = number of distinct sequences known to the fitted encoder.
vocab_size = len(label_encoder.classes_)  # Ensure this is defined

# Wrap the model factory as a scikit-learn style estimator. vocab_size is handed
# to create_model via the `model__` prefix. The epochs/batch_size given here are
# starting values; the grid-search cell supplies its own values for both.
keras_clf = KerasClassifier(model=create_model, model__vocab_size=vocab_size, epochs=10, batch_size=32, verbose=0)


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space. `model__*` entries are forwarded to create_model; `batch_size`
# and `epochs` are training parameters of the wrapper itself.
param_grid = dict(
    model__learning_rate=[0.001, 0.01],
    model__filters=[32, 64],
    model__kernel_size=[3, 5],
    model__dense_units=[128, 256],
    model__dropout_rate=[0.3, 0.5],
    batch_size=[32, 64],
    epochs=[20],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# accuracy and run in a single process.
grid_search = GridSearchCV(
    estimator=keras_clf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_:.2f}")

In [None]:
# Steps to use the preprocessed data with a simple DCNN
# Model 1
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the model: two Conv1D/MaxPooling1D stages followed by dense layers and
# a single sigmoid unit.
model = Sequential([
    # Declare the input with an explicit Input layer (preferred by Keras 3 over
    # passing `input_shape` to the first layer). Conv1D expects 3D input:
    # 767 timesteps with 1 channel per sample.
    Input(shape=(767, 1)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),  # Reduce dimensionality
    Conv1D(filters=32, kernel_size=3, activation='relu'),  # Another Conv1D layer
    MaxPooling1D(pool_size=2),
    Flatten(),  # Flatten to feed into Dense layers
    Dense(128, activation='relu'),  # Fully connected layer
    Dropout(0.3),  # Regularization
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',  # Binary classification
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line.
model.summary()

In [None]:
# Report the current shape and total element count of the training features.
# The trailing comments record the values the author expected to see.
print(f"X_train shape: {X_train.shape}")  # Should be (448826, 767)
print(f"X_train total size: {X_train.size}")  # Should be 448826 * 767

In [None]:
# Inspect the two pieces that make up the training feature matrix.
print(f"X_train_seq shape: {X_train_seq.shape}")  # Expected (448826, 1)
print(f"X_train_numeric shape: {X_train_numeric.shape}")  # Expected (448826, 766)

# Rebuild the 2-D feature matrices: sequence-id column first, numeric columns after.
X_train = np.concatenate((X_train_seq, X_train_numeric), axis=1)
X_test = np.concatenate((X_test_seq, X_test_numeric), axis=1)
X_eval = np.concatenate((X_eval_seq, X_eval_numeric), axis=1)

print(f"Combined X_train shape: {X_train.shape}")  # Should be (448826, 767)

In [None]:
# Reshape data for Conv1D: (samples, features) -> (samples, timesteps, 1), i.e.
# one timestep per feature column and a single channel.
# The timestep count is read from the training matrix rather than hard-coded: a
# fixed `reshape(-1, 767, 1)` would silently regroup values across rows if a
# split ever had a different column count whose total size happened to divide.
n_timesteps = X_train.shape[1]
assert X_test.shape[1] == n_timesteps and X_eval.shape[1] == n_timesteps, (
    "train/test/eval feature matrices have different column counts"
)

# Row counts are kept explicit so re-running this cell on already reshaped
# arrays leaves them unchanged.
X_train = X_train.reshape(X_train.shape[0], n_timesteps, 1)
X_test = X_test.reshape(X_test.shape[0], n_timesteps, 1)
X_eval = X_eval.reshape(X_eval.shape[0], n_timesteps, 1)

# The models in this notebook declare an input of (767, 1); a different
# n_timesteps will surface as a shape error when fitting.
print(f"Reshaped X_train shape: {X_train.shape}")  # Should be (448826, 767, 1)
print(f"Reshaped X_test shape: {X_test.shape}")    # Should be (83649, 767, 1)
print(f"Reshaped X_eval shape: {X_eval.shape}")    # Should be (83649, 767, 1)

In [None]:
# The second column (positional index 1) of every split holds the binary label;
# pull it out of the train, test and evaluation frames in that order.
y_train, y_test, y_eval = (
    frame.iloc[:, 1].values for frame in (train_data, test_data, eval_data)
)

# Report the label vector lengths.
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"y_eval shape: {y_eval.shape}")


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping to prevent overfitting: monitor the validation loss with a
# patience of 5 epochs and restore the best weights when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
# NOTE(review): the test split is passed as validation_data, so early stopping
# and best-weight selection are driven by the same data that is later reported
# as "Test Accuracy"; that figure may be optimistic. Confirm this is intended.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=64,  # Adjust batch size as needed
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Score the test split; the model was compiled with one metric, so evaluate()
# yields the loss followed by the accuracy.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Score the external evaluation split the same way.
eval_loss, eval_accuracy = model.evaluate(x=X_eval, y=y_eval, verbose=0)
print(f"External Validation Accuracy: {eval_accuracy * 100:.2f}%")


In [None]:
# Model 2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the model: two Conv1D/MaxPooling1D stages followed by three 128-unit
# dense layers (dropout after the first two) and a single sigmoid unit.
model = Sequential([
    # Declare the input with an explicit Input layer (preferred by Keras 3 over
    # passing `input_shape` to the first layer). Conv1D expects 3D input:
    # 767 timesteps with 1 channel per sample.
    Input(shape=(767, 1)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),  # Reduce dimensionality
    Conv1D(filters=32, kernel_size=3, activation='relu'),  # Another Conv1D layer
    MaxPooling1D(pool_size=2),
    Flatten(),  # Flatten to feed into Dense layers
    Dense(128, activation='relu'),  # Fully connected layer
    Dropout(0.3),  # Regularization
    Dense(128, activation='relu'),
    Dropout(0.3),  # Regularization
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',  # Binary classification
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line.
model.summary()

# Check sequence and numeric features
print(f"X_train_seq shape: {X_train_seq.shape}")  # Expected (448826, 1)
print(f"X_train_numeric shape: {X_train_numeric.shape}")  # Expected (448826, 766)

# Combine sequence and numeric features
X_train = np.hstack((X_train_seq, X_train_numeric))
X_test = np.hstack((X_test_seq, X_test_numeric))
X_eval = np.hstack((X_eval_seq, X_eval_numeric))

print(f"Combined X_train shape: {X_train.shape}")  # Should be (448826, 767)
# Reshape data for Conv1D
X_train = X_train.reshape(-1, 767, 1)  # Each sample has 767 timesteps, 1 feature per timestep
X_test = X_test.reshape(-1, 767, 1)
X_eval = X_eval.reshape(-1, 767, 1)

print(f"Reshaped X_train shape: {X_train.shape}")  # Should be (448826, 767, 1)
print(f"Reshaped X_test shape: {X_test.shape}")    # Should be (83649, 767, 1)
print(f"Reshaped X_eval shape: {X_eval.shape}")    # Should be (83649, 767, 1)

# Extract labels (second column) from each dataset
y_train = train_data.iloc[:, 1].values  # Binary labels for train data
y_test = test_data.iloc[:, 1].values    # Binary labels for test data
y_eval = eval_data.iloc[:, 1].values    # Binary labels for eval data

print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"y_eval shape: {y_eval.shape}")

from tensorflow.keras.callbacks import EarlyStopping

# Early stopping to prevent overfitting: monitor the validation loss with a
# patience of 10 epochs and restore the best weights when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
# NOTE(review): the test split is passed as validation_data, so early stopping
# and best-weight selection are driven by the same data that is reported below
# as "Test Accuracy"; that figure may be optimistic. Confirm this is intended.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=64,  # Adjust batch size as needed
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Evaluate on external evaluation data
eval_loss, eval_accuracy = model.evaluate(X_eval, y_eval, verbose=0)
print(f"External Validation Accuracy: {eval_accuracy * 100:.2f}%")

In [None]:
# Save the trained model in HDF5 format
# NOTE(review): Keras 3 treats `.h5` as a legacy format and recommends the
# native `.keras` extension; check downstream loaders before switching.
model.save("dcnn_epitope_classifier_bcell_model2.h5")
print("Model saved as 'dcnn_epitope_classifier_bcell_model2.h5'")

In [None]:
from sklearn.metrics import confusion_matrix

# Flatten the sigmoid outputs and threshold them at 0.5 to get hard 0/1 labels.
y_test_pred = (model.predict(X_test).ravel() > 0.5).astype(int)
y_eval_pred = (model.predict(X_eval).ravel() > 0.5).astype(int)

# Flattened 2x2 confusion matrices unpack as (TN, FP, FN, TP).
tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_pred).flatten()
tn_eval, fp_eval, fn_eval, tp_eval = confusion_matrix(y_eval, y_eval_pred).flatten()

# Test set: sensitivity = TP / (TP + FN), specificity = TN / (TN + FP).
sensitivity_test = tp_test / (tp_test + fn_test)
specificity_test = tn_test / (tn_test + fp_test)
print(f"Test Sensitivity: {sensitivity_test:.2f}")
print(f"Test Specificity: {specificity_test:.2f}")

# External evaluation set: same two rates.
sensitivity_eval = tp_eval / (tp_eval + fn_eval)
specificity_eval = tn_eval / (tn_eval + fp_eval)
print(f"Validation Sensitivity: {sensitivity_eval:.2f}")
print(f"Validation Specificity: {specificity_eval:.2f}")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the test split.
print("\nTest Data - Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_test_pred))

# Same report on the external evaluation split.
print("\nValidation Data - Classification Report:")
print(classification_report(y_true=y_eval, y_pred=y_eval_pred))

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Continuous scores (flattened sigmoid outputs) are what the ROC curve needs.
y_test_prob = model.predict(X_test).ravel()  # Test probabilities
y_eval_prob = model.predict(X_eval).ravel()  # Validation probabilities

# ROC points and area under the curve for each split.
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_prob)
roc_auc_test = auc(fpr_test, tpr_test)

fpr_eval, tpr_eval, thresholds_eval = roc_curve(y_eval, y_eval_prob)
roc_auc_eval = auc(fpr_eval, tpr_eval)

# Dump the raw curve values for the record.
print("Test Data - ROC Curve Values")
print("False Positive Rate (FPR):", fpr_test)
print("True Positive Rate (TPR):", tpr_test)
print("Thresholds:", thresholds_test)
print(f"ROC AUC: {roc_auc_test:.2f}\n")

print("Validation Data - ROC Curve Values")
print("False Positive Rate (FPR):", fpr_eval)
print("True Positive Rate (TPR):", tpr_eval)
print("Thresholds:", thresholds_eval)
print(f"ROC AUC: {roc_auc_eval:.2f}\n")

# One figure with two side-by-side panels: test on the left, validation on the right.
_roc_fig, (_ax_test, _ax_eval) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: test ROC against the chance diagonal.
_ax_test.plot(fpr_test, tpr_test, color='blue', lw=2, label=f'ROC AUC = {roc_auc_test:.2f}')
_ax_test.plot([0, 1], [0, 1], color='black', linestyle='--', label='Random Classifier (AUC = 0.50)')
_ax_test.set_title('Test Data - ROC Curve')
_ax_test.set_xlabel('False Positive Rate')
_ax_test.set_ylabel('True Positive Rate')
_ax_test.legend(loc='lower right')

# Right panel: validation ROC against the chance diagonal.
_ax_eval.plot(fpr_eval, tpr_eval, color='green', lw=2, label=f'ROC AUC = {roc_auc_eval:.2f}')
_ax_eval.plot([0, 1], [0, 1], color='black', linestyle='--', label='Random Classifier (AUC = 0.50)')
_ax_eval.set_title('Validation Data - ROC Curve')
_ax_eval.set_xlabel('False Positive Rate')
_ax_eval.set_ylabel('True Positive Rate')
_ax_eval.legend(loc='lower right')

# Tidy the layout, write the figure to disk, then display it.
_roc_fig.tight_layout()
_roc_fig.savefig("test_validation_roc_auc_bcell.png", dpi=500)
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Test split: ROC points from the predicted probabilities, then the area under them.
fpr_test, tpr_test, _ = roc_curve(y_true=y_test, y_score=y_test_prob)
roc_auc_test = auc(x=fpr_test, y=tpr_test)

# External validation split: same computation.
fpr_eval, tpr_eval, _ = roc_curve(y_true=y_eval, y_score=y_eval_prob)
roc_auc_eval = auc(x=fpr_eval, y=tpr_eval)

print(f"Test ROC AUC: {roc_auc_test:.2f}")
print(f"External Validation ROC AUC: {roc_auc_eval:.2f}")


In [None]:
from sklearn.metrics import confusion_matrix

# Flattened 2x2 confusion matrices unpack as (TN, FP, FN, TP).
tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_pred).flatten()  # test split
tn_eval, fp_eval, fn_eval, tp_eval = confusion_matrix(y_eval, y_eval_pred).flatten()  # evaluation split

# Print the four counts of each matrix for verification.
print("Test Confusion Matrix:")
print(f"TN: {tn_test}, FP: {fp_test}, FN: {fn_test}, TP: {tp_test}")

print("\nExternal Validation Confusion Matrix:")
print(f"TN: {tn_eval}, FP: {fp_eval}, FN: {fn_eval}, TP: {tp_eval}")

In [None]:
# Sensitivity = TP / (TP + FN) and specificity = TN / (TN + FP), computed for
# both splits up front and reported afterwards.
sensitivity_test = tp_test / (tp_test + fn_test)
specificity_test = tn_test / (tn_test + fp_test)
sensitivity_eval = tp_eval / (tp_eval + fn_eval)
specificity_eval = tn_eval / (tn_eval + fp_eval)

# Test split.
print(f"Test Sensitivity: {sensitivity_test:.2f}")
print(f"Test Specificity: {specificity_test:.2f}")

# External validation split.
print(f"External Validation Sensitivity: {sensitivity_eval:.2f}")
print(f"External Validation Specificity: {specificity_eval:.2f}")

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam  # needed to compile each fold's model

# 10-fold stratified cross-validation on the training data, collecting per-fold
# accuracy and ROC curves and averaging the curves on a common FPR grid.
# NOTE(review): the training file name suggests it was oversampled (Borderline
# SMOTE) before this split; if so, synthetic rows derived from a held-out fold
# can sit in the training folds and the scores may be optimistic. Confirm.

# Initialize Stratified K-Fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store metrics
accuracies = []
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid for averaging the fold curves

# Cross-validation loop
for i, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    print(f"Training fold {i+1}...")

    # Clone the architecture of `model` so every fold has its own model and
    # does not share training state with `model` or with other folds.
    fold_model = clone_model(model)
    fold_model.compile(optimizer=Adam(learning_rate=0.0001),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

    # Train on the training fold
    fold_model.fit(X_train[train_idx], y_train[train_idx], epochs=20, batch_size=64, verbose=0)

    # Evaluate on the held-out fold
    y_prob = fold_model.predict(X_train[test_idx]).ravel()
    y_pred = (y_prob > 0.5).astype(int)
    acc = accuracy_score(y_train[test_idx], y_pred)
    accuracies.append(acc)

    # Compute ROC curve and AUC for this fold
    fpr, tpr, _ = roc_curve(y_train[test_idx], y_prob)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Resample the fold curve onto the common grid. A ROC curve can have several
    # points at FPR == 0, so pin the first sample to 0 to make the averaged
    # curve start at the origin.
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)

    plt.plot(fpr, tpr, lw=1, alpha=0.3, label=f'Fold {i+1} (AUC = {roc_auc:.2f})')

# Plot the chance diagonal and the mean ROC curve
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # the averaged curve must end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, label=f'Mean ROC (AUC = {mean_auc:.2f})', lw=2)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for 10-Fold Cross-Validation')
plt.legend(loc="lower right")
plt.savefig("cv_roc_curve_dcnn_bcell.png", dpi=500)
plt.show()

# Print mean accuracy and AUC. "Mean AUC" is the area under the averaged curve;
# the mean and spread of the individual fold AUCs are reported separately.
print(f"Mean 10-Fold Accuracy: {np.mean(accuracies):.4f}")
print(f"Mean AUC: {mean_auc:.4f}")
print(f"Mean of per-fold AUCs: {np.mean(aucs):.4f} (std {np.std(aucs):.4f})")

In [None]:
# Save cross-validation accuracies to a text file. The path lives in a single
# variable so the confirmation message always names the file actually written
# (the message previously named a different file than the one opened).
cv_accuracy_path = 'cv_accuracies_dcnn.txt'
with open(cv_accuracy_path, 'w') as f:
    f.write("Cross-Validation Accuracies for each fold:\n")
    # One line per fold, numbered from 1.
    for i, acc in enumerate(accuracies):
        f.write(f"Fold {i+1} Accuracy: {acc:.4f}\n")
    f.write(f"\nMean 10-Fold Accuracy: {np.mean(accuracies):.4f}\n")

print(f"Accuracy of each fold has been saved to '{cv_accuracy_path}'")