In [None]:
import os

# Directory that the relative CSV paths used later are resolved against;
# edit this value for your own machine.
# NOTE(review): "D:XGB_DS" has no separator after the drive letter, so on
# Windows it is resolved relative to the current directory of drive D.
# Confirm that this is intended rather than "D:/XGB_DS".
work_dir = "D:XGB_DS"
os.chdir(work_dir)

# Echo the directory that is actually in effect so a wrong path is noticed early.
current_dir = os.getcwd()
print("Current Working Directory:", current_dir)

In [None]:
#loading the libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import itertools
from imblearn.over_sampling import BorderlineSMOTE
from collections import Counter
from tensorflow.keras.layers import Input, GRU

In [None]:
# Read the three CSV tables (training, held-out test, external evaluation)
# from the current working directory, in that order.
train_data, test_data, eval_data = (
    pd.read_csv(csv_name)
    for csv_name in ("train_B_data.csv", "test_B_data.csv", "external_eval_B_data.csv")
)

# Raw epitope strings as plain Python lists, one entry per table row.
X_seq_train_raw, X_seq_test_raw, X_seq_eval_raw = (
    frame['Epitopes'].tolist() for frame in (train_data, test_data, eval_data)
)

# Matching target labels as NumPy arrays.
y_train, y_test, y_eval = (
    frame['Label'].values for frame in (train_data, test_data, eval_data)
)

In [None]:
from collections import Counter

# Class counts before any resampling, to show how imbalanced the labels are.
train_label_counts = Counter(y_train)
print("Original label distribution in train data:", train_label_counts)

In [None]:
# === Step 1: Define Blosum62 Matrix ===
# Maps each of the 20 standard amino-acid one-letter codes to its row of
# BLOSUM62 substitution scores. The 20 columns follow the same residue order
# as the keys below (A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V),
# so blosum62[x][i] is the score of x against the i-th key and the table is
# symmetric.
blosum62 = {
    'A':[4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0],
    'R':[-1,5,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3],
    'N':[-2,0,6,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3],
    'D':[-2,-2,1,6,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3],
    'C':[0,-3,-3,-3,9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1],
    'Q':[-1,1,0,0,-3,5,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2],
    'E':[-1,0,0,2,-4,2,5,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2],
    'G':[0,-2,0,-1,-3,-2,-2,6,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3],
    'H':[-2,0,1,-1,-3,0,0,-2,8,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3],
    'I':[-1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3],
    'L':[-1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1],
    'K':[-1,2,0,-1,-3,1,1,-2,-1,-3,-2,5,-1,-3,-1,0,-1,-3,-2,-2],
    'M':[-1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,5,0,-2,-1,-1,-1,-1,1],
    'F':[-2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,6,-4,-2,-2,1,3,-1],
    'P':[-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,7,-1,-1,-4,-3,-2],
    'S':[1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2],
    'T':[0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,5,-2,-2,0],
    'W':[-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,11,2,-3],
    'Y':[-2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,7,-1],
    'V':[0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4]
}  # the closing brace was missing, which made the whole cell a SyntaxError

In [None]:
# Longest epitope in the training set.
epitope_lengths = list(map(len, X_seq_train_raw))
max_length = max(epitope_lengths)
print("Maximum epitope length in training data:", max_length)

# Longest epitope in the held-out test set.
test_lengths = list(map(len, X_seq_test_raw))
max_test_len = max(test_lengths)
print("Maximum epitope length in test data:", max_test_len)

# Longest epitope in the external evaluation set.
eval_lengths = list(map(len, X_seq_eval_raw))
max_eval_len = max(eval_lengths)
print("Maximum epitope length in external eval data:", max_eval_len)

In [None]:
# === Step 2: Encode sequences ===
def blosum_encode_sequence(seq):
    """Encode one sequence as a matrix with one 20-wide score row per residue.

    Each residue is looked up in the module-level ``blosum62`` table; any
    character without an entry is encoded as a row of 20 zeros.

    Args:
        seq: Iterable of single-character residue codes (typically a ``str``).

    Returns:
        numpy.ndarray with one row per residue. An empty ``seq`` yields shape
        ``(0, 20)`` so that callers padding along two axes still work.
    """
    rows = [blosum62.get(aa, [0] * 20) for aa in seq]
    if not rows:
        # np.array([]) would be 1-D with shape (0,), which cannot be padded
        # with a two-axis pad width; keep the feature axis explicit instead.
        return np.zeros((0, 20), dtype=int)
    return np.array(rows)

def encode_dataset(seq_list, max_len):
    """Encode every sequence and bring each result to exactly ``max_len`` rows.

    Sequences shorter than ``max_len`` are zero-padded at the end. Sequences
    longer than ``max_len`` keep only their first ``max_len`` rows; without
    this the pad width would be negative and ``np.pad`` would raise, which
    matters when ``max_len`` is derived from one dataset and applied to another.

    Args:
        seq_list: Iterable of sequences accepted by ``blosum_encode_sequence``.
        max_len: Target number of rows for every encoded sequence.

    Returns:
        numpy.ndarray built by stacking the per-sequence matrices along a new
        leading axis.
    """
    encoded = []
    for seq in seq_list:
        # Truncate after encoding so over-long sequences fit the fixed length.
        encoded_seq = blosum_encode_sequence(seq)[:max_len]
        rows_missing = max_len - len(encoded_seq)
        # Pad only the row axis, at the end; the feature axis is left as is.
        padded = np.pad(encoded_seq, ((0, rows_missing), (0, 0)), mode='constant')
        encoded.append(padded)
    return np.array(encoded)

# The longest training epitope fixes the padded length used for encoding.
max_len = max(map(len, X_seq_train_raw))
X_train_encoded = encode_dataset(seq_list=X_seq_train_raw, max_len=max_len)

In [None]:
# Sanity check: dimensions of the encoded training array.
encoded_train_shape = X_train_encoded.shape
print("Encoded training shape:", encoded_train_shape)

In [None]:
# Inspect the first encoded training example: its shape, then the matrix itself.
first_encoded = X_train_encoded[0]
print("One encoded epitope shape:", first_encoded.shape)
print(first_encoded)

In [None]:
# === Step 1: Flatten the 3D array to 2D for SMOTE ===
# Every sample's matrix is unrolled into a single feature row; the sample
# axis is kept and the remaining axes are merged.
# (The original author noted a resulting shape of (225887, 400) for their data.)
n_train_samples = X_train_encoded.shape[0]
X_train_flat = X_train_encoded.reshape(n_train_samples, -1)

print("Shape before Borderline-SMOTE:", X_train_flat.shape)

In [None]:
# === Step 2: Apply BorderlineSMOTE ===
# Oversample on the flattened features with a fixed seed.
borderline_smote = BorderlineSMOTE(random_state=42, kind='borderline-1')
resampled_pair = borderline_smote.fit_resample(X_train_flat, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Report the class counts and array shape after resampling.
resampled_counts = Counter(y_train_resampled)
print("After Borderline-SMOTE:", resampled_counts)
print("Shape after Borderline-SMOTE:", X_train_resampled.shape)

In [None]:
# Shape of a single resampled (still flattened) training row.
first_resampled_row = X_train_resampled[0]
print(first_resampled_row.shape)

In [None]:
# === Step 3: Reshape back to 3D for RNN input ===
# Restore the per-sample matrix layout of the encoded array; -1 lets NumPy
# infer the new sample count after oversampling.
# (The original author noted (samples, 20, 20) for their data.)
seq_len = X_train_encoded.shape[1]
n_features = X_train_encoded.shape[2]
X_train_balanced = X_train_resampled.reshape(-1, seq_len, n_features)

print("Final balanced input shape:", X_train_balanced.shape)

In [None]:
# Encode the test and external-evaluation sequences with the padded length
# derived from the training data, so all three arrays share one layout.
X_test_encoded, X_eval_encoded = (
    encode_dataset(seqs, max_len) for seqs in (X_seq_test_raw, X_seq_eval_raw)
)

# encode_dataset stacks one padded 2-D matrix per sequence, so both shapes
# should print as 3-D (samples, max_len, features).
print("Test shape :", X_test_encoded.shape)
print("Eval shape :", X_eval_encoded.shape)

In [None]:
# === Step 3: Model Builder ===
def build_model(units=64, dropout_rate=0.3, learning_rate=1e-3, optimizer='adam'):
    """Assemble and compile the stacked-GRU binary classifier.

    The input shape is read from the module-level ``X_train_balanced`` array
    (its second and third axes), so that array must exist before calling.

    Args:
        units: Width of both GRU layers and of the dense hidden layer.
        dropout_rate: Dropout applied after the dense hidden layer.
        learning_rate: Learning rate passed to the optimizer.
        optimizer: Optimizer name, compared case-insensitively. ``'adam'``
            selects Adam; any other value selects RMSprop.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output, binary
        cross-entropy loss, and accuracy as its metric.
    """
    n_steps, n_features = X_train_balanced.shape[1], X_train_balanced.shape[2]

    inputs = Input(shape=(n_steps, n_features))
    # The first GRU returns the full sequence so the second GRU can consume it.
    hidden = GRU(units, return_sequences=True)(inputs)
    hidden = GRU(units)(hidden)
    hidden = Dense(units, activation='relu')(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    outputs = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=inputs, outputs=outputs)

    if optimizer.lower() == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate)
    else:
        opt = tf.keras.optimizers.RMSprop(learning_rate)

    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Grid definition
param_grid = {
    'units': [16, 32, 64, 128],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [1e-3, 5e-4],
    'optimizer': ['Adam', 'RMSprop'],
    'batch_size': [32, 64]
}

# Keras takes `validation_split` from the LAST rows of the arrays it is given,
# before any shuffling. The resampled arrays are never shuffled in this
# notebook, and the oversampler is expected to append its synthetic minority
# rows after the original rows, so the unshuffled tail would be a heavily
# skewed validation set. Shuffle once, with a fixed seed, so that every
# combination is scored on the same representative 20% hold-out.
# NOTE: this makes one shuffled copy of the training arrays in memory.
shuffle_rng = np.random.default_rng(42)
search_order = shuffle_rng.permutation(len(y_train_resampled))
X_search = X_train_balanced[search_order]
y_search = y_train_resampled[search_order]

best_val_acc = 0
best_params = None

# Grid search loop: train one fresh model per hyperparameter combination.
for params in itertools.product(*param_grid.values()):
    combo = dict(zip(param_grid.keys(), params))
    # batch_size is a fit() argument, not a build_model() argument.
    build_args = {k: v for k, v in combo.items() if k != 'batch_size'}

    model = build_model(**build_args)
    history = model.fit(
        X_search, y_search,
        epochs=20,
        batch_size=combo['batch_size'],
        validation_split=0.2,
        verbose=2,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
    )

    # Score a combination by the best validation accuracy seen in any epoch.
    val_acc = max(history.history['val_accuracy'])
    print(f"Tested combo {combo} — max val_accuracy: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_params = combo

print("\n✅ Best Hyperparameters Found:", best_params)

In [None]:
# Split the winning configuration into the fit-time batch size and the
# arguments that build_model accepts, then build a fresh model from them.
best_batch_size = best_params['batch_size']
build_args = {name: value for name, value in best_params.items() if name != 'batch_size'}
final_model = build_model(**build_args)

# Retrain on the full balanced training data. There is no validation split
# here, so early stopping watches the training loss instead.
loss_plateau_stop = tf.keras.callbacks.EarlyStopping(
    monitor='loss', patience=20, restore_best_weights=True
)
final_model.fit(
    X_train_balanced,
    y_train_resampled,
    epochs=300,
    batch_size=best_batch_size,
    verbose=2,
    callbacks=[loss_plateau_stop],
)

# Persist the trained model in the current working directory.
final_model.save('best_model_gru_blosum.keras')
print("\n Final trained model saved as 'best_model_gru_blosum.keras'")

In [None]:
# ==========================
# PREDICTION + EVALUATION
# ==========================

def evaluate_metrics(y_true, y_probs, dataset_name):
    """Print binary-classification metrics for one dataset.

    Probabilities are thresholded at 0.5 to obtain hard predictions, then
    accuracy, F1, recall, specificity, precision, ROC AUC and the confusion
    matrix are printed.

    Args:
        y_true: Ground-truth labels; assumed to be encoded as 0/1 (TODO confirm
            against the label column of the input data).
        y_probs: Predicted positive-class probabilities, one per sample.
        dataset_name: Heading printed above the metrics.

    Returns:
        None. The function only prints.
    """
    # Accept lists as well as arrays, and flatten column vectors.
    y_true = np.asarray(y_true).ravel()
    y_probs = np.asarray(y_probs).ravel()
    y_pred = (y_probs > 0.5).astype("int32")

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)

    # ROC AUC is undefined when only one class is present in y_true; report
    # NaN in that case rather than letting the whole evaluation fail.
    if len(np.unique(y_true)) > 1:
        roc_auc = roc_auc_score(y_true, y_probs)
    else:
        roc_auc = float("nan")

    # Fixing the label set guarantees a 2x2 matrix even if a class is absent
    # from both y_true and y_pred, so the four-way unpacking cannot fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    print(f"\n📊 {dataset_name}")
    print(f"Accuracy    : {acc:.4f}")
    print(f"F1 Score    : {f1:.4f}")
    print(f"Recall      : {recall:.4f}")
    print(f"Specificity : {specificity:.4f}")
    print(f"Precision   : {precision:.4f}")
    print(f"ROC AUC     : {roc_auc:.4f}")
    print("Confusion Matrix:\n", cm)

# --- Predict on Test and External Validation Sets ---
# predict() output is flattened to one probability per sample.
test_scores = final_model.predict(X_test_encoded)
y_test_probs = test_scores.flatten()
eval_scores = final_model.predict(X_eval_encoded)
y_eval_probs = eval_scores.flatten()

# --- Evaluate ---
for true_labels, probs, title in (
    (y_test, y_test_probs, "Test Set"),
    (y_eval, y_eval_probs, "External Validation Set"),
):
    evaluate_metrics(true_labels, probs, title)

In [None]:
# ==========================
# 10-FOLD CROSS-VALIDATION ON FINAL MODEL
# ==========================

print("\n✅ Performing 10-Fold Cross-Validation on Final Model...")
cv_scores = []
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Folds are drawn from the ORIGINAL (pre-oversampling) training data, and the
# oversampler is fitted inside each fold on the training part only. Splitting
# the already-oversampled arrays would put synthetic points interpolated from
# training rows into the validation fold, which leaks information and
# inflates the reported accuracy.
# NOTE: validation folds now keep the original class distribution, so the
# accuracies are not comparable with numbers obtained on balanced folds.
seq_len, n_features = X_train_encoded.shape[1], X_train_encoded.shape[2]

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_encoded, y_train), 1):
    # Oversample the training portion only (flattened to 2-D for SMOTE).
    fold_smote = BorderlineSMOTE(kind='borderline-1', random_state=42)
    X_fold_flat, y_train_cv = fold_smote.fit_resample(
        X_train_encoded[train_idx].reshape(len(train_idx), -1),
        y_train[train_idx],
    )
    X_train_cv = X_fold_flat.reshape(-1, seq_len, n_features)

    # The validation portion stays untouched, real data.
    X_val_cv, y_val_cv = X_train_encoded[val_idx], y_train[val_idx]

    # A fresh model per fold, built with the selected hyperparameters.
    model_cv = build_model(**build_args)
    model_cv.fit(
        X_train_cv, y_train_cv,
        epochs=30,
        batch_size=best_batch_size,
        verbose=2,
        validation_data=(X_val_cv, y_val_cv),
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
    )

    val_loss, val_acc = model_cv.evaluate(X_val_cv, y_val_cv, verbose=0)
    cv_scores.append(val_acc)
    print(f"Fold {fold}: Validation Accuracy = {val_acc:.4f}")

print("\n✅ 10-Fold Cross-Validation Complete")
print("Fold Accuracies:", cv_scores)
print("Mean Validation Accuracy:", np.mean(cv_scores))
print("Standard Deviation:", np.std(cv_scores))