In [None]:
# # Deep Learning Model for Cancer vs. Healthy Detection Using Features Extracted from cfDNA Fragment Size 

# ## 1. Setup and Imports

# This section imports the required libraries, fixes the random seeds for reproducibility, and defines the output folder and log file path.

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import BatchNormalization, Activation, Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from collections import Counter
from sklearn.calibration import calibration_curve
import umap
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import random

# Fix the seed of every random number generator used in this notebook
# (NumPy, TensorFlow and Python's `random`) to the same value.
for _seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    _seed_fn(42)

# Folder that receives the plots and logs (DELFI-based results); it is
# created here if it does not exist yet.
output_folder = 'results/temp'
os.makedirs(output_folder, exist_ok=True)

# Path of the DELFI-specific log file that the training loop writes its
# metrics to.
log_file_path = os.path.join(output_folder, 'prediction_results_all.txt')


In [None]:
# ## 2. Model Architecture

# This function defines the architecture of the deep neural network. It includes Dense layers, Batch Normalization, ReLU activation, and Dropout for regularization. L2 regularization is also applied to the Dense layers.

def build_model(input_dim):
    """
    Create and compile the feed-forward network used for binary classification.

    The model is assembled with the Keras functional API. It has three hidden
    stages of 512, 256 and 128 units; each stage is an L2-regularised Dense
    layer (factor 0.013) followed by Batch Normalization, a ReLU activation
    and Dropout (rate 0.4). A single sigmoid unit produces the output.

    Args:
        input_dim (int): The number of input features.

    Returns:
        tf.keras.Model: The compiled model (Adam with learning rate 0.0001,
        binary cross-entropy loss, metrics 'AUC' and 'binary_accuracy').
    """
    feature_input = Input(shape=(input_dim,))

    # Hidden stages, widest first: Dense -> BatchNorm -> ReLU -> Dropout.
    hidden = feature_input
    for width in (512, 256, 128):
        hidden = Dense(width, kernel_regularizer=l2(0.013))(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Activation('relu')(hidden)
        hidden = Dropout(0.4)(hidden)

    # One sigmoid unit: the output is a probability for the positive class.
    probability = Dense(1, activation='sigmoid')(hidden)
    network = Model(inputs=feature_input, outputs=probability)

    # The metric order matters: evaluate_and_log unpacks the result of
    # model.evaluate() as (loss, AUC, accuracy).
    network.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['AUC', 'binary_accuracy'],
    )
    return network

In [None]:
# ## 3. Helper Functions for Evaluation and Plotting

# This section defines functions to evaluate the model's performance and to plot confusion matrices.

def evaluate_and_log(model, X, y, set_name, log_file):
    """
    Evaluates the model's performance on a given dataset and logs the results.

    Loss, AUC and accuracy come from `model.evaluate`; the F1 score is computed
    from class labels obtained by thresholding the predicted probabilities at
    0.5. The four metrics are printed and appended to `log_file` as
    tab-separated rows of the form ``seed<TAB>set_name<TAB>metric<TAB>value``.

    Args:
        model (tf.keras.Model): The trained Keras model. Its `evaluate` call
            must return exactly three values in the order loss, AUC, accuracy
            (the order `build_model` compiles the metrics in).
        X (np.array): Features of the dataset.
        y (np.array): True labels of the dataset.
        set_name (str): Name of the dataset (e.g., "Train", "Validation", "Test").
        log_file (file object): File object to write the logs to. If it carries
            a `seed` attribute, that value is written in the first column;
            otherwise 'N/A' is written.

    Returns:
        tuple: A tuple containing (y_true, y_pred, y_pred_proba, loss, auc, acc, f1_score),
        where y_pred is an integer array of 0/1 labels.
    """
    loss, auc, acc = model.evaluate(X, y, verbose=0)

    # Silence predict() as well: evaluate() above is already quiet, and the
    # default setting prints a progress bar for every evaluated set.
    y_pred_proba = model.predict(X, verbose=0).flatten()

    # Explicit decision threshold with integer labels. This yields the same
    # labels as rounding half-to-even (exactly 0.5 maps to class 0) but does
    # not hand float "labels" to the metric and plotting functions.
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate F1 score
    f1 = f1_score(y, y_pred)

    print(f"--- {set_name} Results ---")
    print(f"Loss: {loss:.4f}, AUC: {auc:.4f}, Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

    # Log results: one row per metric. The seed is read from an attribute the
    # caller attaches to the file object.
    seed_val = getattr(log_file, 'seed', 'N/A')
    for metric_name, metric_value in (
        ("Loss", loss),
        ("AUC", auc),
        ("Accuracy", acc),
        ("F1_Score", f1),
    ):
        log_file.write(f"{seed_val}\t{set_name}\t{metric_name}\t{metric_value:.4f}\n")

    return y, y_pred, y_pred_proba, loss, auc, acc, f1 # Return metrics as well


def plot_confusion_matrix(y_true, y_pred, labels, title, filename):
    """
    Plots and saves a confusion matrix.

    The matrix is computed from `y_true` and `y_pred`, drawn on a new 8x8 inch
    figure with a blue colour map and a colour bar, and written to `filename`.
    The figure is always closed afterwards, also when plotting or saving
    fails, so repeated calls in a loop do not accumulate open figures.

    Args:
        y_true (np.array): True labels.
        y_pred (np.array): Predicted labels.
        labels (list): List of class labels (e.g., ['Negative', 'Positive']),
            used only as tick labels of the plot.
        title (str): Title of the plot.
        filename (str): Path to save the plot.
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

    # Create figure and axes explicitly and keep working on these handles
    # instead of relying on pyplot's implicit "current figure".
    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        disp.plot(cmap=plt.cm.Blues, ax=ax, text_kw={'fontsize': 14}, colorbar=True)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Predicted Label', fontsize=14)
        ax.set_ylabel('True Label', fontsize=14)
        if len(labels) == 1:
            # Single-class case: place the one tick explicitly on both axes.
            ax.set_xticks([0])
            ax.set_xticklabels(labels, fontsize=12)
            ax.set_yticks([0])
            ax.set_yticklabels(labels, fontsize=12)
        else:
            ax.tick_params(axis='both', which='major', labelsize=12)
        fig.savefig(filename)
    finally:
        # Release the figure even if plotting or saving raised.
        plt.close(fig)

In [None]:
# ## 4. Data Loading and Preprocessing

# This section loads the sample data from `review_data_sample.csv`, separates features and labels, and then prepares lists to store evaluation metrics across multiple runs.

# Ensure `review_data_sample.csv` is in the same directory as this notebook.
# If not, please provide the correct path to the file.

# Load the sample data
try:
    sample_data = pd.read_csv('review_data_sample.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() would stop the whole
    # kernel/interpreter, whereas an exception reports the problem in this
    # cell and keeps later cells from running on missing data.
    raise FileNotFoundError(
        "Error: 'review_data_sample.csv' not found. Please ensure the file is in the same directory or provide the correct path."
    ) from err

# Fail with a clear message if the label column is missing.
if 'label' not in sample_data.columns:
    raise KeyError("'label' column not found in 'review_data_sample.csv'.")

# Separate features and labels: 'label' is the target, every other column is
# used as a feature.
labels_encoded_filtered_main = sample_data['label'].values
main_features_filtered = sample_data.drop('label', axis=1).values

print(f"Loaded sample data with shape: {main_features_filtered.shape}")
print(f"Class distribution in loaded data: {pd.Series(labels_encoded_filtered_main).value_counts().to_dict()}")

# Initialize lists to store metrics from each run
all_val_f1_scores, all_val_auc_scores = [], []
all_test_f1_scores, all_test_auc_scores = [], []
all_external_cancer_accuracies = [] # This list seems to be for an external dataset not present in this code.

# Initialize X_train_scaled for potential later use (e.g., for feature names if needed)
X_train_scaled = None

In [None]:
# ## 5. Training and Evaluation Loop

# This is the main loop where the model is trained and evaluated across 10 different random seeds. For each seed:
# 1.  Data is split into training, validation, and test sets.
# 2.  SMOTE (Synthetic Minority Over-sampling Technique) is applied to the training data to handle class imbalance.
# 3.  Features are scaled using `StandardScaler`.
# 4.  The neural network model is built and trained with early stopping.
# 5.  Model performance is evaluated on the original training, validation, and test sets.
# 6.  Results are logged to a file.

# Start from empty result lists so that re-running this cell does not append
# a second set of scores to the ones collected by a previous run.
all_val_f1_scores, all_val_auc_scores = [], []
all_test_f1_scores, all_test_auc_scores = [], []

with open(log_file_path, 'w') as log_file:
    # Write header to the log file
    log_file.write("Seed\tSet\tMetric\tValue\n")

    # Loop through different random seeds for robust evaluation
    for seed in range(10):
        print(f"\n--- Seed {seed} ---")
        log_file.seed = seed # Attach seed to the file object; evaluate_and_log reads it back

        # Drop the Keras state left over from the previous iteration, then
        # re-seed every RNG with this iteration's seed. Without this, weight
        # initialisation, dropout and shuffling depend on how many models
        # were built before, so a single seed cannot be reproduced on its own.
        tf.keras.backend.clear_session()
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)

        # Split data into training (70%) and a temporary hold-out part (30%)
        X_train, X_temp, y_train, y_temp = train_test_split(
            main_features_filtered, labels_encoded_filtered_main,
            test_size=0.3, random_state=seed, stratify=labels_encoded_filtered_main
        )
        # Split the hold-out part: 67% of it becomes the test set and 33% the
        # validation set (the second returned pair is the `test_size` share).
        X_test, X_val, y_test, y_val = train_test_split(
            X_temp, y_temp, test_size=0.33, random_state=seed, stratify=y_temp
        )

        # Check for empty splits
        if X_train.shape[0] == 0 or X_val.shape[0] == 0 or X_test.shape[0] == 0:
            print(f"Warning: Seed {seed} resulted in an empty train/val/test set. Skipping this seed.")
            all_val_f1_scores.append(np.nan)
            all_val_auc_scores.append(np.nan)
            all_test_f1_scores.append(np.nan)
            all_test_auc_scores.append(np.nan)
            all_external_cancer_accuracies.append(np.nan)
            continue

        class_counts = Counter(y_train)
        smallest_class = min(class_counts.values())
        print(f"Class distribution before SMOTE (Train): {class_counts}")
        # Apply SMOTE to handle class imbalance in training data.
        # NOTE(review): SMOTE runs on the unscaled features here, so its
        # nearest-neighbour search is influenced by feature scale - confirm
        # this ordering is intended.
        if len(class_counts) < 2:
            print(f"Warning: Seed {seed} - Training data has only one class before SMOTE. SMOTE will not be applied.")
            X_train_smote, y_train_smote = X_train, y_train
        elif smallest_class < 2:
            # SMOTE interpolates between a sample and its neighbours of the
            # same class, which needs at least two minority samples.
            print(f"Warning: Seed {seed} - Minority class has only {smallest_class} training sample(s). SMOTE will not be applied.")
            X_train_smote, y_train_smote = X_train, y_train
        else:
            # Keep the default of 5 neighbours whenever possible; shrink it
            # for very small minority classes, where the default would raise.
            smote = SMOTE(random_state=seed, k_neighbors=min(5, smallest_class - 1))
            X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        print(f"Class distribution after SMOTE (Train): {Counter(y_train_smote)}")

        # Standardize features; the scaler is fitted on the (resampled)
        # training data only and then applied to validation and test data.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_smote)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        # Scale original training data (before SMOTE) for evaluation purposes
        X_train_original_scaled = scaler.transform(X_train)
        y_train_original = y_train

        current_input_dim = X_train_scaled.shape[1]
        print(f"Model input dimension (number of features): {current_input_dim}")

        # Build and compile the model
        model = build_model(current_input_dim)
        # Stop once the validation loss has not improved for 10 epochs and
        # roll back to the best weights seen.
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0)

        # Train the model
        history = model.fit(X_train_scaled, y_train_smote, epochs=200, batch_size=32,
                            validation_data=(X_val_scaled, y_val), verbose=0, callbacks=[early_stopping])

        # Evaluate and log results for different sets
        evaluate_and_log(model, X_train_original_scaled, y_train_original, f"Train_Original_Seed_{seed}", log_file)

        val_y_true, val_y_pred, _, val_loss, val_auc, val_acc, val_f1 = evaluate_and_log(model, X_val_scaled, y_val, f"Validation_Seed_{seed}", log_file)
        all_val_f1_scores.append(val_f1)
        all_val_auc_scores.append(val_auc)

        test_y_true, test_y_pred, _, test_loss, test_auc, test_acc, test_f1 = evaluate_and_log(model, X_test_scaled, y_test, f"Test_Seed_{seed}", log_file)
        all_test_f1_scores.append(test_f1)
        all_test_auc_scores.append(test_auc)

        # Save a confusion matrix plot of the test set for this seed
        plot_confusion_matrix(test_y_true, test_y_pred, labels=['0', '1'],
                              title=f'Confusion Matrix - Test Set (Seed {seed})',
                              filename=os.path.join(output_folder, f'confusion_matrix_test_seed_{seed}.png'))

In [None]:
# ## 6. Final Reporting

# This section calculates and prints the average F1 scores and AUC scores across all valid seeds for both validation and test sets, providing an overall summary of the model's performance.

def _without_nan(scores):
    """Return the entries of `scores` that are not NaN, as a new list."""
    return [score for score in scores if not np.isnan(score)]


def _print_average(heading, empty_message, f1_values, auc_values):
    """Print mean (+/- std) of F1 and AUC under `heading`, or `empty_message` if `f1_values` is empty."""
    print(f"\n--- Average {heading} Results over {len(f1_values)} valid seeds ---")
    if not f1_values:
        print(empty_message)
        return
    print(f"Average F1 Score: {np.mean(f1_values):.4f} (+/- {np.std(f1_values):.4f})")
    print(f"Average AUC: {np.mean(auc_values):.4f} (+/- {np.std(auc_values):.4f})")


# Skipped seeds are recorded as NaN; drop those entries before averaging.
all_val_f1_scores_cleaned = _without_nan(all_val_f1_scores)
all_val_auc_scores_cleaned = _without_nan(all_val_auc_scores)
all_test_f1_scores_cleaned = _without_nan(all_test_f1_scores)
all_test_auc_scores_cleaned = _without_nan(all_test_auc_scores)
valid_external_accuracies = _without_nan(all_external_cancer_accuracies)

# Summary over all valid seeds, first for the validation sets, then for the
# test sets.
_print_average(
    "Validation",
    "No valid validation results to average.",
    all_val_f1_scores_cleaned,
    all_val_auc_scores_cleaned,
)
_print_average(
    "Test Set",
    "No valid test results to average.",
    all_test_f1_scores_cleaned,
    all_test_auc_scores_cleaned,
)

print(f"\nEvaluation complete. Results logged to {log_file_path}")