In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.utils import class_weight, shuffle, resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, cohen_kappa_score, precision_score, recall_score, precision_recall_curve, auc, roc_auc_score, make_scorer

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K
from tensorflow.keras.metrics import Recall
from tensorflow.keras.metrics import Metric
#from focal_loss import SparseCategoricalFocalLoss
from tensorflow.keras.losses import Loss
from keras.losses import BinaryFocalCrossentropy
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
# LOADING DATA
def load_sample_data(var_path, ref_path, target_path):
    """Load the variant-read, reference-read and target tables of one sample.

    Parameters
    ----------
    var_path, ref_path : str
        CSV files read with their first column as the row index (the SNV ids).
    target_path : str
        CSV file that must contain the columns 'SNV' and 'MutationType'.

    Returns
    -------
    tuple
        ``(var_df_filtered, ref_df_filtered, labels)``: the two read tables
        without NaN-containing columns and without SNVs labelled -1, plus a
        NumPy array of 'MutationType' values ordered like the rows of the
        read tables.

    Raises
    ------
    AssertionError
        If the two read tables do not share the same index and columns.
    ValueError
        If the labels cannot be matched to the rows of the read tables.
    """
    var_df = pd.read_csv(var_path, index_col = 0)   # variant reads
    ref_df = pd.read_csv(ref_path, index_col = 0)   # reference reads
    target_df = pd.read_csv(target_path)            # target labels

    # handling missing values: a column holding a NaN in EITHER table is removed
    # from BOTH, so the two tables cannot fall out of sync
    nan_cols = set(var_df.columns[var_df.isna().any(axis = 0)]) | set(ref_df.columns[ref_df.isna().any(axis = 0)])
    var_df = var_df.drop(columns = list(nan_cols), errors = 'ignore')
    ref_df = ref_df.drop(columns = list(nan_cols), errors = 'ignore')

    assert var_df.index.equals(ref_df.index) and var_df.columns.equals(ref_df.columns)

    # filtering out the SNVs where MutationType is -1 (neither class)
    snvs_to_remove = target_df.loc[target_df['MutationType'] == -1, 'SNV']
    var_df_filtered = var_df[~var_df.index.isin(snvs_to_remove)]
    ref_df_filtered = ref_df[~ref_df.index.isin(snvs_to_remove)]
    target_df_filtered = target_df[~target_df['SNV'].isin(snvs_to_remove)]

    print('var_filt shape:', var_df_filtered.shape, 'ref_filt shape:', ref_df_filtered.shape, 'target_filt shape:', target_df_filtered.shape)

    if target_df_filtered['SNV'].is_unique:
        # align labels to the read tables by SNV id instead of trusting file order
        labels = target_df_filtered.set_index('SNV')['MutationType'].reindex(var_df_filtered.index)
        if labels.isna().any():
            raise ValueError('some SNVs in the read tables have no usable label in the target file')
        y = labels.values
    else:
        # duplicated SNV ids cannot be aligned by key; fall back to file order
        # but refuse to return label/feature arrays of different lengths
        if len(target_df_filtered) != len(var_df_filtered):
            raise ValueError('target rows cannot be matched to the rows of the read tables')
        y = target_df_filtered['MutationType'].values

    # NOTE: 'MutationType' is the label column for nanopore data; other data uses 'gsr'
    return var_df_filtered, ref_df_filtered, y

In [None]:
# DATA PREPROCESSING (WITHOUT BASE FEATURES)

def _n_components_for_variance(cumulative_variance, threshold):
    """Return the smallest component count whose cumulative variance reaches `threshold`, or all components if none does."""
    reached = np.flatnonzero(cumulative_variance >= threshold)
    if reached.size == 0:
        # threshold never reached: keep everything instead of silently collapsing to 1 component
        return len(cumulative_variance)
    return int(reached[0]) + 1


def preprocess_data(var_df, ref_df, y_arr, explained_variance_threshold = 0.70, apply_svd = True, sample_id = None, output_dir = None, random_state = 1):
    """Binarise labels, split 60/20/20, scale, and optionally reduce with TruncatedSVD.

    Parameters
    ----------
    var_df, ref_df : pandas.DataFrame
        Variant-read and reference-read tables with one row per SNV.
    y_arr : array-like
        Labels; value 1 is mapped to 0 (DNA) and value 2 to 1 (RNA). The
        caller's array is not modified.
    explained_variance_threshold : float
        Cumulative explained-variance ratio the kept SVD components must reach
        (only used when `apply_svd` is true).
    apply_svd : bool
        Whether to apply TruncatedSVD after scaling.
    sample_id, output_dir : str or None
        When both are given and SVD is applied, the explained-variance curve is
        saved as ``<output_dir>/<sample_id>_svd_explained_variance.png``.
    random_state : int
        Seed used for the shuffle, both splits and the SVD fits.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test, n_comp)`` where
        `n_comp` is the number of SVD components kept, or None without SVD.
    """
    # NOTE(review): this drops the first column of `.values`. When the frames come
    # from load_sample_data (read with index_col = 0) the SNV ids are already in the
    # index, so this discards the first data column - confirm that is intended.
    X_var = var_df.values[:, 1:]
    X_ref = ref_df.values[:, 1:]
    y_bin = np.asarray(y_arr).astype(int)   # astype copies, so the caller's labels stay untouched

    # converting classes to binary
    y_bin[y_bin == 1] = 0  # DNA class
    y_bin[y_bin == 2] = 1  # RNA class

    # shuffling the three arrays with one shared, seeded permutation
    X_var_shuffled, X_ref_shuffled, y_shuffled = shuffle(X_var, X_ref, y_bin, random_state = random_state)

    # 60-20-20 train-val-test split (0.25 of the remaining 80% = 20% validation)
    X_var_temp, X_var_test, X_ref_temp, X_ref_test, y_temp, y_test = train_test_split(
        X_var_shuffled, X_ref_shuffled, y_shuffled,
        test_size = 0.2, random_state = random_state, stratify = y_shuffled)

    X_var_train, X_var_val, X_ref_train, X_ref_val, y_train, y_val = train_test_split(
        X_var_temp, X_ref_temp, y_temp,
        test_size = 0.25, random_state = random_state, stratify = y_temp)

    # scaling the data (scalers are fitted on the training split only)
    scaler_var = StandardScaler()
    X_var_train_scaled = scaler_var.fit_transform(X_var_train)
    X_var_val_scaled = scaler_var.transform(X_var_val)
    X_var_test_scaled = scaler_var.transform(X_var_test)

    scaler_ref = StandardScaler()
    X_ref_train_scaled = scaler_ref.fit_transform(X_ref_train)
    X_ref_val_scaled = scaler_ref.transform(X_ref_val)
    X_ref_test_scaled = scaler_ref.transform(X_ref_test)

    # combining scaled data
    X_train_combined = np.hstack((X_var_train_scaled, X_ref_train_scaled))
    X_val_combined = np.hstack((X_var_val_scaled, X_ref_val_scaled))
    X_test_combined = np.hstack((X_var_test_scaled, X_ref_test_scaled))

    if not apply_svd:
        # If not applying SVD, just pass the scaled combined data forward
        return X_train_combined, X_val_combined, X_test_combined, y_train, y_val, y_test, None

    # dimensionality reduction using TruncatedSVD: a first fit (capped at 1000
    # components) is only used to read off the explained-variance curve
    initial_svd = TruncatedSVD(n_components = min(X_train_combined.shape[1], 1000), random_state = random_state)
    initial_svd.fit(X_train_combined)

    cumulative_variance = np.cumsum(initial_svd.explained_variance_ratio_)
    n_comp = _n_components_for_variance(cumulative_variance, explained_variance_threshold)

    svd_optimal = TruncatedSVD(n_components = n_comp, random_state = random_state)
    X_train_svd = svd_optimal.fit_transform(X_train_combined)
    X_val_svd = svd_optimal.transform(X_val_combined)
    X_test_svd = svd_optimal.transform(X_test_combined)

    if output_dir and sample_id:
        plt.figure(figsize = (7, 5))
        plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker = 'o', linestyle = '--')
        plt.xlabel('Number of components')
        plt.ylabel('Cumulative explained variance')
        plt.title(f'TruncatedSVD explained variance - {sample_id}')
        plt.axvline(n_comp, color = 'r', linestyle = '--',
                    label = f'{round(explained_variance_threshold * 100)}% variance ({n_comp} components)')
        plt.legend()
        plt_path = os.path.join(output_dir, f'{sample_id}_svd_explained_variance.png')
        plt.savefig(plt_path, dpi = 500)
        plt.close()

    return X_train_svd, X_val_svd, X_test_svd, y_train, y_val, y_test, n_comp

In [None]:
# class weights function  (either use this or SMOTE oversampling)
def compute_class_weights(y_train):
    """Return 'balanced' class weights keyed by class label.

    Parameters
    ----------
    y_train : array-like
        Training labels.

    Returns
    -------
    dict
        Maps each distinct label in `y_train` to its balanced weight. Keys are
        the labels themselves, not their position in the sorted label list, so
        the mapping stays correct when labels are not exactly 0..k-1.
    """
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight(class_weight = 'balanced',
                                                classes = classes,
                                                y = y_train)
    # compute_class_weight returns weights in the order of `classes`
    return {label.item(): float(weight) for label, weight in zip(classes, weights)}

In [None]:
def focal_loss(alpha=0.25, gamma=2.0):
    """Build a binary focal-loss function for `model.compile(loss=...)`.

    The element-wise loss is ``-alpha * (1 - pt) ** gamma * log(pt)``, where
    ``pt`` is the predicted probability of the true class. `alpha` scales both
    classes by the same factor (it is not a class-dependent weight).

    Parameters
    ----------
    alpha : float
        Constant multiplier applied to every loss term.
    gamma : float
        Focusing exponent; larger values down-weight well-classified samples.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` returning the loss averaged over the last
        axis, i.e. one value per sample for ``(batch, 1)`` predictions.
    """
    def loss(y_true, y_pred):
        # Clip predictions to avoid log(0) error
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # Integer labels must match the prediction dtype before the arithmetic below
        y_true = K.cast(y_true, y_pred.dtype)

        # Calculate focal loss
        pt = y_true * y_pred + (1 - y_true) * (1 - y_pred)  # Probabilities of true class
        fl = -alpha * (1 - pt) ** gamma * K.log(pt)
        # Reduce over the output axis only: keeping one value per sample lets
        # sample/class weights be applied before the batch is averaged
        return K.mean(fl, axis=-1)
    return loss

In [None]:
# Fixed Neural Network Architecture (back and forth between shallow and deeper network)

# def build_fixed_model(input_shape):
#     model = Sequential()
#     model.add(layers.Input(shape = (input_shape,)))

#     model.add(layers.Dense(800, activation = 'relu'))
#     model.add(layers.BatchNormalization())
#     model.add(layers.Dropout(0.3))

#     model.add(layers.Dense(400, activation = 'relu'))
#     model.add(layers.BatchNormalization())
#     model.add(layers.Dropout(0.3))

#     model.add(layers.Dense(200, activation = 'relu'))
#     model.add(layers.BatchNormalization())
#     model.add(layers.Dropout(0.3))

#     model.add(layers.Dense(1, activation = 'sigmoid'))

#     model.compile(
#         optimizer = Adam(learning_rate = 0.001),
#         loss = 'binary_crossentropy',
#         metrics = [
#             tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
#             tf.keras.metrics.Precision(name = 'precision'),
#             tf.keras.metrics.Recall(name = 'recall'),
#             tf.keras.metrics.AUC(name = 'auc'),
#         ]
#     )

#     return model


def build_fixed_model(input_shape, hidden_units = (3200, 1600, 800, 400, 200, 100), dropout_rate = 0.3, learning_rate = 0.001, loss = None):
    """Build and compile a fully connected binary classifier.

    Each hidden stage is Dense(relu) -> BatchNormalization -> Dropout, followed
    by a single sigmoid output unit. With the default arguments this is the
    six-stage network trained with focal loss; a shallower variant is, e.g.,
    ``build_fixed_model(n, hidden_units = (800, 400, 200), loss = 'binary_crossentropy')``.

    Parameters
    ----------
    input_shape : int
        Number of input features.
    hidden_units : sequence of int
        Width of each hidden Dense layer, in order.
    dropout_rate : float
        Dropout rate applied after every hidden stage.
    learning_rate : float
        Adam learning rate.
    loss : str, callable or None
        Loss passed to `compile`; None selects ``focal_loss(alpha = 0.25, gamma = 2.0)``.

    Returns
    -------
    tensorflow.keras.Sequential
        The compiled model, tracking accuracy, precision, recall and AUC.
    """
    if loss is None:
        loss = focal_loss(alpha = 0.25, gamma = 2.0)  # using Focal Loss

    model = Sequential()
    model.add(layers.Input(shape = (input_shape,)))

    for units in hidden_units:
        model.add(layers.Dense(units, activation = 'relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))

    model.add(layers.Dense(1, activation = 'sigmoid'))

    model.compile(
        optimizer = Adam(learning_rate = learning_rate),
        loss = loss,
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
            tf.keras.metrics.Precision(name = 'precision'),
            tf.keras.metrics.Recall(name = 'recall'),
            tf.keras.metrics.AUC(name = 'auc'),
        ]
    )

    return model

In [None]:
def evaluate_model(model, X_test, y_test, sample_id, output_dir):
    """Score a binary classifier on the test split and save its confusion matrix.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method returning one probability per row.
    X_test, y_test : array-like
        Test features and their 0/1 labels.
    sample_id : str
        Used in the plot title and in the image file name.
    output_dir : str
        Directory for ``<sample_id>_conf_mat.png``; created if missing.

    Returns
    -------
    tuple
        ``(f1_macro, kappa, precision_macro, recall_macro, roc_auc, conf_matrix)``.
    """
    # flatten so that (n, 1) model output and (n,) labels line up unambiguously
    y_true = np.asarray(y_test).ravel()
    y_pred_probs = np.asarray(model.predict(X_test)).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    # evaluation metrics
    # labels are pinned so the matrix is always 2x2 and matches the DNA/RNA tick labels
    conf_matrix = confusion_matrix(y_true, y_pred, labels = [0, 1])
    f1_macro = f1_score(y_true, y_pred, average = 'macro')
    precision_macro = precision_score(y_true, y_pred, average = 'macro')
    recall_macro = recall_score(y_true, y_pred, average = 'macro')
    roc_auc = roc_auc_score(y_true, y_pred_probs)
    kappa = cohen_kappa_score(y_true, y_pred)

    os.makedirs(output_dir, exist_ok = True)
    fig, ax = plt.subplots(figsize = (7, 5))
    sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap = 'Blues',
                xticklabels = ['DNA', 'RNA'], yticklabels = ['DNA', 'RNA'], ax = ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix for {sample_id}')
    image_path = os.path.join(output_dir, f'{sample_id}_conf_mat.png')
    fig.savefig(image_path, dpi = 300)
    plt.close(fig)   # this runs once per sample/setting, so release the figure

    return f1_macro, kappa, precision_macro, recall_macro, roc_auc, conf_matrix

In [None]:
# Main Execution
# For every sample, train and evaluate the fixed network once per SVD setting
# (each explained-variance threshold, plus one run without SVD), then write all
# metrics to a single CSV under `output_dir_base`.
output_dir_base = "nn_binary_results_focal/nn_expanded_architecture_focal"
os.makedirs(output_dir_base, exist_ok = True)

sample_ids = ['nanopore_SRR21492154', 'nanopore_SRR21492155', 'nanopore_SRR21492156',
             'nanopore_SRR21492157', 'nanopore_SRR21492158', 'nanopore_SRR21492159']

explained_variance_thresholds = [0.95]

all_results = []

for sample_id in sample_ids:
    var_path = f"{sample_id}_varreads.csv"
    ref_path = f"{sample_id}_refreads.csv"
    target_path = f"{sample_id}_targets.csv"
    print(f"\nProcessing sample: {sample_id}")

    # load data
    var_df, ref_df, y_arr = load_sample_data(var_path, ref_path, target_path)

    # None stands for the run without SVD
    for explained_variance_threshold in explained_variance_thresholds + [None]:
        use_svd = explained_variance_threshold is not None
        if use_svd:
            # round() rather than int(): int() truncates products such as 0.57 * 100
            output_dir = os.path.join(output_dir_base, f"svd_{round(explained_variance_threshold * 100)}_variance")
        else:
            output_dir = os.path.join(output_dir_base, "no_svd")
        os.makedirs(output_dir, exist_ok = True)

        # preprocessing call: SVD is applied exactly when a threshold is given, so the
        # results written to an "svd_*" directory really come from the SVD pipeline
        X_train_combined_final, X_val_combined_final, X_test_combined_final, y_train, y_val, y_test, n_comp = preprocess_data(
            var_df, ref_df, y_arr,
            explained_variance_threshold = explained_variance_threshold,
            apply_svd = use_svd,
            sample_id = sample_id, output_dir = output_dir)

        # model training and evaluation calls
        K.clear_session()   # drop Keras state left over from the previous model in this loop
        model = build_fixed_model(X_train_combined_final.shape[1])

        callbacks = [EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True),
                     TensorBoard(log_dir = os.path.join(output_dir, "logs", sample_id))]

        model.fit(X_train_combined_final, y_train,
                  validation_data = (X_val_combined_final, y_val),
                  epochs = 50, callbacks = callbacks, verbose = 2)

        f1_macro, kappa, precision_macro, recall_macro, roc_auc, conf_matrix = evaluate_model(model, X_test_combined_final, y_test, sample_id, output_dir)

        all_results.append({
            'sample_id': sample_id,
            'explained_variance_threshold': explained_variance_threshold if use_svd else 'No SVD',
            'n_comp': n_comp,   # None for the run without SVD
            'f1_macro': f1_macro,
            'kappa': kappa,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'roc_auc': roc_auc,
        })

results_df = pd.DataFrame(all_results)
results_csv_path = os.path.join(output_dir_base, 'nn_expanded_architecture_results.csv')  # back and forth between shallow and deeper network
results_df.to_csv(results_csv_path, index = False)
print("All results saved to CSV.")


Processing sample: nanopore_SRR21492154
var_filt shape: (5117, 1192) ref_filt shape: (5117, 1192) target_filt shape: (5117, 2)
Epoch 1/50
96/96 - 29s - 304ms/step - accuracy: 0.7058 - auc: 0.6814 - loss: 0.0757 - precision: 0.3386 - recall: 0.5429 - val_accuracy: 0.2559 - val_auc: 0.3063 - val_loss: 0.4349 - val_precision: 0.1723 - val_recall: 0.7437
Epoch 2/50
96/96 - 38s - 391ms/step - accuracy: 0.7937 - auc: 0.7293 - loss: 0.0471 - precision: 0.4675 - recall: 0.4588 - val_accuracy: 0.8281 - val_auc: 0.7319 - val_loss: 0.0343 - val_precision: 0.5762 - val_recall: 0.4372
Epoch 3/50
96/96 - 20s - 212ms/step - accuracy: 0.8286 - auc: 0.7946 - loss: 0.0385 - precision: 0.5628 - recall: 0.5193 - val_accuracy: 0.8574 - val_auc: 0.8848 - val_loss: 0.0223 - val_precision: 0.7227 - val_recall: 0.4322
Epoch 4/50
96/96 - 21s - 218ms/step - accuracy: 0.8276 - auc: 0.8211 - loss: 0.0340 - precision: 0.5623 - recall: 0.5008 - val_accuracy: 0.8838 - val_auc: 0.9039 - val_loss: 0.0245 - val_precisi



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 160ms/step

Processing sample: nanopore_SRR21492156
var_filt shape: (2888, 3836) ref_filt shape: (2888, 3836) target_filt shape: (2888, 2)
Epoch 1/50
55/55 - 40s - 721ms/step - accuracy: 0.6478 - auc: 0.6830 - loss: 0.0846 - precision: 0.2993 - recall: 0.6119 - val_accuracy: 0.8270 - val_auc: 0.7400 - val_loss: 0.0884 - val_precision: 0.9286 - val_recall: 0.1161
Epoch 2/50
55/55 - 41s - 738ms/step - accuracy: 0.7535 - auc: 0.7421 - loss: 0.0469 - precision: 0.4030 - recall: 0.5701 - val_accuracy: 0.3841 - val_auc: 0.4952 - val_loss: 0.2105 - val_precision: 0.1935 - val_recall: 0.6875
Epoch 3/50
55/55 - 33s - 603ms/step - accuracy: 0.8112 - auc: 0.8236 - loss: 0.0357 - precision: 0.5101 - recall: 0.6060 - val_accuracy: 0.8772 - val_auc: 0.8210 - val_loss: 0.0315 - val_precision: 0.7470 - val_recall: 0.5536
Epoch 4/50
55/55 - 40s - 730ms/step - accuracy: 0.8718 - auc: 0.8962 - loss: 0.0270 - precision: 0.6540 - recall: 0.7164 



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 63ms/step
Epoch 1/50
55/55 - 39s - 702ms/step - accuracy: 0.6501 - auc: 0.6677 - loss: 0.0840 - precision: 0.3010 - recall: 0.6119 - val_accuracy: 0.3080 - val_auc: 0.6470 - val_loss: 0.4386 - val_precision: 0.2131 - val_recall: 0.9554
Epoch 2/50
55/55 - 43s - 773ms/step - accuracy: 0.7714 - auc: 0.7447 - loss: 0.0466 - precision: 0.4289 - recall: 0.5493 - val_accuracy: 0.4983 - val_auc: 0.7619 - val_loss: 0.0970 - val_precision: 0.2694 - val_recall: 0.9286
Epoch 3/50
55/55 - 39s - 709ms/step - accuracy: 0.8308 - auc: 0.8315 - loss: 0.0367 - precision: 0.5536 - recall: 0.6478 - val_accuracy: 0.8754 - val_auc: 0.8905 - val_loss: 0.0301 - val_precision: 0.7564 - val_recall: 0.5268
Epoch 4/50
55/55 - 41s - 745ms/step - accuracy: 0.8759 - auc: 0.8814 - loss: 0.0283 - precision: 0.6734 - recall: 0.6955 - val_accuracy: 0.8997 - val_auc: 0.9096 - val_loss: 0.0199 - val_precision: 0.8553 - val_recall: 0.5804
Epoch 5/50
55/55 - 40s