In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.utils import class_weight, shuffle, resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, cohen_kappa_score, precision_score, recall_score, precision_recall_curve, auc, roc_auc_score, make_scorer

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K
from tensorflow.keras.metrics import Recall
from tensorflow.keras.metrics import Metric
#from focal_loss import SparseCategoricalFocalLoss
from tensorflow.keras.losses import Loss
from keras.losses import BinaryFocalCrossentropy
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
# LOADING DATA
def load_sample_data(var_path, ref_path, target_path):
    """Read one sample's read tables and labels, discarding unlabeled SNVs.

    Parameters
    ----------
    var_path : str
        CSV of variant reads; its first column is used as the row index.
    ref_path : str
        CSV of reference reads; its first column is used as the row index.
    target_path : str
        CSV of labels containing the columns 'SNV' and 'MutationType'.

    Returns
    -------
    tuple
        ``(var_df, ref_df, labels)`` where the two DataFrames no longer
        contain rows whose index appears among the SNVs labeled -1, and
        ``labels`` is a NumPy array of the remaining 'MutationType' values.

    Raises
    ------
    AssertionError
        If the two read tables do not have identical index and columns once
        columns containing missing values have been removed.
    """
    # Any column holding at least one NaN is removed from its own table.
    variant_reads = pd.read_csv(var_path, index_col=0).dropna(axis=1, how='any')
    reference_reads = pd.read_csv(ref_path, index_col=0).dropna(axis=1, how='any')
    labels = pd.read_csv(target_path)

    # Both tables must still describe the same SNVs and the same features.
    assert variant_reads.index.equals(reference_reads.index)
    assert variant_reads.columns.equals(reference_reads.columns)

    # SNVs with MutationType == -1 belong to neither class and are excluded.
    unlabeled_snvs = labels.loc[labels['MutationType'] == -1, 'SNV']
    keep_variant_rows = ~variant_reads.index.isin(unlabeled_snvs)
    keep_reference_rows = ~reference_reads.index.isin(unlabeled_snvs)
    keep_label_rows = ~labels['SNV'].isin(unlabeled_snvs)

    variant_kept = variant_reads.loc[keep_variant_rows]
    reference_kept = reference_reads.loc[keep_reference_rows]
    labels_kept = labels.loc[keep_label_rows]

    print('var_filt shape:', variant_kept.shape, 'ref_filt shape:', reference_kept.shape, 'target_filt shape:', labels_kept.shape)

    return variant_kept, reference_kept, labels_kept['MutationType'].values

In [None]:
def _standardize_splits(train, val, test):
    """Fit a StandardScaler on ``train`` only and apply it to all three splits."""
    scaler = StandardScaler()
    return scaler.fit_transform(train), scaler.transform(val), scaler.transform(test)


def preprocess_data(var_df, ref_df, y_arr, explained_variance_threshold=None, apply_svd=False, sample_id=None, output_dir=None, random_state=1):
    """Binarize labels, split into train/val/test, scale, and stack the two read tables.

    Parameters
    ----------
    var_df, ref_df : pandas.DataFrame
        Variant-read and reference-read tables with one row per SNV. The first
        column of each frame is skipped (see the review note in the body).
    y_arr : array-like
        Labels where 1 is mapped to 0 (DNA class) and 2 is mapped to 1 (RNA
        class). The input array is not modified.
    explained_variance_threshold, apply_svd, sample_id, output_dir
        Accepted for interface compatibility but NOT used: no dimensionality
        reduction is performed by this function. A warning is emitted when
        ``apply_svd`` is true so the omission is not silent.
    random_state : int or None, default 1
        Seed used for the shuffle and for both stratified splits, so repeated
        calls on the same data yield the same partition. Pass ``None`` for a
        non-deterministic partition.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``. Each ``X_*`` has
        shape (samples, features, 2) with the scaled variant reads in channel 0
        and the scaled reference reads in channel 1. The split is 60/20/20
        (20% test, then 25% of the remainder as validation), stratified on the
        binary label.
    """
    if apply_svd:
        import warnings
        warnings.warn("preprocess_data: apply_svd=True was requested but SVD is not implemented; "
                      "features are returned without dimensionality reduction.")

    # NOTE(review): the first column of each frame is dropped here on the
    # assumption that it holds the SNV identifier. load_sample_data already
    # moves the first CSV column into the index, so this may discard a real
    # feature column -- confirm against the CSV layout.
    X_var = var_df.values[:, 1:]
    X_ref = ref_df.values[:, 1:]

    # astype returns a copy, so the caller's label array stays untouched.
    y_binary = np.asarray(y_arr).astype(int)
    y_binary[y_binary == 1] = 0  # DNA class
    y_binary[y_binary == 2] = 1  # RNA class

    # Shuffle the three arrays with one shared, seeded permutation. Passing
    # them separately keeps the labels integer instead of routing them through
    # a stacked feature matrix.
    X_var_shuffled, X_ref_shuffled, y_shuffled = shuffle(X_var, X_ref, y_binary, random_state = random_state)

    # train-val-test split: 20% test, then 25% of the remaining 80% as validation
    X_var_temp, X_var_test, X_ref_temp, X_ref_test, y_temp, y_test = train_test_split(
        X_var_shuffled, X_ref_shuffled, y_shuffled,
        test_size = 0.2, random_state = random_state, stratify = y_shuffled)

    X_var_train, X_var_val, X_ref_train, X_ref_val, y_train, y_val = train_test_split(
        X_var_temp, X_ref_temp, y_temp,
        test_size = 0.25, random_state = random_state, stratify = y_temp)

    # Scale each read table independently; statistics come from the training
    # split only so no information leaks from validation/test data.
    X_var_train_scaled, X_var_val_scaled, X_var_test_scaled = _standardize_splits(X_var_train, X_var_val, X_var_test)
    X_ref_train_scaled, X_ref_val_scaled, X_ref_test_scaled = _standardize_splits(X_ref_train, X_ref_val, X_ref_test)

    # Combine varreads and refreads into 3D tensors (samples, features, channels)
    X_train_combined = np.stack((X_var_train_scaled, X_ref_train_scaled), axis = -1)
    X_val_combined = np.stack((X_var_val_scaled, X_ref_val_scaled), axis = -1)
    X_test_combined = np.stack((X_var_test_scaled, X_ref_test_scaled), axis = -1)

    return X_train_combined, X_val_combined, X_test_combined, y_train, y_val, y_test

In [None]:
def build_1x1_cnn_model(input_shape):
    """Build and compile the fixed 1x1-convolution binary classifier.

    Architecture, in order:
      * three blocks of Conv1D(256 filters, kernel size 1, ReLU)
        -> BatchNormalization -> Dropout(0.3)
      * GlobalAveragePooling1D
      * two blocks of Dense(ReLU) -> BatchNormalization -> Dropout(0.4),
        with 128 and then 64 units
      * a single sigmoid output unit

    The model is compiled with Adam (learning rate 0.001), the notebook's
    ``focal_loss(alpha=0.25, gamma=2.0)``, and accuracy / precision / recall /
    AUC metrics.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed unchanged to ``layers.Input``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    conv_block_count = 3
    dense_block_units = (128, 64)

    network = Sequential()
    network.add(layers.Input(shape = input_shape))

    # stacked 1x1 convolution blocks
    for _ in range(conv_block_count):
        network.add(layers.Conv1D(filters = 256, kernel_size = 1, activation = 'relu'))
        network.add(layers.BatchNormalization())
        network.add(layers.Dropout(0.3))

    network.add(layers.GlobalAveragePooling1D())

    # fully connected head
    for units in dense_block_units:
        network.add(layers.Dense(units, activation = 'relu'))
        network.add(layers.BatchNormalization())
        network.add(layers.Dropout(0.4))

    network.add(layers.Dense(1, activation = 'sigmoid'))

    optimizer = Adam(learning_rate = 0.001)
    loss_fn = focal_loss(alpha = 0.25, gamma = 2.0)
    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
        tf.keras.metrics.Precision(name = 'precision'),
        tf.keras.metrics.Recall(name = 'recall'),
        tf.keras.metrics.AUC(name = 'auc'),
    ]
    network.compile(optimizer = optimizer, loss = loss_fn, metrics = tracked_metrics)

    return network

In [None]:
def evaluate_model(model, X_test, y_test, sample_id, output_dir):
    """Score a binary classifier on the test split and save its confusion matrix plot.

    Parameters
    ----------
    model : object exposing ``predict(X)`` that returns one probability per sample.
    X_test : array-like of test inputs, passed unchanged to ``model.predict``.
    y_test : array-like of binary labels (0 = DNA, 1 = RNA).
    sample_id : str
        Used in the plot title and in the output file name
        ``{sample_id}_conf_mat.png``.
    output_dir : str
        Directory that receives the PNG; it is created if it does not exist.

    Returns
    -------
    tuple
        ``(f1_macro, kappa, precision_macro, recall_macro, roc_auc, conf_matrix)``.
        ``conf_matrix`` is always 2x2 (rows = actual, columns = predicted, in
        the order DNA, RNA).

    Raises
    ------
    ValueError
        Propagated from ``roc_auc_score`` when ``y_test`` contains one class only.
    """
    # Flatten so labels and scores are plain 1-D vectors regardless of whether
    # the model emits shape (n, 1) or (n,).
    y_true = np.asarray(y_test).ravel()
    y_pred_probs = np.asarray(model.predict(X_test)).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    # evaluation metrics
    # Fixing labels keeps the matrix 2x2 even when a class is absent from both
    # y_true and y_pred, so it always matches the two tick labels below.
    conf_matrix = confusion_matrix(y_true, y_pred, labels = [0, 1])
    f1_macro = f1_score(y_true, y_pred, average = 'macro')
    precision_macro = precision_score(y_true, y_pred, average = 'macro')
    recall_macro = recall_score(y_true, y_pred, average = 'macro')
    roc_auc = roc_auc_score(y_true, y_pred_probs)
    kappa = cohen_kappa_score(y_true, y_pred)

    os.makedirs(output_dir, exist_ok = True)
    image_path = os.path.join(output_dir, f'{sample_id}_conf_mat.png')

    fig, ax = plt.subplots(figsize = (7, 5))
    try:
        sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap = 'Blues',
                    xticklabels = ['DNA', 'RNA'], yticklabels = ['DNA', 'RNA'], ax = ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_title(f'Confusion Matrix for {sample_id}')
        fig.savefig(image_path, dpi = 300)
    finally:
        # Always release the figure; this function is called once per sample.
        plt.close(fig)

    return f1_macro, kappa, precision_macro, recall_macro, roc_auc, conf_matrix

In [None]:
def focal_loss(alpha = 0.25, gamma = 2.0, class_balanced = False):
    """Create a binary focal-loss function usable as a Keras ``loss``.

    For each prediction the loss is ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``
    where ``p_t`` is the predicted probability of the true class.

    Parameters
    ----------
    alpha : float, default 0.25
        Weighting factor. With ``class_balanced=False`` it multiplies every
        sample's loss identically, i.e. it only rescales the loss and does not
        re-balance the classes.
    gamma : float, default 2.0
        Focusing exponent; larger values down-weight well-classified samples.
    class_balanced : bool, default False
        When true, positives (label 1) are weighted by ``alpha`` and negatives
        (label 0) by ``1 - alpha``. The default keeps the uniform weighting.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` returning the focal loss averaged over the
        last axis, i.e. one value per sample for predictions of shape
        (batch, 1). Keeping the batch axis lets Keras apply ``sample_weight``
        / ``class_weight`` per sample before its own batch reduction; without
        weights the final scalar equals the mean over all elements.
    """
    def loss(y_true, y_pred):
        y_pred = tf.convert_to_tensor(y_pred)
        # Integer labels cannot be multiplied with float predictions, and a
        # (batch,) label vector would broadcast against (batch, 1) predictions
        # into a (batch, batch) matrix -- align dtype and shape explicitly.
        y_true = tf.cast(y_true, y_pred.dtype)
        y_true = tf.reshape(y_true, tf.shape(y_pred))

        # clipping predictions to avoid log(0) error
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())

        # probability assigned to the true class
        pt = y_true * y_pred + (1 - y_true) * (1 - y_pred)

        if class_balanced:
            alpha_t = y_true * alpha + (1 - y_true) * (1 - alpha)
        else:
            alpha_t = alpha

        fl = -alpha_t * (1 - pt) ** gamma * K.log(pt)
        return K.mean(fl, axis=-1)
    return loss

In [None]:
# Driver: for every sample, load the data, train the fixed 1x1 CNN with focal
# loss, evaluate it on the held-out split, and collect the metrics into one CSV.
output_dir_base = "cnn_binary_results_focal/cnn_fixed_architecture_focal"

sample_ids = ['nanopore_SRR21492154', 'nanopore_SRR21492155', 'nanopore_SRR21492156', 'nanopore_SRR21492158', 'nanopore_SRR21492159']

# Not used by this run (preprocess_data is called with apply_svd = False);
# kept defined in case other cells reference it.
explained_variance_thresholds = [0.95]

# Seed applied before each sample so every run is reproducible and each
# sample's result does not depend on the order of the samples.
SEED = 1

# No SVD is applied, so every artifact of this run goes to the "no_svd" folder.
output_dir = os.path.join(output_dir_base, "no_svd")
os.makedirs(output_dir, exist_ok = True)

all_results = []

for sample_id in sample_ids:
    var_path = f"{sample_id}_varreads.csv"
    ref_path = f"{sample_id}_refreads.csv"
    target_path = f"{sample_id}_targets.csv"
    print(f"\nProcessing sample: {sample_id}")

    # Drop the previous sample's model state so memory does not accumulate
    # across iterations, then seed Python, NumPy and TensorFlow.
    K.clear_session()
    tf.keras.utils.set_random_seed(SEED)

    # load data
    var_df, ref_df, y_arr = load_sample_data(var_path, ref_path, target_path)

    # preprocessing call
    X_train_combined_final, X_val_combined_final, X_test_combined_final, y_train, y_val, y_test = preprocess_data(var_df, ref_df, y_arr, apply_svd = False)

    print('input shape:', X_train_combined_final.shape[1:])

    # model training and evaluation calls
    model = build_1x1_cnn_model(X_train_combined_final.shape[1:])

    callbacks = [EarlyStopping(monitor = 'val_loss', min_delta = 0.001, patience = 10, restore_best_weights = True),
                 TensorBoard(log_dir = os.path.join(output_dir_base, f"logs/{sample_id}"))]

    model.fit(X_train_combined_final, y_train,
              validation_data = (X_val_combined_final, y_val),
              epochs = 50, callbacks = callbacks, verbose = 2)

    f1_macro, kappa, precision_macro, recall_macro, roc_auc, conf_matrix = evaluate_model(model, X_test_combined_final,
                                                                                          y_test, sample_id, output_dir)

    all_results.append({
        'sample_id': sample_id,
        'f1_macro': f1_macro,
        'kappa': kappa,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'roc_auc': roc_auc,
    })

results_df = pd.DataFrame(all_results)
results_csv_path = os.path.join(output_dir_base, 'cnn_1x1_results.csv')
results_df.to_csv(results_csv_path, index = False)
print("All results saved to CSV.")