In [1]:
import argparse
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tabulate import tabulate
from sklearn.model_selection import KFold

from model import BertCustomBinaryClassifier
from utils.ensemble_utils import make_predictions
from utils.evaluate_metrics import evaluate_metrics
from utils.data_preprocessing import load_dataset

In [2]:
import logging

# Silence the weight-initialisation warnings that Hugging Face emits when a
# checkpoint is loaded; only errors from that module are shown.
# The logger name must match the emitting module ("transformers.modeling_utils"),
# otherwise setLevel() configures an unrelated logger and has no effect.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [3]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Build an argparse namespace holding the notebook's default settings.
# An empty argument list is parsed so the kernel's own command line is ignored.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--batch_size",
    default=64,
    type=int,
    help="",
)
parser.add_argument(
    "--max_length",
    default=200,
    type=int,
    help="",
)
args = parser.parse_args([])

# **Cross-validation**

## **5-fold**

In [None]:
# 5-fold evaluation of the saved identifier checkpoints.
#
# For every k-mer size the saved checkpoint is loaded and scored on the
# train/validation partitions produced by KFold. No training happens in this
# cell, so every fold evaluates the same weights on a different split.
threshold = 0.50  # only reported in the header below; not passed to any call in this cell
num_folds = 5
kmer_values = [3, 4, 5, 6]
# NOTE(review): the saved output of this cell and the 10-fold cell both use
# "2025-02-27" (no "_V1" suffix) — confirm which checkpoint directory is intended.
model_date = "2025-02-27_V1"
train_predictions_list, test_predictions_list = [], []
train_labels_list, test_labels_list = [], []

print(f"Threshold: {threshold}")
print(f"Identifier model date: {model_date}")
print(f"Number of folds: {num_folds}\n")

results = []
# Fixed seed so the fold assignment is reproducible across runs.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for kmer in kmer_values:
    args.model_path = f"./outputs/identifier_models/{model_date}/{kmer}-mer"
    args.train_data_path = f"./data/enhancer_identification/{kmer}-mer_identification_train.txt"

    # Load dataset
    dataset = load_dataset(args, validation=False)
    dataset_size = len(dataset)
    dataset_indices = np.arange(dataset_size)

    print(f"Processing {kmer}-mer:")

    # The checkpoint path depends only on the k-mer size, so the model is
    # loaded once per k-mer instead of being re-read from disk for every fold.
    model = BertCustomBinaryClassifier.from_pretrained(args.model_path, num_labels=1).to(device)

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_indices)):
        # Create train and validation subsets
        train_subset = [dataset[i] for i in train_idx]
        val_subset = [dataset[i] for i in val_idx]

        # Create data loaders (no shuffling: order is irrelevant for scoring)
        train_dataloader = DataLoader(train_subset, batch_size=args.batch_size, shuffle=False)
        val_dataloader = DataLoader(val_subset, batch_size=args.batch_size, shuffle=False)

        # Train dataset prediction
        train_predictions, train_labels = make_predictions(model, train_dataloader, kmer=kmer)
        train_predictions_list.append(train_predictions)
        train_labels_list.append(train_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(train_predictions, train_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Train",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Validation dataset prediction
        val_predictions, val_labels = make_predictions(model, val_dataloader, kmer=kmer)
        test_predictions_list.append(val_predictions)
        test_labels_list.append(val_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(val_predictions, val_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Validation",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Progress line reports the validation metrics computed just above.
        print(f"Validation {kmer}-mer - Fold {fold + 1}: ACC={acc:.4f}, SN={sn:.4f}, SP={sp:.4f}, MCC={mcc:.4f}, AUC={auc:.4f}")
    print("\n")

Threshold: 0.5
Identifier model date: 2025-02-27
Number of folds: 5

Processing 3-mer:
Validation 3-mer - Fold 1: ACC=0.8939, SN=0.8253, SP=0.9603, MCC=0.7943, AUC=0.8925
Validation 3-mer - Fold 2: ACC=0.8822, SN=0.8201, SP=0.9410, MCC=0.7685, AUC=0.8837
Validation 3-mer - Fold 3: ACC=0.9158, SN=0.8810, SP=0.9541, MCC=0.8346, AUC=0.9073
Validation 3-mer - Fold 4: ACC=0.9140, SN=0.8850, SP=0.9412, MCC=0.8286, AUC=0.9237
Validation 3-mer - Fold 5: ACC=0.8938, SN=0.8262, SP=0.9653, MCC=0.7966, AUC=0.9017


Processing 4-mer:
Validation 4-mer - Fold 1: ACC=0.8939, SN=0.8356, SP=0.9503, MCC=0.7923, AUC=0.9043
Validation 4-mer - Fold 2: ACC=0.8973, SN=0.8339, SP=0.9574, MCC=0.7994, AUC=0.9202
Validation 4-mer - Fold 3: ACC=0.9057, SN=0.8521, SP=0.9647, MCC=0.8181, AUC=0.9074
Validation 4-mer - Fold 4: ACC=0.9292, SN=0.8885, SP=0.9673, MCC=0.8603, AUC=0.9385
Validation 4-mer - Fold 5: ACC=0.9073, SN=0.8426, SP=0.9757, MCC=0.8229, AUC=0.9162


Processing 5-mer:
Validation 5-mer - Fold 1: ACC=0.

In [6]:
# Collect the per-fold metric records into a table with a fixed column order.
results_5_df = pd.DataFrame(
    data=results,
    columns=[
        "k-mer", "Dataset", "Fold",
        "Accuracy", "Sensitivity", "Specificity", "MCC", "AUC",
    ],
)

In [7]:
# Keep only the rows whose Dataset label contains "Train" and print them as a grid.
training_5_df = results_5_df.loc[results_5_df['Dataset'].str.contains("Train")]
print(f"{num_folds}-fold training results:")
print(
    tabulate(
        training_5_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

5-fold training results:
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |      1 |     0.9014 |        0.8532 |        0.9501 | 0.8068 | 0.9037 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      2 |     0.9044 |        0.8544 |        0.9550 | 0.8131 | 0.9059 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      3 |     0.8960 |        0.8389 |        0.9517 | 0.7965 | 0.9000 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      4 |     0.8964 |        0.8388 |        0.9550 | 0.7986 | 0.8964 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Tra

In [8]:
# Keep only the rows whose Dataset label contains "Validation" and print them as a grid.
validation_5_df = results_5_df.loc[results_5_df['Dataset'].str.contains("Validation")]
print(f"{num_folds}-fold validation results:")
print(
    tabulate(
        validation_5_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

5-fold validation results:
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset    |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Validation |      1 |     0.8939 |        0.8253 |        0.9603 | 0.7943 | 0.8925 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      2 |     0.8822 |        0.8201 |        0.9410 | 0.7685 | 0.8837 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      3 |     0.9158 |        0.8810 |        0.9541 | 0.8346 | 0.9073 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      4 |     0.9140 |        0.8850 |        0.9412 | 0.8286 | 0.9237 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|  

In [9]:
# Mean of every numeric metric per (k-mer, Dataset) pair across the folds.
# The averaged fold number carries no meaning, so that column is dropped.
average_5_results = (
    results_5_df
    .groupby(['k-mer', 'Dataset'])
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=['Fold'])
)

# Split the averaged table into its training and validation rows.
average_training_5_results = average_5_results.loc[
    average_5_results['Dataset'].str.contains("Train")
]
average_validation_5_results = average_5_results.loc[
    average_5_results['Dataset'].str.contains("Validation")
]

print(f"Average {num_folds}-fold training results:")
print(
    tabulate(
        average_training_5_results,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

print(f"\nAverage {num_folds}-fold validation results:")
print(
    tabulate(
        average_validation_5_results,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

Average 5-fold training results:
+---------+-----------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |     0.8999 |        0.8477 |        0.9522 | 0.8043 | 0.9015 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       4 | Train     |     0.9067 |        0.8504 |        0.9629 | 0.8185 | 0.9174 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       5 | Train     |     0.8848 |        0.8228 |        0.9468 | 0.7755 | 0.9083 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       6 | Train     |     0.8757 |        0.7978 |        0.9535 | 0.7606 | 0.8963 |
+---------+-----------+------------+---------------+---------------+--------+--------+

Average 5-fold validation results:
+---------+------------+------------+---------------+--------

## **10-fold**

In [10]:
# 10-fold evaluation of the saved identifier checkpoints.
#
# For every k-mer size the saved checkpoint is loaded and scored on the
# train/validation partitions produced by KFold. No training happens in this
# cell, so every fold evaluates the same weights on a different split.
threshold = 0.50  # only reported in the header below; not passed to any call in this cell
num_folds = 10
kmer_values = [3, 4, 5, 6]
model_date = "2025-02-27"
train_predictions_list, test_predictions_list = [], []
train_labels_list, test_labels_list = [], []

print(f"Threshold: {threshold}")
print(f"Identifier model date: {model_date}")
print(f"Number of folds: {num_folds}\n")

results = []
# Fixed seed so the fold assignment is reproducible across runs.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for kmer in kmer_values:
    args.model_path = f"./outputs/identifier_models/{model_date}/{kmer}-mer"
    args.train_data_path = f"./data/enhancer_identification/{kmer}-mer_identification_train.txt"

    # Load dataset
    dataset = load_dataset(args, validation=False)
    dataset_size = len(dataset)
    dataset_indices = np.arange(dataset_size)

    print(f"Processing {kmer}-mer:")

    # The checkpoint path depends only on the k-mer size, so the model is
    # loaded once per k-mer instead of being re-read from disk for every fold.
    model = BertCustomBinaryClassifier.from_pretrained(args.model_path, num_labels=1).to(device)

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_indices)):
        # Create train and validation subsets
        train_subset = [dataset[i] for i in train_idx]
        val_subset = [dataset[i] for i in val_idx]

        # Create data loaders (no shuffling: order is irrelevant for scoring)
        train_dataloader = DataLoader(train_subset, batch_size=args.batch_size, shuffle=False)
        val_dataloader = DataLoader(val_subset, batch_size=args.batch_size, shuffle=False)

        # Train dataset prediction
        train_predictions, train_labels = make_predictions(model, train_dataloader, kmer=kmer)
        train_predictions_list.append(train_predictions)
        train_labels_list.append(train_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(train_predictions, train_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Train",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Validation dataset prediction
        val_predictions, val_labels = make_predictions(model, val_dataloader, kmer=kmer)
        test_predictions_list.append(val_predictions)
        test_labels_list.append(val_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(val_predictions, val_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Validation",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Progress line reports the validation metrics computed just above.
        print(f"Validation {kmer}-mer - Fold {fold + 1}: ACC={acc:.4f}, SN={sn:.4f}, SP={sp:.4f}, MCC={mcc:.4f}, AUC={auc:.4f}")
    print("\n")

Threshold: 0.5
Identifier model date: 2025-02-27
Number of folds: 10

Processing 3-mer:
Validation 3-mer - Fold 1: ACC=0.8788, SN=0.8176, SP=0.9396, MCC=0.7631, AUC=0.8754
Validation 3-mer - Fold 2: ACC=0.9091, SN=0.8333, SP=0.9804, MCC=0.8256, AUC=0.9107
Validation 3-mer - Fold 3: ACC=0.8822, SN=0.8176, SP=0.9463, MCC=0.7705, AUC=0.8786
Validation 3-mer - Fold 4: ACC=0.8822, SN=0.8227, SP=0.9359, MCC=0.7665, AUC=0.8908
Validation 3-mer - Fold 5: ACC=0.9327, SN=0.9091, SP=0.9580, MCC=0.8667, AUC=0.9312
Validation 3-mer - Fold 6: ACC=0.8990, SN=0.8535, SP=0.9500, MCC=0.8032, AUC=0.8830
Validation 3-mer - Fold 7: ACC=0.9158, SN=0.8865, SP=0.9423, MCC=0.8317, AUC=0.9302
Validation 3-mer - Fold 8: ACC=0.9125, SN=0.8836, SP=0.9404, MCC=0.8259, AUC=0.9184
Validation 3-mer - Fold 9: ACC=0.9088, SN=0.8440, SP=0.9677, MCC=0.8217, AUC=0.9192
Validation 3-mer - Fold 10: ACC=0.8784, SN=0.8110, SP=0.9621, MCC=0.7703, AUC=0.8864


Processing 4-mer:
Validation 4-mer - Fold 1: ACC=0.8889, SN=0.8243, S

In [11]:
# Collect the per-fold metric records into a table with a fixed column order.
results_10_df = pd.DataFrame(
    data=results,
    columns=[
        "k-mer", "Dataset", "Fold",
        "Accuracy", "Sensitivity", "Specificity", "MCC", "AUC",
    ],
)

In [12]:
# Keep only the rows whose Dataset label contains "Train" and print them as a grid.
training_10_df = results_10_df.loc[results_10_df['Dataset'].str.contains("Train")]
print(f"{num_folds}-fold training results:")
print(
    tabulate(
        training_10_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

10-fold training results:
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |      1 |     0.9023 |        0.8510 |        0.9536 | 0.8088 | 0.9045 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      2 |     0.8989 |        0.8493 |        0.9489 | 0.8019 | 0.9005 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      3 |     0.9019 |        0.8510 |        0.9528 | 0.8080 | 0.9041 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      4 |     0.9019 |        0.8503 |        0.9541 | 0.8084 | 0.9028 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Tr

In [13]:
# Keep only the rows whose Dataset label contains "Validation" and print them as a grid.
validation_10_df = results_10_df.loc[results_10_df['Dataset'].str.contains("Validation")]
print(f"{num_folds}-fold validation results:")
print(
    tabulate(
        validation_10_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

10-fold validation results:
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset    |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Validation |      1 |     0.8788 |        0.8176 |        0.9396 | 0.7631 | 0.8754 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      2 |     0.9091 |        0.8333 |        0.9804 | 0.8256 | 0.9107 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      3 |     0.8822 |        0.8176 |        0.9463 | 0.7705 | 0.8786 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      4 |     0.8822 |        0.8227 |        0.9359 | 0.7665 | 0.8908 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
| 

In [14]:
# Mean of every numeric metric per (k-mer, Dataset) pair across the folds.
# The averaged fold number carries no meaning, so that column is dropped.
average_10_results = (
    results_10_df
    .groupby(['k-mer', 'Dataset'])
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=['Fold'])
)

# Split the averaged table into its training and validation rows.
average_training_10_results = average_10_results.loc[
    average_10_results['Dataset'].str.contains("Train")
]
average_validation_10_results = average_10_results.loc[
    average_10_results['Dataset'].str.contains("Validation")
]

print(f"Average {num_folds}-fold training results:")
print(
    tabulate(
        average_training_10_results,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

print(f"\nAverage {num_folds}-fold validation results:")
print(
    tabulate(
        average_validation_10_results,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

Average 10-fold training results:
+---------+-----------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |     0.8999 |        0.8477 |        0.9522 | 0.8043 | 0.9015 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       4 | Train     |     0.9067 |        0.8504 |        0.9629 | 0.8185 | 0.9174 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       5 | Train     |     0.8848 |        0.8228 |        0.9468 | 0.7755 | 0.9083 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       6 | Train     |     0.8757 |        0.7978 |        0.9535 | 0.7606 | 0.8963 |
+---------+-----------+------------+---------------+---------------+--------+--------+

Average 10-fold validation results:
+---------+------------+------------+---------------+------