In [11]:
import argparse
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tabulate import tabulate
from sklearn.model_selection import KFold

from model import BertCustomBinaryClassifier
from utils.ensemble_utils import make_predictions
from utils.evaluate_metrics import evaluate_metrics
from utils.data_preprocessing import load_dataset

In [12]:
import logging

# Only show errors from the Hugging Face `transformers` model-loading module.
# The logger name has to match the library's module path exactly; a misspelled
# name silently configures an unrelated logger and suppresses nothing.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [13]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
# Build the run configuration through argparse so it has the same shape as a
# command-line script. Parsing an explicit empty list keeps argparse from
# reading the kernel's own argv, so every option takes its default value.
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=64, type=int, help="")
parser.add_argument("--max_length", default=200, type=int, help="")
args = parser.parse_args([])

# **Cross-validation**

## **5-fold**

In [None]:
# Score each saved k-mer classifier on K-fold splits of its training file.
# NOTE(review): no training call is visible in this cell; the same checkpoint is
# scored on every "Train" and "Validation" split - confirm this is the intended protocol.
# NOTE(review): this cell and the 10-fold cell differ only in num_folds and
# model_date; consider moving the shared loop into a single function.
threshold = 0.50  # NOTE(review): only printed below, never passed to a helper - confirm where it is applied
num_folds = 5
kmer_values = [3, 4, 5, 6]
# NOTE(review): the stored output of this cell reports "2025-02-27"; confirm
# that the "_V1" directory is the intended checkpoint.
model_date = "2025-02-27_V1"
# Per-split predictions and labels, appended in (k-mer, fold) order.
# The "test_*" lists receive the validation-split outputs.
train_predictions_list, test_predictions_list = [], []
train_labels_list, test_labels_list = [], []

print(f"Threshold: {threshold}")
print(f"Classifier model date: {model_date}")
print(f"Number of folds: {num_folds}\n")

# One record (dict) per k-mer / fold / split; turned into a DataFrame in a later cell.
results = []
# Fixed seed: the shuffled index splits are reproducible across runs.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for kmer in kmer_values:
    args.model_path = f"./outputs/classifier_models/{model_date}/{kmer}-mer"
    args.train_data_path = f"./data/enhancer_classification/{kmer}-mer_classification_train.txt"

    # Load dataset
    dataset = load_dataset(args, validation=False)
    dataset_size = len(dataset)
    dataset_indices = np.arange(dataset_size)

    print(f"Processing {kmer}-mer:")

    # Model: the checkpoint depends only on the k-mer size, so it is loaded once
    # per k-mer instead of once per fold.
    # Assumes make_predictions only runs inference and leaves the weights
    # untouched - TODO confirm.
    model = BertCustomBinaryClassifier.from_pretrained(args.model_path, num_labels=1).to(device)

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_indices)):
        # Create train and validation subsets
        train_subset = [dataset[i] for i in train_idx]
        val_subset = [dataset[i] for i in val_idx]

        # Create data loaders (no shuffling: predictions stay aligned with the subset order)
        train_dataloader = DataLoader(train_subset, batch_size=args.batch_size, shuffle=False)
        val_dataloader = DataLoader(val_subset, batch_size=args.batch_size, shuffle=False)

        # Train dataset prediction
        train_predictions, train_labels = make_predictions(model, train_dataloader, kmer=kmer)
        train_predictions_list.append(train_predictions)
        train_labels_list.append(train_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(train_predictions, train_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Train",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Validation dataset prediction
        val_predictions, val_labels = make_predictions(model, val_dataloader, kmer=kmer)
        test_predictions_list.append(val_predictions)
        test_labels_list.append(val_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(val_predictions, val_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Validation",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Only the validation metrics are echoed per fold; the training metrics are kept in `results`.
        print(f"Validation {kmer}-mer - Fold {fold + 1}: ACC={acc:.4f}, SN={sn:.4f}, SP={sp:.4f}, MCC={mcc:.4f}, AUC={auc:.4f}")
    print("\n")

Threshold: 0.5
Classifier model date: 2025-02-27
Number of folds: 5

Processing 3-mer:
Validation 3-mer - Fold 1: ACC=0.8418, SN=0.7683, SP=0.9323, MCC=0.6997, AUC=0.8790
Validation 3-mer - Fold 2: ACC=0.9057, SN=0.8284, SP=0.9693, MCC=0.8136, AUC=0.9346
Validation 3-mer - Fold 3: ACC=0.8990, SN=0.8582, SP=0.9359, MCC=0.7986, AUC=0.9086
Validation 3-mer - Fold 4: ACC=0.8956, SN=0.8298, SP=0.9551, MCC=0.7948, AUC=0.9309
Validation 3-mer - Fold 5: ACC=0.9020, SN=0.8704, SP=0.9403, MCC=0.8071, AUC=0.9338


Processing 4-mer:
Validation 4-mer - Fold 1: ACC=0.8822, SN=0.8354, SP=0.9398, MCC=0.7712, AUC=0.8938
Validation 4-mer - Fold 2: ACC=0.9428, SN=0.9030, SP=0.9755, MCC=0.8854, AUC=0.9649
Validation 4-mer - Fold 3: ACC=0.9024, SN=0.8582, SP=0.9423, MCC=0.8057, AUC=0.8990
Validation 4-mer - Fold 4: ACC=0.9192, SN=0.8511, SP=0.9808, MCC=0.8433, AUC=0.9330
Validation 4-mer - Fold 5: ACC=0.9088, SN=0.8765, SP=0.9478, MCC=0.8206, AUC=0.9180


Processing 5-mer:
Validation 5-mer - Fold 1: ACC=0.

In [16]:
# Collect the per-fold records into one table with a fixed column order.
results_5_df = pd.DataFrame(
    data=results,
    columns=[
        "k-mer", "Dataset", "Fold",
        "Accuracy", "Sensitivity", "Specificity", "MCC", "AUC",
    ],
)

In [17]:
# Keep the rows whose "Dataset" label contains "Train" and print them as a grid table.
is_train_row = results_5_df["Dataset"].str.contains("Train")
training_5_df = results_5_df[is_train_row]
print(f"{num_folds}-fold training results:")
print(
    tabulate(
        training_5_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

5-fold training results:
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |      1 |     0.9006 |        0.8478 |        0.9507 | 0.8044 | 0.9267 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      2 |     0.8846 |        0.8306 |        0.9413 | 0.7749 | 0.9119 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      3 |     0.8863 |        0.8236 |        0.9505 | 0.7794 | 0.9184 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      4 |     0.8871 |        0.8303 |        0.9454 | 0.7800 | 0.9125 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Tra

In [18]:
# Keep the rows whose "Dataset" label contains "Validation" and print them as a grid table.
is_validation_row = results_5_df["Dataset"].str.contains("Validation")
validation_5_df = results_5_df[is_validation_row]
print(f"{num_folds}-fold validation results:")
print(
    tabulate(
        validation_5_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

5-fold validation results:
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset    |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Validation |      1 |     0.8418 |        0.7683 |        0.9323 | 0.6997 | 0.8790 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      2 |     0.9057 |        0.8284 |        0.9693 | 0.8136 | 0.9346 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      3 |     0.8990 |        0.8582 |        0.9359 | 0.7986 | 0.9086 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      4 |     0.8956 |        0.8298 |        0.9551 | 0.7948 | 0.9309 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|  

In [19]:
# Average every metric over the folds, separately for each k-mer size and split.
average_5_results = (
    results_5_df
    .groupby(["k-mer", "Dataset"])
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=["Fold"])  # the averaged fold number is not a metric
)

# Training-split averages
average_training_5_results = average_5_results[
    average_5_results["Dataset"].str.contains("Train")
]
# Validation-split averages
average_validation_5_results = average_5_results[
    average_5_results["Dataset"].str.contains("Validation")
]

table_options = dict(headers="keys", tablefmt="grid", showindex=False, floatfmt=".4f")

print(f"Average {num_folds}-fold training results:")
print(tabulate(average_training_5_results, **table_options))

print(f"\nAverage {num_folds}-fold validation results:")
print(tabulate(average_validation_5_results, **table_options))

Average 5-fold training results:
+---------+-----------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |     0.8888 |        0.8302 |        0.9474 | 0.7830 | 0.9163 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       4 | Train     |     0.9111 |        0.8639 |        0.9582 | 0.8258 | 0.9218 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       5 | Train     |     0.8982 |        0.8330 |        0.9636 | 0.8034 | 0.8944 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       6 | Train     |     0.9043 |        0.8424 |        0.9663 | 0.8149 | 0.8987 |
+---------+-----------+------------+---------------+---------------+--------+--------+

Average 5-fold validation results:
+---------+------------+------------+---------------+--------

## **10-fold**

In [25]:
# Score each saved k-mer classifier on K-fold splits of its training file.
# NOTE(review): no training call is visible in this cell; the same checkpoint is
# scored on every "Train" and "Validation" split - confirm this is the intended protocol.
# NOTE(review): this cell and the 5-fold cell differ only in num_folds and
# model_date; consider moving the shared loop into a single function.
threshold = 0.50  # NOTE(review): only printed below, never passed to a helper - confirm where it is applied
num_folds = 10
kmer_values = [3, 4, 5, 6]
model_date = "2025-02-27"
# Per-split predictions and labels, appended in (k-mer, fold) order.
# The "test_*" lists receive the validation-split outputs.
train_predictions_list, test_predictions_list = [], []
train_labels_list, test_labels_list = [], []

print(f"Threshold: {threshold}")
print(f"Classifier model date: {model_date}")
print(f"Number of folds: {num_folds}\n")

# One record (dict) per k-mer / fold / split; turned into a DataFrame in a later cell.
results = []
# Fixed seed: the shuffled index splits are reproducible across runs.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for kmer in kmer_values:
    args.model_path = f"./outputs/classifier_models/{model_date}/{kmer}-mer"
    args.train_data_path = f"./data/enhancer_classification/{kmer}-mer_classification_train.txt"

    # Load dataset
    dataset = load_dataset(args, validation=False)
    dataset_size = len(dataset)
    dataset_indices = np.arange(dataset_size)

    print(f"Processing {kmer}-mer:")

    # Model: the checkpoint depends only on the k-mer size, so it is loaded once
    # per k-mer instead of once per fold.
    # Assumes make_predictions only runs inference and leaves the weights
    # untouched - TODO confirm.
    model = BertCustomBinaryClassifier.from_pretrained(args.model_path, num_labels=1).to(device)

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_indices)):
        # Create train and validation subsets
        train_subset = [dataset[i] for i in train_idx]
        val_subset = [dataset[i] for i in val_idx]

        # Create data loaders (no shuffling: predictions stay aligned with the subset order)
        train_dataloader = DataLoader(train_subset, batch_size=args.batch_size, shuffle=False)
        val_dataloader = DataLoader(val_subset, batch_size=args.batch_size, shuffle=False)

        # Train dataset prediction
        train_predictions, train_labels = make_predictions(model, train_dataloader, kmer=kmer)
        train_predictions_list.append(train_predictions)
        train_labels_list.append(train_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(train_predictions, train_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Train",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Validation dataset prediction
        val_predictions, val_labels = make_predictions(model, val_dataloader, kmer=kmer)
        test_predictions_list.append(val_predictions)
        test_labels_list.append(val_labels)

        acc, sn, sp, mcc, auc = evaluate_metrics(val_predictions, val_labels)
        results.append({"k-mer": kmer, "Fold": fold + 1, "Dataset": "Validation",
                        "Accuracy": acc, "Sensitivity": sn, "Specificity": sp, "MCC": mcc, "AUC": auc})

        # Only the validation metrics are echoed per fold; the training metrics are kept in `results`.
        print(f"Validation {kmer}-mer - Fold {fold + 1}: ACC={acc:.4f}, SN={sn:.4f}, SP={sp:.4f}, MCC={mcc:.4f}, AUC={auc:.4f}")
    print("\n")

Threshold: 0.5
Classifier model date: 2025-02-27
Number of folds: 10

Processing 3-mer:
Validation 3-mer - Fold 1: ACC=0.8456, SN=0.7750, SP=0.9275, MCC=0.7042, AUC=0.8964
Validation 3-mer - Fold 2: ACC=0.8389, SN=0.7619, SP=0.9385, MCC=0.6973, AUC=0.8595
Validation 3-mer - Fold 3: ACC=0.8993, SN=0.8281, SP=0.9529, MCC=0.7955, AUC=0.9208
Validation 3-mer - Fold 4: ACC=0.9128, SN=0.8333, SP=0.9870, MCC=0.8337, AUC=0.9518
Validation 3-mer - Fold 5: ACC=0.9189, SN=0.8571, SP=0.9647, MCC=0.8351, AUC=0.9235
Validation 3-mer - Fold 6: ACC=0.8784, SN=0.8553, SP=0.9028, MCC=0.7580, AUC=0.8862
Validation 3-mer - Fold 7: ACC=0.8986, SN=0.8308, SP=0.9518, MCC=0.7956, AUC=0.9234
Validation 3-mer - Fold 8: ACC=0.8919, SN=0.8289, SP=0.9583, MCC=0.7916, AUC=0.9368
Validation 3-mer - Fold 9: ACC=0.8919, SN=0.8706, SP=0.9206, MCC=0.7842, AUC=0.9197
Validation 3-mer - Fold 10: ACC=0.9122, SN=0.8701, SP=0.9577, MCC=0.8284, AUC=0.9493


Processing 4-mer:
Validation 4-mer - Fold 1: ACC=0.9060, SN=0.8375, S

In [26]:
# Collect the per-fold records into one table with a fixed column order.
results_10_df = pd.DataFrame(
    data=results,
    columns=[
        "k-mer", "Dataset", "Fold",
        "Accuracy", "Sensitivity", "Specificity", "MCC", "AUC",
    ],
)

In [27]:
# Keep the rows whose "Dataset" label contains "Train" and print them as a grid table.
is_train_row = results_10_df["Dataset"].str.contains("Train")
training_10_df = results_10_df[is_train_row]
print(f"{num_folds}-fold training results:")
print(
    tabulate(
        training_10_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

10-fold training results:
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |      1 |     0.8936 |        0.8369 |        0.9495 | 0.7919 | 0.9187 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      2 |     0.8944 |        0.8389 |        0.9483 | 0.7929 | 0.9229 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      3 |     0.8876 |        0.8304 |        0.9467 | 0.7813 | 0.9160 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Train     |      4 |     0.8861 |        0.8299 |        0.9429 | 0.7774 | 0.9124 |
+---------+-----------+--------+------------+---------------+---------------+--------+--------+
|       3 | Tr

In [28]:
# Keep the rows whose "Dataset" label contains "Validation" and print them as a grid table.
is_validation_row = results_10_df["Dataset"].str.contains("Validation")
validation_10_df = results_10_df[is_validation_row]
print(f"{num_folds}-fold validation results:")
print(
    tabulate(
        validation_10_df,
        headers="keys",
        tablefmt="grid",
        showindex=False,
        floatfmt=".4f",
    )
)

10-fold validation results:
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset    |   Fold |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Validation |      1 |     0.8456 |        0.7750 |        0.9275 | 0.7042 | 0.8964 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      2 |     0.8389 |        0.7619 |        0.9385 | 0.6973 | 0.8595 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      3 |     0.8993 |        0.8281 |        0.9529 | 0.7955 | 0.9208 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
|       3 | Validation |      4 |     0.9128 |        0.8333 |        0.9870 | 0.8337 | 0.9518 |
+---------+------------+--------+------------+---------------+---------------+--------+--------+
| 

In [29]:
# Average every metric over the folds, separately for each k-mer size and split.
average_10_results = (
    results_10_df
    .groupby(["k-mer", "Dataset"])
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=["Fold"])  # the averaged fold number is not a metric
)

# Training-split averages
average_training_10_results = average_10_results[
    average_10_results["Dataset"].str.contains("Train")
]
# Validation-split averages
average_validation_10_results = average_10_results[
    average_10_results["Dataset"].str.contains("Validation")
]

table_options = dict(headers="keys", tablefmt="grid", showindex=False, floatfmt=".4f")

print(f"Average {num_folds}-fold training results:")
print(tabulate(average_training_10_results, **table_options))

print(f"\nAverage {num_folds}-fold validation results:")
print(tabulate(average_validation_10_results, **table_options))

Average 10-fold training results:
+---------+-----------+------------+---------------+---------------+--------+--------+
|   k-mer | Dataset   |   Accuracy |   Sensitivity |   Specificity |    MCC |    AUC |
|       3 | Train     |     0.8888 |        0.8302 |        0.9474 | 0.7830 | 0.9162 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       4 | Train     |     0.9111 |        0.8639 |        0.9582 | 0.8258 | 0.9218 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       5 | Train     |     0.8982 |        0.8329 |        0.9636 | 0.8034 | 0.8944 |
+---------+-----------+------------+---------------+---------------+--------+--------+
|       6 | Train     |     0.9043 |        0.8423 |        0.9663 | 0.8149 | 0.8987 |
+---------+-----------+------------+---------------+---------------+--------+--------+

Average 10-fold validation results:
+---------+------------+------------+---------------+------