In [None]:
# Mount Google Drive in the Colab runtime; every data and log path used below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tensorflow



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import math
import os
import time
import pandas as pd


# Root directory on the mounted Drive; TensorBoard logs, the results CSV and the
# saved-model directory are all created beneath it.
LOG_DIR = '/content/drive/MyDrive/1149108/msCNN-ETC/Logs'
os.makedirs(LOG_DIR, exist_ok=True)  # no error if the directory already exists

In [None]:
# Load the pre-split train/test arrays from Drive.
X_train, y_train = (
    np.load('/content/drive/MyDrive/1149108/msCNN-ETC/X_train.npy'),
    np.load('/content/drive/MyDrive/1149108/msCNN-ETC/y_train.npy'),
)
X_test, y_test = (
    np.load('/content/drive/MyDrive/1149108/msCNN-ETC/X_test.npy'),
    np.load('/content/drive/MyDrive/1149108/msCNN-ETC/y_test.npy'),
)

# Report the shape of each array, one per line.
print(
    "data shape:",
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"X_test: {X_test.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Insert a singleton axis at position 1: (samples, seq_len, embed_dim) ->
# (samples, 1, seq_len, embed_dim), matching the model's 4-D input.
# NOTE(review): under Keras' default channels_last data format this is read as
# height=1, width=seq_len, channels=embed_dim - confirm no custom image_data_format is set.
#
# The ndim guard makes the cell idempotent: re-running it on arrays that are
# already 4-D would otherwise silently add a second singleton axis.
if X_train.ndim == 3:
    X_train = X_train[:, np.newaxis, :, :]
if X_test.ndim == 3:
    X_test = X_test[:, np.newaxis, :, :]

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (6418, 1, 1022, 1280)
X_test shape: (1606, 1, 1022, 1280)


In [None]:
# Show the storage dtype of the training features (the saved run printed float16).
print(X_train.dtype)

float16


# Cross-Validation

In [None]:
import os
import time
import math

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from tensorflow.keras.saving import register_keras_serializable

# === Hyperparameters ===
SEED            = 42                          # global RNG seed for reproducible runs
BATCH_SIZE      = 64                          # mini-batch size for fit() and predict()
NUM_CLASSES     = 1                           # units in the final sigmoid layer
EPOCHS          = 20                          # upper bound on training epochs
NUM_FILTERS     = 256                         # filters per convolution branch
NUM_HIDDEN      = 1024                        # units in the hidden dense layer
WINDOW_SIZES    = [8, 12, 16, 20, 24, 28]     # one convolution branch per kernel width
MAX_SEQ_LENGTH  = 1022                        # second-to-last axis of the loaded arrays
EMBEDDING_WIDTH = 1280                        # last axis of the loaded arrays

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# dropout masks and batch shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(SEED)

# Directory for every model file written by this configuration; the name encodes
# the window sizes, filter count and hidden size.
LOG_MODEL = os.path.join(LOG_DIR, f'MODELS_{WINDOW_SIZES}_{NUM_FILTERS}F_{NUM_HIDDEN}H')
os.makedirs(LOG_MODEL, exist_ok=True)

@register_keras_serializable()
def DeepScan(input_shape=(1, MAX_SEQ_LENGTH, EMBEDDING_WIDTH),
             window_sizes=WINDOW_SIZES,
             num_filters=NUM_FILTERS,
             num_hidden=NUM_HIDDEN,
             num_classes=NUM_CLASSES):
    """Build the multi-window separable-convolution classifier.

    For every window size a ``SeparableConv2D`` branch with kernel ``(1, ws)``
    is applied to the input, max-pooled over all remaining positions and
    flattened. The branches are concatenated, passed through dropout (0.8),
    a ReLU dense layer named ``fc1`` and a final sigmoid dense layer.

    Args:
        input_shape: Per-sample input shape (batch axis excluded).
            ``input_shape[1]`` is the length the kernels slide over.
            NOTE(review): assumes Keras' default ``channels_last`` data format.
        window_sizes: Iterable of kernel widths, one branch each.
        num_filters: Number of filters in every convolution branch.
        num_hidden: Number of units in the ``fc1`` dense layer.
        num_classes: Number of sigmoid output units.

    Returns:
        An uncompiled ``tf.keras.Model`` named ``'DeepScan'``.

    Raises:
        ValueError: If a window size is larger than the sequence length.
    """
    # Derive the pooled length from the requested input shape rather than the
    # module constant, so a non-default ``input_shape`` still pools over the
    # whole convolution output. Fall back to the constant when it is unknown.
    seq_len = input_shape[1]
    if seq_len is None:
        seq_len = MAX_SEQ_LENGTH

    inputs = tf.keras.Input(shape=input_shape)

    branches = []
    for ws in window_sizes:
        if ws > seq_len:
            raise ValueError(
                f"window size {ws} exceeds sequence length {seq_len}"
            )

        x = layers.SeparableConv2D(
            filters=num_filters,
            kernel_size=(1, ws),
            strides=(1, 1),
            activation='relu',
            padding='valid',
            depthwise_regularizer=tf.keras.regularizers.l2(1e-4),
            pointwise_regularizer=tf.keras.regularizers.l2(1e-4),
            depthwise_initializer='glorot_uniform',
            pointwise_initializer='glorot_uniform'
        )(inputs)

        # A 'valid' convolution leaves seq_len - ws + 1 positions; a pool of
        # exactly that width reduces each filter to a single value.
        x = layers.MaxPooling2D(
            pool_size=(1, seq_len - ws + 1),
            strides=(1, 1),
            padding='valid'
        )(x)

        x = layers.Flatten()(x)
        branches.append(x)

    x = layers.Concatenate()(branches)
    x = layers.Dropout(0.8)(x)
    x = layers.Dense(num_hidden, activation='relu', name='fc1')(x)
    outputs = layers.Dense(num_classes, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='DeepScan')
    return model

# === Callback on CV folds ===
class MetricsCallback(tf.keras.callbacks.Callback):
    """Append per-epoch validation metrics for one CV fold to ``results``.

    After every epoch the model predicts on the held-out fold, labels are
    obtained by thresholding the probabilities at 0.5, and one row is appended
    to the module-level ``results`` DataFrame (which must already exist with
    16 columns in the order written below).

    Args:
        X_val: Validation features passed to ``model.predict``.
        y_val: Binary (0/1) ground-truth labels for ``X_val``.
        fold: Fold identifier stored in the ``Fold`` column.
    """

    def __init__(self, X_val, y_val, fold):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.fold  = fold
        self.fold_start_time = time.time()

    def on_epoch_begin(self, epoch, logs=None):
        """Start the wall-clock timer for this epoch."""
        self.epoch_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the validation fold and append one row to ``results``."""
        y_pred_probs  = self.model.predict(self.X_val, batch_size=BATCH_SIZE, verbose=0).ravel()
        y_pred_labels = (y_pred_probs >= 0.5).astype(int)

        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
        # the labels and predictions, so ravel() always yields TN, FP, FN, TP
        # and a degenerate epoch (e.g. all-positive predictions) is counted in
        # the right cell.
        cm = metrics.confusion_matrix(self.y_val, y_pred_labels, labels=[0, 1])
        TN, FP, FN, TP = cm.ravel()

        # Every ratio falls back to 0 when its denominator is 0.
        Sens = TP/(TP+FN) if TP+FN>0 else 0
        Spec = TN/(TN+FP) if TN+FP>0 else 0
        Acc  = (TP+TN)/(TP+FP+TN+FN) if TP+FP+TN+FN>0 else 0
        denom = (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)
        MCC = (TP*TN - FP*FN)/math.sqrt(denom) if denom>0 else 0
        F1  = 2*TP/(2*TP + FP + FN) if 2*TP+FP+FN>0 else 0

        # Per-epoch AUC on the validation set
        fpr, tpr, _ = roc_curve(self.y_val, y_pred_probs)
        roc_auc = auc(fpr, tpr)

        # Measured after predict(), so it includes this callback's inference time.
        epoch_time = time.time() - self.epoch_start
        # Train_Time is left as None here; the training loop fills it in per fold.
        results.loc[len(results)] = [
            'CV', self.fold, epoch+1, TP, FP, TN, FN,
            Sens, Spec, Acc, MCC, F1, roc_auc,
            None, epoch_time, self.model.count_params()
        ]

class SaveEveryEpochCallback(tf.keras.callbacks.Callback):
    """Write the full model to disk at the end of every epoch.

    Files go to ``<base_dir>/<stage>/fold_<fold>/`` when a fold is given and to
    ``<base_dir>/<stage>/`` otherwise; the directory is created on construction.

    Args:
        base_dir: Root directory for the saved models.
        stage: Label used as sub-directory and file-name prefix.
        fold: Optional fold identifier; ``None`` omits the fold component.
    """

    def __init__(self, base_dir, stage='CV', fold=None):
        super().__init__()
        self.base_dir = base_dir
        self.stage = stage
        self.fold = fold

        # An empty last component keeps the path at <base_dir>/<stage>/.
        if fold is not None:
            fold_component = f'fold_{fold}'
        else:
            fold_component = ''
        self.sub_dir = os.path.join(base_dir, stage, fold_component)
        os.makedirs(self.sub_dir, exist_ok=True)

    def on_epoch_end(self, epoch, logs=None):
        """Save the model under a zero-padded, 1-based epoch file name."""
        if self.fold is None:
            filename = f"{self.stage}_epoch{epoch+1:02d}.keras"
        else:
            filename = f"{self.stage}_fold{self.fold}_epoch{epoch+1:02d}.keras"

        path = os.path.join(self.sub_dir, filename)
        self.model.save(path)
        print(f"[Saved model] Epoch {epoch+1} saved to {path}")

# === Callback on independent test ===
class FinalMetricsCallback(tf.keras.callbacks.Callback):
    """Callback stub for the independent-test stage.

    Keeps the test arrays and the start time of the current epoch. Nothing is
    computed or recorded at epoch end.
    """

    def __init__(self, X_test, y_test):
        super().__init__()
        self.X_test, self.y_test = X_test, y_test
        self.epoch_start_time = time.time()

    def on_epoch_begin(self, epoch, logs=None):
        """Restart the epoch timer."""
        self.epoch_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """No-op."""
        return None

# === Results table schema ===
# Column order matters: rows are appended positionally as plain lists.
results_columns = (
    ['Stage', 'Fold', 'Epoch']                          # row identity
    + ['TP', 'FP', 'TN', 'FN']                          # confusion-matrix counts
    + ['Sens', 'Spec', 'Acc', 'MCC', 'F1', 'AUC']       # derived metrics
    + ['Train_Time', 'Epoch_Time', 'Total_Params']      # timing and model size
)
# Empty frame that the callbacks and evaluation code append to.
results = pd.DataFrame(columns=results_columns)

# =========================================================
# 1) 5-Fold Cross-Validation 
#   
# =========================================================
# Number of stratified folds; also shown in the per-fold banner.
N_SPLITS = 5

# 1-based epoch with the highest val_auc in each fold; the final-training step
# takes the median of this list as its epoch budget.
best_epochs = []

kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    print(f"\n=== Fold {fold}/{N_SPLITS} ===")

    # A fresh model is built on every iteration; reset Keras' global state first
    # so state left by the previous fold's model does not accumulate.
    tf.keras.backend.clear_session()

    X_train_fold, X_val = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val = y_train[train_idx], y_train[val_idx]

    model = DeepScan(window_sizes=WINDOW_SIZES,
                     num_filters=NUM_FILTERS,
                     num_hidden=NUM_HIDDEN)
    model.build((None, 1, MAX_SEQ_LENGTH, EMBEDDING_WIDTH))
    model.summary()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    # Callbacks: custom per-epoch metrics, TensorBoard logging, early stopping
    # on val_auc, a snapshot after every epoch, and a best-val_auc checkpoint.
    metrics_cb = MetricsCallback(X_val, y_val, fold)
    tb_cb = tf.keras.callbacks.TensorBoard(log_dir=os.path.join(LOG_DIR, f'fold_{fold}'))
    early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=5, restore_best_weights=True, mode='max')
    save_cb = SaveEveryEpochCallback(base_dir=LOG_MODEL, stage='CV', fold=fold)
    checkpoint_path = os.path.join(LOG_MODEL, f'best_model__{WINDOW_SIZES}_{NUM_FILTERS}F_{NUM_HIDDEN}H_CV_{fold}.keras')
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_auc', save_best_only=True, mode='max')

    start_fold = time.time()
    history = model.fit(
        X_train_fold, y_train_fold,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        callbacks=[metrics_cb, tb_cb, early_stop_cb, save_cb, checkpoint_cb],
        verbose=1,
    )
    train_time = time.time() - start_fold
    # Back-fill the fold's wall-clock training time on the rows MetricsCallback
    # appended with Train_Time=None.
    results.loc[(results.Stage=='CV') & (results.Fold==fold), 'Train_Time'] = train_time

    val_auc_list = history.history.get('val_auc', None)
    if val_auc_list is None or len(val_auc_list) == 0:
        raise RuntimeError("No val_auc found in training history.")
    # argmax is 0-based; epochs are reported 1-based.
    best_epoch_fold = int(np.argmax(val_auc_list) + 1)
    best_epochs.append(best_epoch_fold)
    print(f"[Fold {fold}] Best epoch by val_auc = {best_epoch_fold}, val_auc = {val_auc_list[best_epoch_fold-1]:.4f}")

# Write the cross-validation metrics to a CSV whose name encodes the configuration.
results_filename = f'training_results_{WINDOW_SIZES}_{NUM_FILTERS}F_{NUM_HIDDEN}H.csv'
results_path = os.path.join(LOG_DIR, results_filename)

results.to_csv(results_path, index=False)
print(f"\nCV results saved to {results_path}")

# =========================================================
# Final training on full TRAIN set with selected epochs
# =========================================================
# Epoch budget: median of the per-fold best epochs (truncated to int), or EPOCHS
# when no fold produced a result.
if len(best_epochs) > 0:
    final_epochs = int(np.median(best_epochs))
else:
    final_epochs = EPOCHS
final_epochs = max(1, min(final_epochs, EPOCHS))  # keep within [1, EPOCHS]
print(f"\nSelected final_epochs (median best epochs from CV) = {final_epochs}")
print(f"best_epochs per fold = {best_epochs}")

# Fresh model with the same architecture and compile settings as the CV folds.
final_model = DeepScan(
    window_sizes=WINDOW_SIZES,
    num_filters=NUM_FILTERS,
    num_hidden=NUM_HIDDEN,
)
final_model.build((None, 1, MAX_SEQ_LENGTH, EMBEDDING_WIDTH))
final_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

# No fold is given, so snapshots are written under <LOG_MODEL>/Independent/.
save_cb = SaveEveryEpochCallback(base_dir=LOG_MODEL, stage='Independent')

# Train on the whole training set (no validation split) and time the run.
start_final = time.time()
history = final_model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=final_epochs,
    callbacks=[save_cb],
    verbose=1,
)
final_train_time = time.time() - start_final

final_model_path = os.path.join(
    LOG_MODEL,
    f'final_model__{WINDOW_SIZES}_{NUM_FILTERS}F_{NUM_HIDDEN}H_epochs{final_epochs}.keras',
)
final_model.save(final_model_path)
print(f"\nFinal model saved to {final_model_path}")

# =========================================================
#Independent evaluation
# =========================================================
# Score the held-out test set with the final model and time the inference pass.
t0 = time.time()
y_pred_probs  = final_model.predict(X_test, batch_size=BATCH_SIZE, verbose=0).ravel()
epoch_time = time.time() - t0  # prediction wall-clock time; stored in the Epoch_Time column
y_pred_labels = (y_pred_probs >= 0.5).astype(int)

# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
# labels and predictions, so ravel() always yields TN, FP, FN, TP and a
# degenerate result (e.g. all-positive predictions) lands in the right cell.
cm = metrics.confusion_matrix(y_test, y_pred_labels, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()

# Every ratio falls back to 0 when its denominator is 0.
Sens = TP/(TP+FN) if TP+FN>0 else 0
Spec = TN/(TN+FP) if TN+FP>0 else 0
Acc  = (TP+TN)/(TP+FP+TN+FN) if TP+FP+TN+FN>0 else 0
denom = (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)
MCC = (TP*TN - FP*FN)/math.sqrt(denom) if denom>0 else 0
F1  = 2*TP/(2*TP + FP + FN) if 2*TP+FP+FN>0 else 0

# Threshold-independent AUC computed from the raw probabilities.
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

# Append the single independent-test row after the CV rows.
results.loc[len(results)] = [
    'Independent', 'Final', final_epochs, TP, FP, TN, FN,
    Sens, Spec, Acc, MCC, F1, roc_auc,
    final_train_time, epoch_time, final_model.count_params()
]

# Save final results
results.to_csv(results_path, index=False)
print(f"\nFinal results saved to {results_path}")

print("\n=== Independent Test (X_test) ===")
print(f"AUC={roc_auc:.4f} | Acc={Acc:.4f} | Sens={Sens:.4f} | Spec={Spec:.4f} | MCC={MCC:.4f} | F1={F1:.4f}")



=== Fold 1/5 ===


Epoch 1/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7953 - auc: 0.5806 - loss: 0.8449[Saved model] Epoch 1 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_1/CV_fold1_epoch01.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 2s/step - accuracy: 0.7957 - auc: 0.5815 - loss: 0.8436 - val_accuracy: 0.9307 - val_auc: 0.9779 - val_loss: 0.5119
Epoch 2/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8767 - auc: 0.8138 - loss: 0.5928[Saved model] Epoch 2 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_1/CV_fold1_epoch02.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 2s/step - accuracy: 0.8768 - auc: 0.8142 - loss: 0.5924 - val_accuracy: 0.9579 - val_auc: 0.9885 - val_loss: 0.4271
Epoch 3/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━

Epoch 1/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8114 - auc: 0.6001 - loss: 0.7831[Saved model] Epoch 1 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_2/CV_fold2_epoch01.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 2s/step - accuracy: 0.8116 - auc: 0.6009 - loss: 0.7823 - val_accuracy: 0.8754 - val_auc: 0.9755 - val_loss: 0.5061
Epoch 2/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8695 - auc: 0.8206 - loss: 0.5901[Saved model] Epoch 2 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_2/CV_fold2_epoch02.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 2s/step - accuracy: 0.8697 - auc: 0.8209 - loss: 0.5898 - val_accuracy: 0.9291 - val_auc: 0.9880 - val_loss: 0.4120
Epoch 3/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━

Epoch 1/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7783 - auc: 0.5734 - loss: 1.0228[Saved model] Epoch 1 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_3/CV_fold3_epoch01.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 2s/step - accuracy: 0.7787 - auc: 0.5741 - loss: 1.0203 - val_accuracy: 0.8629 - val_auc: 0.9326 - val_loss: 0.5617
Epoch 2/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8750 - auc: 0.8311 - loss: 0.5746[Saved model] Epoch 2 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_3/CV_fold3_epoch02.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 2s/step - accuracy: 0.8750 - auc: 0.8314 - loss: 0.5744 - val_accuracy: 0.8988 - val_auc: 0.9752 - val_loss: 0.4963
Epoch 3/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━

Epoch 1/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8140 - auc: 0.6089 - loss: 0.8181[Saved model] Epoch 1 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_4/CV_fold4_epoch01.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 2s/step - accuracy: 0.8142 - auc: 0.6098 - loss: 0.8171 - val_accuracy: 0.9314 - val_auc: 0.9577 - val_loss: 0.5118
Epoch 2/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8683 - auc: 0.8346 - loss: 0.5837[Saved model] Epoch 2 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_4/CV_fold4_epoch02.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 2s/step - accuracy: 0.8685 - auc: 0.8349 - loss: 0.5833 - val_accuracy: 0.9470 - val_auc: 0.9732 - val_loss: 0.4147
Epoch 3/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━

Epoch 1/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7923 - auc: 0.5907 - loss: 0.9030[Saved model] Epoch 1 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_5/CV_fold5_epoch01.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 2s/step - accuracy: 0.7927 - auc: 0.5916 - loss: 0.9013 - val_accuracy: 0.8893 - val_auc: 0.9605 - val_loss: 0.5015
Epoch 2/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8740 - auc: 0.8242 - loss: 0.5805[Saved model] Epoch 2 saved to /content/drive/MyDrive/1136052/mCNN-ETC/Log_New_20-06-2025/MODELS_[8, 12, 16, 20, 24, 28]_256F_1024H/CV/fold_5/CV_fold5_epoch02.keras
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 2s/step - accuracy: 0.8740 - auc: 0.8246 - loss: 0.5802 - val_accuracy: 0.9439 - val_auc: 0.9832 - val_loss: 0.4105
Epoch 3/20
[1m81/81[0m [32m━━━━━━━━━━━━━━━━