# Deep Learning Baseline: 1D CNN

## Load HG38 Dataset

In [1]:
import os
from google.colab import drive
# Attach Google Drive to the Colab filesystem so the project files are reachable.
drive.mount('/content/drive')

# Project root on Drive; the datasets live in its "data" sub-folder.
PROJECT_DIR = "/content/drive/MyDrive/bioproj01"
DATA_DIR = os.path.join(PROJECT_DIR, "data")

# Listing the project root doubles as a check that the mount worked
# (os.listdir raises if the folder is missing).
project_contents = os.listdir(PROJECT_DIR)
print(f"Data directory found: {PROJECT_DIR}\nContents: {project_contents}")

Mounted at /content/drive
Data directory found: /content/drive/MyDrive/bioproj01
Contents: ['data']


In [2]:
import pandas as pd

In [3]:
# Path of the hg38 CSV inside DATA_DIR.
hg38_csv_path = f"{DATA_DIR}/hg38/human_promoter_vs_nonpromoter_10k_400bp.csv"

# Read the table and keep only the two columns used downstream.
hg38_df = pd.read_csv(hg38_csv_path).loc[:, ["sequence", "label"]]
hg38_df.head()

Unnamed: 0,sequence,label
0,TGCATATTATTTTATATGCATCTATTTTGAATCTTCATAAATGTAA...,0
1,GGCCCAGCTCTGACGCCAGGCTGTCTTGCCTCTGCTCACCTGCAGC...,1
2,TCATGCCTGGCCAGCAAAATTGTTTTTTAAAAGTTTATGCTACTAA...,1
3,TGCCTGGTTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCAT...,0
4,AATAATTGAAATAAGCTTAATAAATGGGCTCAAAAGAATGAAAGAG...,0


## One-Hot Encode DNA Sequences

In [4]:
import numpy as np

In [5]:
# Column index assigned to each nucleotide in the one-hot matrix.
BASE2IDX = {
    "A": 0,
    "C": 1,
    "G": 2,
    "T": 3
}

def one_hot_encode(seq):
    """One-hot encode a DNA sequence.

    Matching is case-insensitive, so lowercase bases (as used for soft-masked
    regions in genome FASTA files) are encoded like their uppercase
    counterparts instead of being silently dropped.

    Parameters
    ----------
    seq : str
        Nucleotide sequence. May be empty.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seq), 4)`` and dtype ``float32``. Row ``i`` has a
        single 1.0 in the column given by ``BASE2IDX`` for ``seq[i]``. Symbols
        that are not in ``BASE2IDX`` (e.g. ``N``) produce an all-zero row.
    """
    # Accept both cases of every known base; built per call so it always
    # reflects the current contents of BASE2IDX.
    lookup = {}
    for known_base, column in BASE2IDX.items():
        lookup[known_base.upper()] = column
        lookup[known_base.lower()] = column

    arr = np.zeros((len(seq), 4), dtype=np.float32)
    for i, base in enumerate(seq):
        column = lookup.get(base)
        # Compare against None explicitly: column 0 ("A") is falsy.
        if column is not None:
            arr[i, column] = 1.0
    return arr

# Encode every sequence, then stack the per-sequence matrices along a new
# leading axis (np.stack requires all of them to have the same shape).
encoded_sequences = [one_hot_encode(s) for s in hg38_df["sequence"].to_numpy()]
X = np.stack(encoded_sequences)

# Labels as a 64-bit integer vector.
y = hg38_df["label"].to_numpy().astype(np.int64)

print(X.shape)

(19994, 400, 4)


## Train / Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the samples, keeping the label proportions equal in both
# parts (stratify) and fixing the shuffle so the split is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(X_train.shape, X_test.shape)

(15995, 400, 4) (3999, 400, 4)


## Build the CNN Model (Keras + TensorFlow)

In [16]:
# Shell escape: report the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Tue Jan 13 01:43:22 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             55W /  400W |    3009MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [17]:
import torch
# Ask PyTorch once whether CUDA is usable and reuse the answer for both lines.
# NOTE(review): this is PyTorch's view of the GPU; whether the Keras backend
# used for the model sees the same device is not shown here — confirm separately.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)

gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "None"
print("GPU:", gpu_name)

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [19]:
from keras import Sequential, Input
from keras.layers import (
    Conv1D, MaxPooling1D, GlobalMaxPooling1D,
    Dense, Dropout, BatchNormalization
)
from keras.utils import set_random_seed

In [11]:
# Seed Python, NumPy and the Keras backend before any weights are created so
# that initialisation, dropout and batch shuffling are repeatable across runs.
# 42 matches the random_state used for the train/test split.
# NOTE: some GPU kernels are non-deterministic, so small run-to-run
# differences may remain.
set_random_seed(42)

# 1D CNN over one-hot DNA: two convolution stages followed by a small dense head.
model = Sequential([
    # 400 positions x 4 nucleotide channels.
    Input(shape=(400, 4)),

    # Stage 1: 64 filters spanning 15 positions, then 4x downsampling.
    Conv1D(64, kernel_size=15, activation="relu"),
    BatchNormalization(),
    MaxPooling1D(pool_size=4),

    # Stage 2: 128 filters spanning 7 pooled positions; the global max keeps
    # each filter's strongest response regardless of where it occurs.
    Conv1D(128, kernel_size=7, activation="relu"),
    BatchNormalization(),
    GlobalMaxPooling1D(),

    # Classifier head with dropout for regularisation.
    Dense(128, activation="relu"),
    Dropout(0.5),

    # Single sigmoid unit: probability of the positive class (label 1).
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

## Train the CNN

In [18]:
from keras.callbacks import EarlyStopping

In [13]:
# Stop once val_loss has not improved for 3 epochs and roll the model back to
# the weights from its best epoch.
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Keras takes the validation part from the end of the arrays passed to fit,
# so 10% of the training set is used for monitoring only.
fit_options = dict(
    validation_split=0.1,
    epochs=20,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# Per-epoch metrics as a table (one row per completed epoch).
pd.DataFrame(history.history)

Epoch 1/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.6025 - loss: 0.7962 - val_accuracy: 0.5331 - val_loss: 0.6602
Epoch 2/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7448 - loss: 0.4896 - val_accuracy: 0.7619 - val_loss: 0.4867
Epoch 3/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7694 - loss: 0.4631 - val_accuracy: 0.7400 - val_loss: 0.4709
Epoch 4/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7842 - loss: 0.4395 - val_accuracy: 0.7700 - val_loss: 0.4451
Epoch 5/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7941 - loss: 0.4187 - val_accuracy: 0.7294 - val_loss: 0.5017
Epoch 6/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8093 - loss: 0.3926 - val_accuracy: 0.7644 - val_loss: 0.5069
Epoch 7/20
[1m225/225[0m

Unnamed: 0,accuracy,loss,val_accuracy,val_loss
0,0.660854,0.623448,0.533125,0.660183
1,0.741855,0.493096,0.761875,0.486705
2,0.766933,0.46405,0.74,0.470921
3,0.779229,0.446366,0.77,0.445131
4,0.795415,0.422976,0.729375,0.501677
5,0.809378,0.398351,0.764375,0.506928
6,0.824939,0.377155,0.72,0.53619


## Evaluate the CNN

In [14]:
from sklearn.metrics import (
    accuracy_score, f1_score,
    roc_auc_score, matthews_corrcoef
)

In [15]:
# Predicted probabilities for the held-out set, flattened to one value per sample.
y_prob = model.predict(X_test).ravel()
# Hard labels at the 0.5 decision threshold.
y_pred = (y_prob >= 0.5).astype(int)

# AUROC is computed on the probabilities; the other metrics use the hard labels.
scores = {
    "Accuracy :": accuracy_score(y_test, y_pred),
    "F1       :": f1_score(y_test, y_pred),
    "AUROC    :": roc_auc_score(y_test, y_prob),
    "MCC      :": matthews_corrcoef(y_test, y_pred),
}
for label, value in scores.items():
    print(label, value)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Accuracy : 0.7539384846211553
F1       : 0.7508860759493671
AUROC    : 0.8471715857928964
MCC      : 0.5080203901628616
