# Deep Learning Baseline: 1D CNN

## Load HG38 Dataset

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/bioproj01"
DATA_DIR = os.path.join(PROJECT_DIR, "data")
print(f"Data directory found: {PROJECT_DIR}\nContents: {os.listdir(PROJECT_DIR)}")

Mounted at /content/drive
Data directory found: /content/drive/MyDrive/bioproj01
Contents: ['data', 'results']


In [2]:
import pandas as pd

In [3]:
hg38_df = pd.read_csv(
    f"{DATA_DIR}/hg38/human_promoter_vs_nonpromoter_10k_400bp.csv"
)

hg38_df = hg38_df[["sequence", "label"]]
hg38_df.head()

Unnamed: 0,sequence,label
0,TGAACCCCGGGAGGCAAGGGCTGCCATGGCAGGGGTGGGGTTTCAT...,0
1,GGCCCAGCTCTGACGCCAGGCTGTCTTGCCTCTGCTCACCTGCAGC...,1
2,TCATGCCTGGCCAGCAAAATTGTTTTTTAAAAGTTTATGCTACTAA...,1
3,AAGTTAAATAAATCAGGGTTTTCACCTGGTTCTTTAAGATCTGTTG...,0
4,AATGGAAGAAGCCAAAATTTTGCAGAACAAGAGAATATGCAAGAGA...,0


## One-Hot Encode DNA Sequences

In [4]:
import numpy as np

In [5]:
BASE2IDX = {
    "A": 0,
    "C": 1,
    "G": 2,
    "T": 3
}

def one_hot_encode(seq):
    arr = np.zeros((len(seq), 4), dtype=np.float32)
    for i, base in enumerate(seq):
        if base in BASE2IDX:
            arr[i, BASE2IDX[base]] = 1.0
    return arr

X = np.stack([one_hot_encode(seq) for seq in hg38_df["sequence"].values])
y = hg38_df["label"].values.astype(np.int64)

print(X.shape)

(19994, 400, 4)


## Train / Validation Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(X_train.shape, X_test.shape)

(15995, 400, 4) (3999, 400, 4)


## Build the CNN Model (Keras + TensorFlow)

In [8]:
!nvidia-smi

Tue Jan 13 13:26:04 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             55W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [9]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [10]:
from keras import Sequential, Input
from keras.layers import (
    Conv1D, MaxPooling1D, GlobalMaxPooling1D,
    Dense, Dropout, BatchNormalization
)

In [11]:
model = Sequential([
    Input(shape=(400, 4)),

    Conv1D(64, kernel_size=15, activation="relu"),
    BatchNormalization(),
    MaxPooling1D(pool_size=4),

    Conv1D(128, kernel_size=7, activation="relu"),
    BatchNormalization(),
    GlobalMaxPooling1D(),

    Dense(128, activation="relu"),
    Dropout(0.5),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

## Train the CNN

In [12]:
from keras.callbacks import EarlyStopping

In [13]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

pd.DataFrame(history.history)

Epoch 1/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.6187 - loss: 0.8144 - val_accuracy: 0.6550 - val_loss: 0.6340
Epoch 2/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7303 - loss: 0.5050 - val_accuracy: 0.7487 - val_loss: 0.5030
Epoch 3/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7693 - loss: 0.4704 - val_accuracy: 0.7262 - val_loss: 0.4922
Epoch 4/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7861 - loss: 0.4419 - val_accuracy: 0.7494 - val_loss: 0.4827
Epoch 5/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7978 - loss: 0.4229 - val_accuracy: 0.7538 - val_loss: 0.4580
Epoch 6/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8086 - loss: 0.4017 - val_accuracy: 0.7544 - val_loss: 0.4642
Epoch 7/20
[1m225/225[0m

Unnamed: 0,accuracy,loss,val_accuracy,val_loss
0,0.671761,0.628985,0.655,0.634005
1,0.733519,0.505967,0.74875,0.502952
2,0.762834,0.475298,0.72625,0.492238
3,0.778117,0.453768,0.749375,0.482654
4,0.791594,0.430024,0.75375,0.457989
5,0.802918,0.408058,0.754375,0.464235
6,0.819937,0.388077,0.740625,0.52828
7,0.834317,0.359563,0.6875,0.604178


## Evaluate the CNN

In [14]:
from sklearn.metrics import (
    accuracy_score, f1_score,
    roc_auc_score, matthews_corrcoef
)

In [20]:
y_prob = model.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)

accuracy_sc = accuracy_score(y_test, y_pred)
f1_sc = f1_score(y_test, y_pred)
roc_auc_sc = roc_auc_score(y_test, y_prob)
mcc_sc = matthews_corrcoef(y_test, y_pred)
print("Accuracy :", accuracy_sc)
print("F1       :", f1_sc)
print("AUROC    :", roc_auc_sc)
print("MCC      :", mcc_sc)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy : 0.7579394848712178
F1       : 0.7334801762114538
AUROC    : 0.8456988494247124
MCC      : 0.5247231136497288


## Save Reports to Drive

### Create Output Directory

In [16]:
OUT_DIR = os.path.join(PROJECT_DIR, "results/cnn")
os.makedirs(OUT_DIR, exist_ok=True)

### Save Performance Metrics

In [29]:
conv_layers = sum(
    isinstance(layer, Conv1D) for layer in model.layers
)

cnn_results = pd.DataFrame([{
    "model": "CNN",
    "accuracy": accuracy_sc,
    "f1": f1_sc,
    "auroc": roc_auc_sc,
    "mcc": mcc_sc,
    "input_length_bp": X_train.shape[1],
    "encoding": "one-hot",
    "conv_layers": conv_layers,
    "evaluation": "80/20 holdout"
}])

cnn_results.to_csv(
    os.path.join(OUT_DIR, "cnn_performance.csv"),
    index=False
)

cnn_results

Unnamed: 0,model,accuracy,f1,auroc,mcc,input_length_bp,encoding,conv_layers,evaluation
0,CNN,0.757939,0.73348,0.845699,0.524723,400,one-hot,2,80/20 holdout


### Save Training History

In [27]:
history_df = pd.DataFrame(history.history)
history_df.to_csv(
    os.path.join(OUT_DIR, "cnn_training_history.csv"),
    index=False
)

## Re-generate ROC Probabilities

In [28]:
y_score = model.predict(X_test).ravel()

roc_df = pd.DataFrame({
    "y_true": y_test,
    "y_score": y_score
})

roc_df.to_csv(
    os.path.join(OUT_DIR, "cnn_roc_data.csv"),
    index=False
)

roc_df.head()

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


Unnamed: 0,y_true,y_score
0,1,0.951395
1,1,0.259997
2,1,0.275677
3,0,0.208968
4,0,0.77353
