<a href="https://colab.research.google.com/github/xysu129/deep-learning-ust-2026/blob/main/Assignment_4_SpliceFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4 - SpliceFinder: CPU vs GPU Training


## 1. Setup and Data Preparation

In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import time

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# Download data from Google Drive
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7' -O "label.txt"
!pip install gdown -q
!gdown --id '1Sh2ce0jo5FVGNsSa9fqLjqcAOWQBFhzz' -O "encoded_seq.txt"

--2026-02-10 21:27:36--  https://drive.google.com/uc?export=download&id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7
Resolving drive.google.com (drive.google.com)... 142.251.2.101, 142.251.2.102, 142.251.2.138, ...
Connecting to drive.google.com (drive.google.com)|142.251.2.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7&export=download [following]
--2026-02-10 21:27:36--  https://drive.usercontent.google.com/download?id=1QbOSExVJEbPMhjzaua5n2eIXeF3qELQ7&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.2.132, 2607:f8b0:4023:c0d::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60000 (59K) [application/octet-stream]
Saving to: ‘label.txt’


2026-02-10 21:27:37 (2.21 MB/s) - ‘label.txt’ saved [60000/60000]

Downloadin

In [3]:
Length = 400

def load_data():

    labels = np.loadtxt('label.txt')
    encoded_seq = np.loadtxt('encoded_seq.txt')
    encoded_seq_choose = encoded_seq[:, ((400-Length)*2):(1600-(400-Length)*2)]

    print(encoded_seq_choose.shape)
    x_train,x_test,y_train,y_test = train_test_split(encoded_seq_choose,labels,test_size=0.2)

    return np.array(x_train),np.array(y_train),np.array(x_test),np.array(y_test)


x_train,y_train,x_test,y_test = load_data()

# Reshape flat arrays into (samples, 400, 4) for Conv1D
x_train = x_train.reshape(-1, Length, 4)
x_test = x_test.reshape(-1, Length, 4)
print(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

(30000, 1600)
x_train: (24000, 400, 4), x_test: (6000, 400, 4)


## 2. Model Definition

In [5]:
def build_splice_finder():
    """Build and compile the SpliceFinder CNN model."""
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(Length, 4)),
        tf.keras.layers.Conv1D(50, kernel_size=8, activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dense(3, activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Preview model structure
build_splice_finder().summary()

## 3. Training Function.

In [8]:
def train_on_device(device_name):
    """Train SpliceFinder on specified device and return timing + history."""
    print(f"\n{'='*50}")
    print(f"Training on {device_name}")
    print(f"{'='*50}")

    es = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3, restore_best_weights=True
    )

    with tf.device(device_name):
        model = build_splice_finder()
        t0 = time.time()
        history = model.fit(
            x_train, y_train,
            validation_data=(x_test, y_test),
            epochs=50,
            batch_size=64,
            callbacks=[es],
            verbose=1
        )
        elapsed = time.time() - t0

        loss, acc = model.evaluate(x_test, y_test, verbose=0)

    print(f"\n{device_name} - Time: {elapsed:.2f}s | "
          f"Epochs: {len(history.history['loss'])} | "
          f"Test Acc: {acc:.4f}")

    return elapsed, history, acc

## 4. Run on CPU

In [9]:
cpu_time, cpu_hist, cpu_acc = train_on_device('/CPU:0')


Training on /CPU:0
Epoch 1/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 46ms/step - accuracy: 0.7344 - loss: 0.7256 - val_accuracy: 0.9680 - val_loss: 0.1220
Epoch 2/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 48ms/step - accuracy: 0.9678 - loss: 0.1078 - val_accuracy: 0.9677 - val_loss: 0.1147
Epoch 3/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 44ms/step - accuracy: 0.9787 - loss: 0.0730 - val_accuracy: 0.9692 - val_loss: 0.1047
Epoch 4/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 42ms/step - accuracy: 0.9870 - loss: 0.0501 - val_accuracy: 0.9655 - val_loss: 0.1165
Epoch 5/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 44ms/step - accuracy: 0.9916 - loss: 0.0327 - val_accuracy: 0.9685 - val_loss: 0.1101
Epoch 6/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 42ms/step - accuracy: 0.9927 - loss: 0.0276 - val_accuracy: 0.9700 - val_loss: 0.1

## 5. Run on GPU

In [10]:
if tf.config.list_physical_devices('GPU'):
    gpu_time, gpu_hist, gpu_acc = train_on_device('/GPU:0')
else:
    print("No GPU available. Please set Runtime > Change runtime type > GPU.")
    gpu_time, gpu_acc = None, None


Training on /GPU:0
Epoch 1/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7695 - loss: 0.6083 - val_accuracy: 0.9657 - val_loss: 0.1133
Epoch 2/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9707 - loss: 0.1003 - val_accuracy: 0.9683 - val_loss: 0.1095
Epoch 3/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9801 - loss: 0.0680 - val_accuracy: 0.9643 - val_loss: 0.1186
Epoch 4/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9870 - loss: 0.0466 - val_accuracy: 0.9627 - val_loss: 0.1211
Epoch 5/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9889 - loss: 0.0392 - val_accuracy: 0.9613 - val_loss: 0.1337

/GPU:0 - Time: 12.00s | Epochs: 5 | Test Acc: 0.9683


## 6. Results Comparison

In [11]:
print("\n" + "=" * 40)
print("  CPU vs GPU Training Comparison")
print("=" * 40)
print(f"  CPU time:     {cpu_time:.2f}s  |  Accuracy: {cpu_acc:.4f}")
if gpu_time:
    print(f"  GPU time:     {gpu_time:.2f}s  |  Accuracy: {gpu_acc:.4f}")
    print(f"  Speedup:      {cpu_time / gpu_time:.2f}x")
else:
    print("  GPU:          N/A")
print("=" * 40)


  CPU vs GPU Training Comparison
  CPU time:     114.47s  |  Accuracy: 0.9692
  GPU time:     12.00s  |  Accuracy: 0.9683
  Speedup:      9.54x
