<a href="https://colab.research.google.com/github/zk2275/11-785-idl-project-group-38/blob/main/colab_baseline_models_(4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# === 1. Install and Import Libraries ===

# Install xlrd for reading .xls files.
!pip install xlrd

import os
import zipfile
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
import shutil

# Scipy for signal processing (peak finding)
from scipy.signal import find_peaks

# Sklearn for metrics
from sklearn.metrics import mean_absolute_error

# TensorFlow/Keras for Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.layers import Add, Embedding, Flatten
from tensorflow.keras.callbacks import EarlyStopping

# Suppress warnings
warnings.filterwarnings('ignore')



In [6]:
# === 2. Mount Drive & Define Data Functions ===

from google.colab import drive
drive.mount('/content/drive')

def unzip_data(zip_path, extract_folder):
    """Unzips a file and returns a list of all .csv files inside."""
    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} not found. Check your Google Drive path.")
        return []
    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
    csv_files = glob.glob(os.path.join(extract_folder, '**/*.csv'), recursive=True)
    print(f"Extracted {len(csv_files)} files from {zip_path}")
    return csv_files

def create_sequences(df, seq_length=250, step=125):
    """Creates overlapping sequences from a single recording."""
    ecg = df['ECG'].values
    ppg = df['PPG'].values
    abp = df['ABP'].values

    # Normalize signals
    ecg = (ecg - np.mean(ecg)) / (np.std(ecg) + 1e-6)
    ppg = (ppg - np.mean(ppg)) / (np.std(ppg) + 1e-6)

    X_seq = []
    y_seq = []

    for i in range(0, len(df) - seq_length, step):
        end_idx = i + seq_length
        X_window = np.stack([ecg[i:end_idx], ppg[i:end_idx]], axis=-1)
        abp_window = abp[i:end_idx]
        abp_peaks, _ = find_peaks(abp_window, distance=int(0.5*125))
        abp_troughs, _ = find_peaks(-abp_window, distance=int(0.5*125))

        if len(abp_peaks) > 0 and len(abp_troughs) > 0:
            mean_sbp = np.mean(abp_window[abp_peaks])
            mean_dbp = np.mean(abp_window[abp_troughs])

            if 10 < mean_sbp < 300 and 10 < mean_dbp < 200:
                X_seq.append(X_window)
                y_seq.append([mean_sbp, mean_dbp])

    return np.array(X_seq), np.array(y_seq)

def load_and_process(zip_path, extract_folder, seq_length=250, debug_limit=None):
    """Main function to load zips and process all files for sequence models."""
    file_list = unzip_data(zip_path, extract_folder)
    if debug_limit is not None:
        file_list = file_list[:debug_limit]
        print(f"--- DEBUG MODE: Processing only {len(file_list)} files. ---")

    if not file_list: return np.array([]), np.array([])
    all_X, all_y = [], []

    for f in tqdm(file_list, desc=f"Processing {zip_path}"):
        try:
            df = pd.read_csv(f)
        except Exception as e:
            print(f"Could not read {f}: {e}")
            continue
        if not all(col in df.columns for col in ['t_sec', 'ECG', 'PPG', 'ABP']):
            print(f"Skipping {f}: missing required columns.")
            continue

        X, y = create_sequences(df, seq_length=seq_length)
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)

    if not all_X:
        print(f"No valid data found in {zip_path} for sequence mode.")
        return np.array([]), np.array([])

    all_X = np.concatenate(all_X, axis=0)
    all_y = np.concatenate(all_y, axis=0)
    print(f"Finished processing {zip_path}. Found {all_X.shape[0]} samples.")
    return all_X, all_y

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# === 3. Transformer Model Definition ===

def transformer_encoder_block(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Creates a single Transformer encoder block."""
    # Attention and Normalization
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x)
    x = Dropout(dropout)(x)
    res = Add()([x, inputs]) # Residual connection

    # Feed-Forward Network and Normalization
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    return Add()([x, res]) # Second residual connection

def build_transformer_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    """Builds a Transformer-based model for sequence regression."""
    inputs = Input(shape=input_shape)
    x = inputs

    # --- Create an "Embedding" for the time-series data ---
    # We project the 2 features (ECG, PPG) into a higher-dimensional space (e.g., 64)
    # This is analogous to token embedding in NLP.
    embed_dim = 64
    x = Dense(embed_dim)(x)

    # --- Positional Encoding ---
    # We add a simple learned positional embedding.
    positions = tf.range(start=0, limit=input_shape[0], delta=1)
    position_embedding = Embedding(input_dim=input_shape[0], output_dim=embed_dim)(positions)
    x = x + position_embedding

    # --- Create Transformer Blocks ---
    for _ in range(num_transformer_blocks):
        x = transformer_encoder_block(x, head_size, num_heads, ff_dim, dropout)

    # --- Final Head for Regression ---
    # GlobalAveragePooling1D averages the output of all time steps.
    x = GlobalAveragePooling1D(data_format="channels_last")(x)

    for dim in mlp_units:
        x = Dense(dim, activation="relu")(x)
        x = Dropout(mlp_dropout)(x)

    outputs = Dense(2, activation="linear")(x) # 2 outputs: SBP, DBP
    return Model(inputs, outputs)

In [8]:
# === 4. Transformer Model Training and Evaluation ===

print("\n--- Starting Transformer Model ---")

# 1. Define Model Parameters
SEQ_LENGTH = 250  # 2 seconds of data at 125 Hz
STEP = 125        # 1 second step (50% overlap)
NUM_FEATURES = 2  # ECG, PPG
NUM_OUTPUTS = 2   # SBP, DBP
BATCH_SIZE = 64
EPOCHS = 20

# --- Transformer-specific Hyperparameters ---
HEAD_SIZE = 256
NUM_HEADS = 4
FF_DIM = 256      # Hidden layer size in Feed-Forward network
NUM_BLOCKS = 3    # Number of Transformer blocks
MLP_UNITS = [128] # Dense units for final regressor head
DROPOUT = 0.1
MLP_DROPOUT = 0.2
# ---------------------------------------------

# --- Define Paths ---
# !!! EDIT THESE PATHS !!!
train_zip_path = '/content/drive/MyDrive/11785FinalData/train.zip'
val_zip_path = '/content/drive/MyDrive/11785FinalData/val.zip'
test_zip_path = '/content/drive/MyDrive/11785FinalData/test.zip'

# 2. Load and process data
X_train_seq, y_train_seq = load_and_process(train_zip_path, 'data/train', seq_length=SEQ_LENGTH)
X_val_seq, y_val_seq = load_and_process(val_zip_path, 'data/val', seq_length=SEQ_LENGTH)
X_test_seq, y_test_seq = load_and_process(test_zip_path, 'data/test', seq_length=SEQ_LENGTH)

if X_train_seq.shape[0] == 0:
    print("No training data found for sequence-based model. Aborting.")
else:
    print(f"Training data shape: {X_train_seq.shape}")
    print(f"Training labels shape: {y_train_seq.shape}")

    # 3. Build and compile the Transformer model
    input_shape = (SEQ_LENGTH, NUM_FEATURES)

    model = build_transformer_model(
        input_shape,
        head_size=HEAD_SIZE,
        num_heads=NUM_HEADS,
        ff_dim=FF_DIM,
        num_transformer_blocks=NUM_BLOCKS,
        mlp_units=MLP_UNITS,
        dropout=DROPOUT,
        mlp_dropout=MLP_DROPOUT,
    )

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    # 4. Train Model
    print("\nTraining Transformer model...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train_seq, y_train_seq,
        validation_data=(X_val_seq, y_val_seq),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping],
        verbose=1
    )

    # 5. Evaluate on Test Set
    print("\nEvaluating Transformer on test set...")
    y_pred_seq = model.predict(X_test_seq)

    sbp_true_seq = y_test_seq[:, 0]
    dbp_true_seq = y_test_seq[:, 1]
    sbp_pred_seq = y_pred_seq[:, 0]
    dbp_pred_seq = y_pred_seq[:, 1]

    sbp_errors_seq = sbp_true_seq - sbp_pred_seq
    dbp_errors_seq = dbp_true_seq - dbp_pred_seq

    sbp_mae_seq = mean_absolute_error(sbp_true_seq, sbp_pred_seq)
    sbp_sd_seq = np.std(sbp_errors_seq)
    dbp_mae_seq = mean_absolute_error(dbp_true_seq, dbp_pred_seq)
    dbp_sd_seq = np.std(dbp_errors_seq)

    # 6. Report Results
    print("\n--- Transformer Model Test Results ---")
    print(f"SBP MAE: {sbp_mae_seq:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"SBP SD:  {sbp_sd_seq:.2f} mmHg (BHS Grade A: <= 8)")
    print(f"DBP MAE: {dbp_mae_seq:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"DBP SD:  {dbp_sd_seq:.2f} mmHg (BHS Grade A: <= 8)")
    print("--------------------------------------")


--- Starting Transformer Model ---
Extracted 441206 files from /content/drive/MyDrive/11785FinalData/train.zip


Processing /content/drive/MyDrive/11785FinalData/train.zip:   0%|          | 0/441206 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/train.zip. Found 1710341 samples.
Extracted 441185 files from /content/drive/MyDrive/11785FinalData/val.zip


Processing /content/drive/MyDrive/11785FinalData/val.zip:   0%|          | 0/441185 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/val.zip. Found 1710083 samples.
Extracted 441197 files from /content/drive/MyDrive/11785FinalData/test.zip


Processing /content/drive/MyDrive/11785FinalData/test.zip:   0%|          | 0/441197 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/test.zip. Found 1709981 samples.
Training data shape: (1710341, 250, 2)
Training labels shape: (1710341, 2)



Training Transformer model...
Epoch 1/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3811s[0m 142ms/step - loss: 468.9128 - val_loss: 297.2719
Epoch 2/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3771s[0m 141ms/step - loss: 329.7300 - val_loss: 268.6257
Epoch 3/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3767s[0m 141ms/step - loss: 293.7611 - val_loss: 255.0384
Epoch 4/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3774s[0m 141ms/step - loss: 275.8367 - val_loss: 241.3896
Epoch 5/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3769s[0m 141ms/step - loss: 264.9963 - val_loss: 232.2685
Epoch 6/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3769s[0m 141ms/step - loss: 255.0984 - val_loss: 229.9326
Epoch 7/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3772s[0m 141ms/step - loss: 246.7150 - val_loss: 226.9917
Epoch 8/20
[1m26725/26725[0m [32m━━

In [9]:
# === 5. Save a Trained Model to Your Drive ===

# First, create a path to a folder in your Google Drive
save_folder = '/content/drive/My Drive/MyProject'
os.makedirs(save_folder, exist_ok=True)

# Define the full path to save your model file
model_save_path = os.path.join(save_folder, 'transformer_model.keras')

# Save the model
try:
    model.save(model_save_path)
    print(f"Model successfully saved to: {model_save_path}")
except NameError:
    print("Could not save model. Make sure you have trained the model and it is in a variable named 'model'.")
except Exception as e:
    print(f"An error occurred while saving: {e}")

Model successfully saved to: /content/drive/My Drive/MyProject/transformer_model.keras
