<a href="https://colab.research.google.com/github/zk2275/11-785-idl-project-group-38/blob/main/colab_baseline_models_CNN%2BGRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# === 1. Install and Import Libraries ===

# Install xlrd for reading .xls files. Remove if your files are .xlsx
!pip install xlrd

import os
import zipfile
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
import shutil

# Scipy for signal processing (peak finding)
from scipy.signal import find_peaks

# Sklearn for metrics
from sklearn.metrics import mean_absolute_error

# TensorFlow/Keras for Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Bidirectional, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Suppress warnings
warnings.filterwarnings('ignore')



In [2]:
# === 2. Mount Google Drive ===
# This connects Colab to your Google Drive folder

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# === 3. Define Data Loading and Preprocessing Functions ===

def unzip_data(zip_path, extract_folder):
    """Unzips a file and returns a list of all .csv files inside."""
    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} not found. Check your Google Drive path.")
        return []

    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    # Search recursively (**) for any .csv files in all subfolders.
    csv_files = glob.glob(os.path.join(extract_folder, '**/*.csv'), recursive=True)

    print(f"Extracted {len(csv_files)} files from {zip_path}")
    return csv_files

# --- Sequence Preparation Functions (for CNN+GRU) ---
def create_sequences(df, seq_length=250, step=125):
    """Creates overlapping sequences from a single recording."""
    ecg = df['ECG'].values
    ppg = df['PPG'].values
    abp = df['ABP'].values

    # Normalize signals
    ecg = (ecg - np.mean(ecg)) / (np.std(ecg) + 1e-6)
    ppg = (ppg - np.mean(ppg)) / (np.std(ppg) + 1e-6)

    X_seq = []
    y_seq = []

    for i in range(0, len(df) - seq_length, step):
        end_idx = i + seq_length

        X_window = np.stack([ecg[i:end_idx], ppg[i:end_idx]], axis=-1)

        abp_window = abp[i:end_idx]
        abp_peaks, _ = find_peaks(abp_window, distance=int(0.5*125))
        abp_troughs, _ = find_peaks(-abp_window, distance=int(0.5*125))

        if len(abp_peaks) > 0 and len(abp_troughs) > 0:
            mean_sbp = np.mean(abp_window[abp_peaks])
            mean_dbp = np.mean(abp_window[abp_troughs])

            if 10 < mean_sbp < 300 and 10 < mean_dbp < 200:
                X_seq.append(X_window)
                y_seq.append([mean_sbp, mean_dbp])

    return np.array(X_seq), np.array(y_seq)

# --- Main Data Processing Loop ---
def load_and_process(zip_path, extract_folder, seq_length=250, debug_limit=None):
    """Main function to load zips and process all files for sequence models."""

    file_list = unzip_data(zip_path, extract_folder)

    # If a debug_limit is set, only use a small slice of the file list
    if debug_limit is not None:
        file_list = file_list[:debug_limit]
        print(f"--- DEBUG MODE: Processing only {len(file_list)} files. ---")

    if not file_list:
        return np.array([]), np.array([])

    all_X = []
    all_y = []

    for f in tqdm(file_list, desc=f"Processing {zip_path}"):
        try:
            df = pd.read_csv(f)
        except Exception as e:
            print(f"Could not read {f}: {e}")
            continue

        if not all(col in df.columns for col in ['t_sec', 'ECG', 'PPG', 'ABP']):
            print(f"Skipping {f}: missing required columns.")
            continue

        # This function now ONLY creates sequences
        X, y = create_sequences(df, seq_length=seq_length)
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)

    if not all_X:
        print(f"No valid data found in {zip_path} for sequence mode.")
        return np.array([]), np.array([])

    all_X = np.concatenate(all_X, axis=0)
    all_y = np.concatenate(all_y, axis=0)

    print(f"Finished processing {zip_path}. Found {all_X.shape[0]} samples.")
    return all_X, all_y

In [8]:
#  !!! EDIT THESE PATHS !!!
#    Define the paths to those files in your Google Drive
train_zip_path = '/content/drive/MyDrive/11785FinalData/train.zip'
val_zip_path = '/content/drive/MyDrive/11785FinalData/val.zip'
test_zip_path = '/content/drive/MyDrive/11785FinalData/test.zip'

In [9]:
# === 4. CNN + GRU Model (from Drive) ===

print("\n--- Starting CNN + GRU Model ---")

# 1. Define Model Parameters
SEQ_LENGTH = 250  # 2 seconds of data at 125 Hz
STEP = 125        # 1 second step (50% overlap)
NUM_FEATURES = 2  # ECG, PPG
NUM_OUTPUTS = 2   # SBP, DBP
BATCH_SIZE = 64   # You can tune this
EPOCHS = 20       # Keep low for a baseline test. Increase for real run.

# 2. Load and process data
# (We run load_and_process in 'sequence' mode)
X_train_seq, y_train_seq = load_and_process(train_zip_path, 'data/train', seq_length=SEQ_LENGTH)
X_val_seq, y_val_seq = load_and_process(val_zip_path, 'data/val', seq_length=SEQ_LENGTH)
X_test_seq, y_test_seq = load_and_process(test_zip_path, 'data/test', seq_length=SEQ_LENGTH)

if X_train_seq.shape[0] == 0:
    print("No training data found for sequence-based model. Aborting.")
else:
    print(f"Training data shape: {X_train_seq.shape}")
    print(f"Training labels shape: {y_train_seq.shape}")

    # 3. Define CNN + GRU Model Architecture

    def build_cnn_gru_model(input_shape):
        """
        Builds a CNN + GRU hybrid model.
        """
        inputs = Input(shape=input_shape)

        # CNN part for feature extraction
        x = Conv1D(filters=32, kernel_size=5, activation='relu', padding='same')(inputs)
        x = MaxPooling1D(pool_size=2, padding='same')(x)
        x = Dropout(0.2)(x)

        x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(x)
        x = MaxPooling1D(pool_size=2, padding='same')(x)
        x = Dropout(0.2)(x)

        # GRU part for sequence modeling
        x = Bidirectional(GRU(64, return_sequences=False))(x)
        x = Dropout(0.3)(x)

        # Dense layers for regression
        x = Dense(32, activation='relu')(x)

        # Final output layer for SBP and DBP
        outputs = Dense(NUM_OUTPUTS, activation='linear')(x) # 2 outputs: SBP and DBP

        model = Model(inputs, outputs)
        return model

    # Build and compile the model
    input_shape = (SEQ_LENGTH, NUM_FEATURES)
    model = build_cnn_gru_model(input_shape)

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    # 4. Train Model
    print("\nTraining CNN + GRU model...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train_seq, y_train_seq,
        validation_data=(X_val_seq, y_val_seq),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping],
        verbose=1
    )

    # 5. Evaluate on Test Set
    print("\nEvaluating CNN + GRU on test set...")
    y_pred_seq = model.predict(X_test_seq)

    sbp_true_seq = y_test_seq[:, 0]
    dbp_true_seq = y_test_seq[:, 1]
    sbp_pred_seq = y_pred_seq[:, 0]
    dbp_pred_seq = y_pred_seq[:, 1]

    sbp_errors_seq = sbp_true_seq - sbp_pred_seq
    dbp_errors_seq = dbp_true_seq - dbp_pred_seq

    sbp_mae_seq = mean_absolute_error(sbp_true_seq, sbp_pred_seq)
    sbp_sd_seq = np.std(sbp_errors_seq)
    dbp_mae_seq = mean_absolute_error(dbp_true_seq, dbp_pred_seq)
    dbp_sd_seq = np.std(dbp_errors_seq)

    # 6. Report Results
    print("\n--- CNN + GRU Model Test Results ---")
    print(f"SBP MAE: {sbp_mae_seq:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"SBP SD:  {sbp_sd_seq:.2f} mmHg (BHS Grade A: <= 8)")
    print(f"DBP MAE: {dbp_mae_seq:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"DBP SD:  {dbp_sd_seq:.2f} mmHg (BHS Grade A: <= 8)")
    print("----------------------------------")


--- Starting CNN + GRU Model ---
Extracted 441206 files from /content/drive/MyDrive/11785FinalData/train.zip


Processing /content/drive/MyDrive/11785FinalData/train.zip:   0%|          | 0/441206 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/train.zip. Found 1710341 samples.
Extracted 441185 files from /content/drive/MyDrive/11785FinalData/val.zip


Processing /content/drive/MyDrive/11785FinalData/val.zip:   0%|          | 0/441185 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/val.zip. Found 1710083 samples.
Extracted 441197 files from /content/drive/MyDrive/11785FinalData/test.zip


Processing /content/drive/MyDrive/11785FinalData/test.zip:   0%|          | 0/441197 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/test.zip. Found 1709981 samples.
Training data shape: (1710341, 250, 2)
Training labels shape: (1710341, 2)



Training CNN + GRU model...
Epoch 1/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 13ms/step - loss: 629.6055 - val_loss: 207.6379
Epoch 2/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 13ms/step - loss: 232.4801 - val_loss: 176.1951
Epoch 3/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 12ms/step - loss: 187.0794 - val_loss: 166.6837
Epoch 4/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 13ms/step - loss: 172.2007 - val_loss: 156.9126
Epoch 5/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 13ms/step - loss: 164.4725 - val_loss: 147.1877
Epoch 6/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 13ms/step - loss: 158.9685 - val_loss: 144.2929
Epoch 7/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 13ms/step - loss: 156.4079 - val_loss: 143.9326
Epoch 8/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━

In [10]:
# === 5. Save a Trained Model to Your Drive ===

# First, create a path to a folder in your Google Drive
save_folder = '/content/drive/My Drive/MyProject'
os.makedirs(save_folder, exist_ok=True)

# Define the full path to save your model file
model_save_path = os.path.join(save_folder, 'cnn_gru_model.keras')

# Save the model
try:
    model.save(model_save_path)
    print(f"Model successfully saved to: {model_save_path}")
except NameError:
    print("Could not save model. Make sure you have trained the model and it is in a variable named 'model'.")
except Exception as e:
    print(f"An error occurred while saving: {e}")

Model successfully saved to: /content/drive/My Drive/MyProject/cnn_gru_model.keras
