In [None]:
# === 1. Install and Import Libraries ===

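# Install xlrd, which pandas needs in order to read legacy .xls files.
# The loaders defined in section 3 only call pd.read_csv, so this step can be
# removed if your data is .csv (or .xlsx).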
!pip install xlrd

import os
import zipfile
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
import shutil

# Scipy for signal processing (peak finding)
from scipy.signal import find_peaks

# Sklearn for Linear Regression Baseline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# TensorFlow/Keras for LSTM Baseline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Suppress warnings
warnings.filterwarnings('ignore')



In [None]:
# === 2. Mount Google Drive ===
# Connects this Colab runtime to your Google Drive. The data archives read
# later in this notebook, and the model saved at the end, all live under the
# /content/drive mount point created here.

from google.colab import drive
drive.mount('/content/drive')

# After running this, look for a folder named 'drive' in the file panel on the left.
# Inside 'drive/My Drive/' is your entire Google Drive.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# === 3. Define Data Loading and Preprocessing Functions ===
# (This cell must be run to define the 'load_and_process' function)

def unzip_data(zip_path, extract_folder):
    """Extract a zip archive and list every .csv file found beneath it.

    Args:
        zip_path: Path to the .zip archive to extract.
        extract_folder: Directory to extract into; created if it is missing.

    Returns:
        A sorted list of paths to all .csv files found anywhere under
        ``extract_folder`` (this includes .csv files already present there
        from an earlier extraction). If ``zip_path`` does not exist, an error
        is printed and an empty list is returned.
    """
    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} not found. Check your Google Drive path.")
        return []

    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    # Search recursively (**) so CSVs nested in sub-folders are found too.
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes any slice of this list (e.g. a debug subset of the first N files)
    # reproducible from run to run.
    csv_files = sorted(
        glob.glob(os.path.join(extract_folder, '**/*.csv'), recursive=True)
    )

    print(f"Extracted {len(csv_files)} files from {zip_path}")
    return csv_files

# --- Feature Engineering Functions (for Linear Regression) ---
def get_features_and_labels(df, fs=125):
    """Extract per-beat ECG-only features and blood-pressure labels.

    The recording is split into cardiac cycles delimited by consecutive ECG
    R-peaks. A cycle is kept only if at least one ABP peak and one ABP trough
    were detected strictly inside it and the resulting values pass a
    plausibility check.

    Args:
        df: DataFrame with 'ECG' and 'ABP' columns sampled at ``fs``.
        fs: Sampling frequency in Hz.

    Returns:
        A tuple ``(features, labels)`` of 2-D arrays:
        ``features`` has shape (n_cycles, 3) with columns
        [heart rate in bpm, R-peak amplitude, std of ECG within the cycle];
        ``labels`` has shape (n_cycles, 2) with columns [SBP, DBP], taken as
        the max and min of ABP within the cycle.
        When no cycle qualifies the arrays are empty with shapes (0, 3) and
        (0, 2), so they can always be stacked or scaled as 2-D data.
    """
    ecg = df['ECG'].values
    abp = df['ABP'].values

    # Minimum spacing between detected peaks: half a second's worth of
    # samples. This caps the detectable rate at roughly 120 beats per minute.
    # These thresholds will likely need tuning for real data.
    min_peak_distance = fs * 0.5
    ecg_peaks, _ = find_peaks(
        ecg,
        height=np.mean(ecg) + 1.0 * np.std(ecg),
        distance=min_peak_distance,
    )
    abp_peaks, _ = find_peaks(
        abp, height=np.mean(abp), distance=min_peak_distance
    )
    # Troughs are peaks of the negated signal that lie at or below the mean.
    abp_troughs, _ = find_peaks(
        -abp, height=-np.mean(abp), distance=min_peak_distance
    )

    features = []
    labels = []

    # Each pair of consecutive R-peaks delimits one cardiac cycle.
    for start_idx, end_idx in zip(ecg_peaks[:-1], ecg_peaks[1:]):
        has_abp_peak = np.any((abp_peaks > start_idx) & (abp_peaks < end_idx))
        has_abp_trough = np.any(
            (abp_troughs > start_idx) & (abp_troughs < end_idx)
        )
        if not (has_abp_peak and has_abp_trough):
            continue

        # --- Features (ECG only) ---
        rr_interval_sec = (end_idx - start_idx) / fs
        hr = 60.0 / rr_interval_sec
        r_amplitude = ecg[start_idx]                     # R-peak amplitude
        ecg_cycle_std = np.std(ecg[start_idx:end_idx])   # variability in cycle

        # --- Labels ---
        sbp = np.max(abp[start_idx:end_idx])
        dbp = np.min(abp[start_idx:end_idx])

        # Plausibility filter. Because of the 0.5 s minimum peak spacing the
        # heart rate cannot actually reach the 200 bpm upper bound.
        if 10 < sbp < 300 and 10 < dbp < 200 and 30 < hr < 200:
            features.append([hr, r_amplitude, ecg_cycle_std])
            labels.append([sbp, dbp])

    if not features:
        # Keep the column layout even when nothing qualified.
        return np.empty((0, 3)), np.empty((0, 2))

    return np.array(features), np.array(labels)

# --- Sequence Preparation Functions (for LSTM) ---
def create_sequences(df, seq_length=250, step=125, fs=125):
    """Cut one recording into overlapping ECG windows with BP targets.

    The ECG is z-score normalised over the whole recording, then sliced into
    windows of ``seq_length`` samples starting every ``step`` samples. For
    each window the target is the mean of the ABP peaks (SBP) and the mean of
    the ABP troughs (DBP) detected inside that window. Windows with no
    detected peak or trough, or with implausible targets, are dropped.

    Args:
        df: DataFrame with 'ECG' and 'ABP' columns.
        seq_length: Window length in samples.
        step: Offset in samples between the starts of consecutive windows.
        fs: Sampling frequency in Hz; sets the minimum spacing (0.5 s)
            between detected ABP peaks/troughs.

    Returns:
        A tuple ``(X, y)``. ``X`` has shape (n_windows, seq_length, 1) and
        ``y`` has shape (n_windows, 2) with columns [mean SBP, mean DBP].
        Both are empty 1-D arrays when no window qualifies.
    """
    ecg = df['ECG'].values
    abp = df['ABP'].values

    # Normalise ECG; the small epsilon guards against a zero std (flat signal).
    ecg = (ecg - np.mean(ecg)) / (np.std(ecg) + 1e-6)

    # find_peaks requires distance >= 1, hence the floor at 1 for tiny fs.
    min_peak_distance = max(1, int(0.5 * fs))

    X_seq = []
    y_seq = []

    # The "+ 1" keeps the last full window: a window starting at
    # len(df) - seq_length ends exactly on the final sample and is valid.
    for start in range(0, len(df) - seq_length + 1, step):
        stop = start + seq_length

        # Shape (seq_length, 1): a single ECG channel per time step.
        X_window = ecg[start:stop].reshape(seq_length, 1)

        abp_window = abp[start:stop]
        abp_peaks, _ = find_peaks(abp_window, distance=min_peak_distance)
        abp_troughs, _ = find_peaks(-abp_window, distance=min_peak_distance)

        if len(abp_peaks) == 0 or len(abp_troughs) == 0:
            continue

        mean_sbp = np.mean(abp_window[abp_peaks])
        mean_dbp = np.mean(abp_window[abp_troughs])

        # Plausibility filter on the targets.
        if 10 < mean_sbp < 300 and 10 < mean_dbp < 200:
            X_seq.append(X_window)
            y_seq.append([mean_sbp, mean_dbp])

    return np.array(X_seq), np.array(y_seq)

# --- Main Data Processing Loop ---
def load_and_process(zip_path, extract_folder, mode='features', seq_length=250, debug_limit=None, step=125):
    """Extract a zip of CSV recordings and build model inputs from them.

    Args:
        zip_path: Path to the .zip archive of recordings.
        extract_folder: Directory the archive is extracted into.
        mode: 'features' to build per-beat feature rows via
            ``get_features_and_labels``, or 'sequence' to build fixed-length
            windows via ``create_sequences``.
        seq_length: Window length in samples (used in 'sequence' mode only).
        debug_limit: If not None, only the first ``debug_limit`` files of the
            extracted list are processed.
        step: Offset in samples between consecutive windows (used in
            'sequence' mode only).

    Returns:
        A tuple ``(all_X, all_y)`` with the per-file results concatenated
        along axis 0. Two empty 1-D arrays are returned when no file could be
        found or no file produced any sample.

    Raises:
        ValueError: If ``mode`` is neither 'features' nor 'sequence'.
    """
    # Fail fast: an unknown mode would otherwise extract and read every file
    # only to report that no data was found.
    if mode not in ('features', 'sequence'):
        raise ValueError(
            f"Unknown mode '{mode}'; expected 'features' or 'sequence'."
        )

    file_list = unzip_data(zip_path, extract_folder)

    # Debug aid: restrict processing to a small slice of the file list.
    if debug_limit is not None:
        file_list = file_list[:debug_limit]
        print(f"--- DEBUG MODE: Processing only {len(file_list)} files. ---")

    if not file_list:
        return np.array([]), np.array([])

    # 't_sec' is required to be present even though only ECG and ABP are read.
    required_columns = ('t_sec', 'ECG', 'ABP')

    all_X = []
    all_y = []

    for f in tqdm(file_list, desc=f"Processing {zip_path}"):
        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(f)
        except Exception as e:
            print(f"Could not read {f}: {e}")
            continue

        if not all(col in df.columns for col in required_columns):
            print(f"Skipping {f}: missing required columns (t_sec, ECG, ABP).")
            continue

        if mode == 'features':
            X, y = get_features_and_labels(df)
        else:  # mode == 'sequence' (validated above)
            X, y = create_sequences(df, seq_length=seq_length, step=step)

        # Files that yield no samples are skipped so concatenation stays valid.
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)

    if not all_X:
        print(f"No valid data found in {zip_path} for mode '{mode}'.")
        return np.array([]), np.array([])

    all_X = np.concatenate(all_X, axis=0)
    all_y = np.concatenate(all_y, axis=0)

    print(f"Finished processing {zip_path}. Found {all_X.shape[0]} samples.")
    return all_X, all_y

In [None]:
# !!! EDIT THESE PATHS !!!
# Locations, inside your mounted Google Drive, of the zip archives holding the
# training, validation and test recordings. The baseline cells below read
# these three names.
train_zip_path = '/content/drive/MyDrive/11785FinalData/train.zip'
val_zip_path = '/content/drive/MyDrive/11785FinalData/val.zip'
test_zip_path = '/content/drive/MyDrive/11785FinalData/test.zip'

In [None]:
# === 4. Baseline 1: Linear Regression (from Drive) ===
# Uses train_zip_path / val_zip_path / test_zip_path from the paths cell above.

# --------------------
# Debug mode: when True, only the first DEBUG_FILE_LIMIT files of each split
# are processed, which keeps this cell fast while iterating.
DEBUG_MODE = True
DEBUG_FILE_LIMIT = 100
# --------------------

print("--- Starting Baseline 1: Linear Regression ---")

# 1. Set the debug limit (None means "process every file")
limit = DEBUG_FILE_LIMIT if DEBUG_MODE else None

# 2. Load and process data (per-beat ECG features, SBP/DBP labels)
X_train_feat, y_train_feat = load_and_process(train_zip_path, 'data/train', mode='features', debug_limit=limit)
X_val_feat, y_val_feat = load_and_process(val_zip_path, 'data/val', mode='features', debug_limit=limit)
X_test_feat, y_test_feat = load_and_process(test_zip_path, 'data/test', mode='features', debug_limit=limit)

# load_and_process returns empty 1-D arrays when a split has no samples; the
# scaler and the column indexing below both need 2-D data, so bail out early.
if X_train_feat.shape[0] == 0:
    print("No training data found for feature-based model. Aborting.")
elif X_test_feat.shape[0] == 0:
    print("No test data found for feature-based model. Aborting.")
else:
    # 3. Scale features (statistics are fitted on the training split only)
    feature_scaler = StandardScaler()
    X_train_scaled = feature_scaler.fit_transform(X_train_feat)
    X_test_scaled = feature_scaler.transform(X_test_feat)
    # The validation split is not needed to fit this baseline, so an empty
    # one is tolerated instead of crashing the scaler.
    if X_val_feat.shape[0] > 0:
        X_val_scaled = feature_scaler.transform(X_val_feat)
    else:
        X_val_scaled = X_val_feat

    # 4. Train model (one multi-output regression for SBP and DBP)
    print("Training Linear Regression model...")
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_scaled, y_train_feat)

    # 5. Evaluate on Test Set
    print("Evaluating Linear Regression on test set...")
    y_pred_feat = lin_reg.predict(X_test_scaled)

    # Column 0 is SBP, column 1 is DBP.
    sbp_true = y_test_feat[:, 0]
    dbp_true = y_test_feat[:, 1]
    sbp_pred = y_pred_feat[:, 0]
    dbp_pred = y_pred_feat[:, 1]

    sbp_errors = sbp_true - sbp_pred
    dbp_errors = dbp_true - dbp_pred

    # MAE plus standard deviation of the signed error, per target.
    sbp_mae = mean_absolute_error(sbp_true, sbp_pred)
    sbp_sd = np.std(sbp_errors)
    dbp_mae = mean_absolute_error(dbp_true, dbp_pred)
    dbp_sd = np.std(dbp_errors)

    # 6. Report Results
    print("\n--- Linear Regression Test Results (ECG-Only) ---")
    print(f"SBP MAE: {sbp_mae:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"SBP SD:  {sbp_sd:.2f} mmHg (BHS Grade A: <= 8)")
    print(f"DBP MAE: {dbp_mae:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"DBP SD:  {dbp_sd:.2f} mmHg (BHS Grade A: <= 8)")
    print("-------------------------------------------------")

--- Starting Baseline 1: Linear Regression ---
--- Starting Baseline 1: Linear Regression ---
Extracted 441206 files from /content/drive/MyDrive/11785FinalData/train.zip
--- DEBUG MODE: Processing only 100 files. ---


Processing /content/drive/MyDrive/11785FinalData/train.zip:   0%|          | 0/100 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/train.zip. Found 552 samples.
Extracted 441185 files from /content/drive/MyDrive/11785FinalData/val.zip
--- DEBUG MODE: Processing only 100 files. ---


Processing /content/drive/MyDrive/11785FinalData/val.zip:   0%|          | 0/100 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/val.zip. Found 530 samples.
Extracted 441197 files from /content/drive/MyDrive/11785FinalData/test.zip
--- DEBUG MODE: Processing only 100 files. ---


Processing /content/drive/MyDrive/11785FinalData/test.zip:   0%|          | 0/100 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/test.zip. Found 611 samples.
Training Linear Regression model...
Evaluating Linear Regression on test set...

--- Linear Regression Test Results ---
SBP MAE: 20.86 mmHg (BHS Grade A: <= 5)
SBP SD:  23.93 mmHg (BHS Grade A: <= 8)
DBP MAE: 12.66 mmHg (BHS Grade A: <= 5)
DBP SD:  13.76 mmHg (BHS Grade A: <= 8)
----------------------------------------


In [None]:
# === 5. Baseline 2: LSTM Model (from Drive) ===

print("\n--- Starting Baseline 2: LSTM Model ---")

# 1. Define Model Parameters
SEQ_LENGTH = 250  # 2 seconds of data at 125 Hz
# NOTE: STEP is not forwarded to load_and_process below; create_sequences
# falls back to its own default step of 125, which matches this value.
STEP = 125        # 1 second step (50% overlap)
NUM_FEATURES = 1  # ECG only
NUM_OUTPUTS = 2   # SBP, DBP
BATCH_SIZE = 64
EPOCHS = 20       # Keep low for a baseline test. Increase for real run.

# 2. Build windowed ECG sequences from the archives named in the paths cell.
# No debug_limit is passed, so every file in each archive is processed.
X_train_seq, y_train_seq = load_and_process(train_zip_path, 'data/train', mode='sequence', seq_length=SEQ_LENGTH)
X_val_seq, y_val_seq = load_and_process(val_zip_path, 'data/val', mode='sequence', seq_length=SEQ_LENGTH)
X_test_seq, y_test_seq = load_and_process(test_zip_path, 'data/test', mode='sequence', seq_length=SEQ_LENGTH)

# load_and_process returns empty 1-D arrays when a split has no samples.
# Training needs a validation split (early stopping monitors val_loss) and the
# evaluation indexes the test labels by column, so stop here if any is empty.
if X_train_seq.shape[0] == 0:
    print("No training data found for sequence-based model. Aborting.")
elif X_val_seq.shape[0] == 0 or X_test_seq.shape[0] == 0:
    print("No validation or test data found for sequence-based model. Aborting.")
else:
    print(f"Training data shape: {X_train_seq.shape}")
    print(f"Training labels shape: {y_train_seq.shape}")

    # 3. Define LSTM Model Architecture
    # Input is one ECG channel per time step: shape (SEQ_LENGTH, NUM_FEATURES).
    model = Sequential([
        Input(shape=(SEQ_LENGTH, NUM_FEATURES)),
        Bidirectional(LSTM(64, return_sequences=False)),
        Dense(32, activation='relu'),
        Dense(NUM_OUTPUTS) # Linear activation for regression
    ])

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    # 4. Train Model
    print("\nTraining LSTM model...")
    # Stop once val_loss has not improved for 3 epochs and restore the
    # weights from the best epoch.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train_seq, y_train_seq,
        validation_data=(X_val_seq, y_val_seq),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping],
        verbose=1
    )

    # 5. Evaluate on Test Set
    print("\nEvaluating LSTM on test set...")
    y_pred_seq = model.predict(X_test_seq)

    # Column 0 is SBP, column 1 is DBP.
    sbp_true_seq = y_test_seq[:, 0]
    dbp_true_seq = y_test_seq[:, 1]
    sbp_pred_seq = y_pred_seq[:, 0]
    dbp_pred_seq = y_pred_seq[:, 1]

    sbp_errors_seq = sbp_true_seq - sbp_pred_seq
    dbp_errors_seq = dbp_true_seq - dbp_pred_seq

    # MAE plus standard deviation of the signed error, per target.
    sbp_mae_seq = mean_absolute_error(sbp_true_seq, sbp_pred_seq)
    sbp_sd_seq = np.std(sbp_errors_seq)
    dbp_mae_seq = mean_absolute_error(dbp_true_seq, dbp_pred_seq)
    dbp_sd_seq = np.std(dbp_errors_seq)

    # 6. Report Results
    print("\n--- LSTM Model Test Results (ECG-Only) ---")
    print(f"SBP MAE: {sbp_mae_seq:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"SBP SD:  {sbp_sd_seq:.2f} mmHg (BHS Grade A: <= 8)")
    print(f"DBP MAE: {dbp_mae_seq:.2f} mmHg (BHS Grade A: <= 5)")
    print(f"DBP SD:  {dbp_sd_seq:.2f} mmHg (BHS Grade A: <= 8)")
    print("------------------------------------------")


# === 6. Save a Trained Model to Your Drive ===
# Writes the trained LSTM to Google Drive so it outlives the Colab runtime.
# Requires Drive to be mounted and a variable named 'model' to exist.

# First, create a path to a folder in your Google Drive
# You can change 'MyProject' to any folder name you want.
save_folder = '/content/drive/My Drive/MyProject'
os.makedirs(save_folder, exist_ok=True)

# Define the full path to save your model file
model_save_path = os.path.join(save_folder, 'baseline_lstm_model_ECG_ONLY.keras') # Modified name

# Save the model (assuming your model variable is named 'model')
try:
    model.save(model_save_path)
    print(f"Model successfully saved to: {model_save_path}")
except NameError:
    # 'model' is undefined, e.g. because the training cell aborted or was
    # never run.
    print("Could not save model. Make sure you have trained the model and it is in a variable named 'model'.")
except Exception as e:
    # Any other failure while saving is reported rather than raised, so the
    # notebook keeps running.
    print(f"An error occurred while saving: {e}")


--- Starting Baseline 2: LSTM Model ---
Searching recursively for data files in data/train...
Found 441206 files from /content/drive/MyDrive/11785FinalData/train.zip (CSV: 441206, Excel: 0)


Processing /content/drive/MyDrive/11785FinalData/train.zip:   0%|          | 0/441206 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/train.zip. Found 1710341 samples.
Searching recursively for data files in data/val...
Found 441185 files from /content/drive/MyDrive/11785FinalData/val.zip (CSV: 441185, Excel: 0)


Processing /content/drive/MyDrive/11785FinalData/val.zip:   0%|          | 0/441185 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/val.zip. Found 1710083 samples.
Searching recursively for data files in data/test...
Found 441197 files from /content/drive/MyDrive/11785FinalData/test.zip (CSV: 441197, Excel: 0)


Processing /content/drive/MyDrive/11785FinalData/test.zip:   0%|          | 0/441197 [00:00<?, ?it/s]

Finished processing /content/drive/MyDrive/11785FinalData/test.zip. Found 1709981 samples.
Training data shape: (1710341, 250, 2)
Training labels shape: (1710341, 2)



Training LSTM model...
Epoch 1/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m643s[0m 24ms/step - loss: 656.0857 - val_loss: 367.0266
Epoch 2/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m631s[0m 24ms/step - loss: 328.5735 - val_loss: 255.8009
Epoch 3/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 24ms/step - loss: 244.7308 - val_loss: 210.0222
Epoch 4/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m633s[0m 24ms/step - loss: 207.4895 - val_loss: 195.0402
Epoch 5/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 24ms/step - loss: 190.5177 - val_loss: 180.2835
Epoch 6/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 24ms/step - loss: 180.3412 - val_loss: 174.2110
Epoch 7/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 24ms/step - loss: 171.9383 - val_loss: 165.8680
Epoch 8/20
[1m26725/26725[0m [32m━━━━━━━━━━━━━━━━━━━━[0