This block imports all standard libraries. We also install and import tensorflow-addons, which provides InstanceNormalization, a layer that is critical for high-quality GAN results.

In [None]:
# === 1. Install and Import Libraries ===

# Install tensorflow-addons for InstanceNormalization
!pip install -q tensorflow-addons

import os
import zipfile
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
import shutil
import time

# Sklearn for metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, LeakyReLU
from tensorflow.keras.layers import Conv1D, Conv1DTranspose
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.optimizers import Adam

# TensorFlow Addons
import tensorflow_addons as tfa

# Suppress warnings
warnings.filterwarnings('ignore')

This section contains the exact same data loading and processing functions as the previous models. Their job is to read the CSVs from the .zip files and convert them into NumPy arrays of paired (PPG_window, ECG_window) samples.

We will not change these, but in the next section, we will feed their output into a tf.data pipeline, which is necessary for training a GAN.

In [None]:
# === 2. Data Loading Functions ===

# Mount Google Drive at /content/drive so that files stored there can be read
# through ordinary file paths. The google.colab package is provided by the
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

def unzip_data(zip_path, extract_folder):
    """Extract a zip archive and list the CSV files under the target folder.

    Args:
        zip_path: Path to the ``.zip`` archive to extract.
        extract_folder: Directory to extract into; created if it is missing.

    Returns:
        A sorted list with the path of every ``.csv`` file found (recursively)
        under ``extract_folder``. An empty list is returned, after printing an
        error, when ``zip_path`` does not exist.
    """
    if not os.path.exists(zip_path):
        print(f"Error: {zip_path} not found. Check your Google Drive path.")
        return []
    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
    # glob yields paths in arbitrary (filesystem-dependent) order. Sorting makes
    # the file order, and therefore any slice taken from it, reproducible.
    csv_files = sorted(glob.glob(os.path.join(extract_folder, '**/*.csv'), recursive=True))
    print(f"Extracted {len(csv_files)} files from {zip_path}")
    return csv_files

def create_sequences_ppg_to_ecg(df, seq_length=256, step=128):
    """Cut one recording into paired, overlapping PPG/ECG windows.

    Both signals are z-score normalized over the whole recording, then sliced
    into windows of ``seq_length`` samples taken every ``step`` samples.
    A window pair is kept only if both the PPG and the ECG window have a
    standard deviation above 0.1 (this drops flat segments).

    Args:
        df: DataFrame with ``'ECG'`` and ``'PPG'`` columns.
        seq_length: Number of samples per window.
        step: Stride between the starts of consecutive windows.

    Returns:
        A tuple ``(X, y)`` of arrays with shape ``(n_windows, seq_length, 1)``;
        ``X`` holds the PPG windows (input) and ``y`` the ECG windows (target).
        When no window qualifies, both arrays have shape ``(0, seq_length, 1)``.
    """
    ecg = df['ECG'].values
    ppg = df['PPG'].values

    # Normalize signals individually (epsilon avoids division by zero on
    # constant recordings).
    ecg = (ecg - np.mean(ecg)) / (np.std(ecg) + 1e-6)
    ppg = (ppg - np.mean(ppg)) / (np.std(ppg) + 1e-6)

    X_seq = []
    y_seq = []

    # "+ 1" keeps the last window when it ends exactly on the final sample
    # (e.g. a recording of exactly seq_length samples yields one window).
    for i in range(0, len(df) - seq_length + 1, step):
        end_idx = i + seq_length

        X_window = ppg[i:end_idx]
        y_window = ecg[i:end_idx]

        if np.std(X_window) > 0.1 and np.std(y_window) > 0.1:
            X_seq.append(X_window)
            y_seq.append(y_window)

    if not X_seq:
        # Keep a consistent rank/shape so the result can be concatenated with
        # non-empty outputs; callers can still test ``X.shape[0] > 0``.
        return np.empty((0, seq_length, 1)), np.empty((0, seq_length, 1))

    # Add a "channels" dimension
    return np.expand_dims(np.array(X_seq), -1), np.expand_dims(np.array(y_seq), -1)

def load_and_process(zip_path, extract_folder, seq_length=256, debug_limit=None, step=128):
    """Extract a zip of CSV recordings and turn them into paired windows.

    Each CSV is read with pandas, checked for the required columns, and cut
    into PPG/ECG windows by ``create_sequences_ppg_to_ecg``. Files that cannot
    be read or that lack the required columns are reported and skipped.

    Args:
        zip_path: Path to the ``.zip`` archive holding the CSV files.
        extract_folder: Directory the archive is extracted into.
        seq_length: Number of samples per window.
        debug_limit: If not ``None``, only the first ``debug_limit`` files are
            processed.
        step: Stride between the starts of consecutive windows.

    Returns:
        A tuple ``(all_X, all_y)`` with the windows of all files concatenated
        along axis 0, or two empty arrays when no file produced any window.
    """
    file_list = unzip_data(zip_path, extract_folder)
    if debug_limit is not None:
        file_list = file_list[:debug_limit]
        print(f"--- DEBUG MODE: Processing only {len(file_list)} files. ---")

    if not file_list:
        return np.array([]), np.array([])

    required_columns = ['t_sec', 'ECG', 'PPG', 'ABP']
    all_X, all_y = [], []

    for f in tqdm(file_list, desc=f"Processing {zip_path}"):
        try:
            df = pd.read_csv(f)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Could not read {f}: {e}")
            continue
        if not all(col in df.columns for col in required_columns):
            print(f"Skipping {f}: missing required columns.")
            continue

        X, y = create_sequences_ppg_to_ecg(df, seq_length=seq_length, step=step)
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)

    if not all_X:
        print(f"No valid data found in {zip_path} for sequence mode.")
        return np.array([]), np.array([])

    all_X = np.concatenate(all_X, axis=0)
    all_y = np.concatenate(all_y, axis=0)
    print(f"Finished processing {zip_path}. Found {all_X.shape[0]} samples.")
    return all_X, all_y

This is a new, crucial step. We train the GAN with a custom training loop, so we feed it from a tf.data pipeline, which handles shuffling, batching, and prefetching efficiently. This code:

Sets your file paths (you must edit these!).

Loads the data into NumPy arrays using the functions from Section 2.

Converts these NumPy arrays into tf.data.Dataset objects.

Applies .shuffle(), .batch(), and .prefetch() to create a high-performance, GPU-ready data pipeline.

In [None]:
# === 3. Configuration & tf.data Pipeline ===

# --- Hyperparameters ---
SEQ_LENGTH = 256    # Samples per PPG/ECG window
# NOTE: STEP is not forwarded to load_and_process below, so the windowing
# stride is the default of create_sequences_ppg_to_ecg.
STEP = 128
BUFFER_SIZE = 1000  # For shuffling
BATCH_SIZE = 1      # CycleGANs are often trained with batch_size=1
EPOCHS = 20

# --- Define Paths ---
# !!! EDIT THESE PATHS !!!
train_zip_path = '/content/drive/MyDrive/11785FinalData/train.zip'
val_zip_path = '/content/drive/MyDrive/11785FinalData/val.zip'  # Not loaded in this section
test_zip_path = '/content/drive/MyDrive/11785FinalData/test.zip'

# 1. Load data into NumPy arrays
print("--- Loading Training Data ---")
X_train_ppg, y_train_ecg = load_and_process(train_zip_path, 'data/train', seq_length=SEQ_LENGTH)
print("--- Loading Test Data ---")
X_test_ppg, y_test_ecg = load_and_process(test_zip_path, 'data/test', seq_length=SEQ_LENGTH)

if X_train_ppg.shape[0] > 0:
    # 2. Create tf.data.Dataset
    # Cast to float32 up front: it halves the in-memory size of the tensors
    # and matches the float32 dtype Keras layers compute in by default.
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X_train_ppg.astype(np.float32), y_train_ecg.astype(np.float32)))

    # 3. Optimize pipeline
    train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

    print("\n--- tf.data pipeline created ---")
    print(f"Train Dataset: {train_dataset}")

    # Only build the test pipeline when test data exists; an empty dataset
    # would otherwise make the evaluation report NaN metrics.
    if X_test_ppg.shape[0] > 0:
        test_dataset = tf.data.Dataset.from_tensor_slices(
            (X_test_ppg.astype(np.float32), y_test_ecg.astype(np.float32)))
        test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
        print(f"Test Dataset: {test_dataset}")
    else:
        print("--- WARNING: No test data was loaded. Evaluation will be skipped. ---")
else:
    print("--- ERROR: No training data was loaded. Aborting. ---")

Here we define the four models required for a CycleGAN:

Generator (U-Net): A 1D U-Net architecture, similar to before. It uses InstanceNormalization instead of BatchNormalization. We will use the same build_generator function for both G: PPG->ECG and F: ECG->PPG.

Discriminator (PatchGAN): A 1D CNN that acts as a classifier. It doesn't output a single "real/fake" (0/1) decision. Instead, it outputs a sequence of predictions, one per patch of the input (here, shape (32, 1) for a 256-sample window). This "PatchGAN" approach is more stable. We will use build_discriminator for both D_X (PPG) and D_Y (ECG).

In [None]:
# === 4. Model Architecture ===

# Use InstanceNormalization
# NOTE(review): TensorFlow Addons is, to my knowledge, no longer maintained;
# tf.keras.layers.GroupNormalization(groups=-1) is the documented equivalent
# of instance normalization -- confirm against the TF version in use.
InstanceNorm = tfa.layers.InstanceNormalization
# Number of channels produced by the generator's final layer.
OUTPUT_CHANNELS = 1
# Model input shape: (window length, channels).
INPUT_SHAPE = (SEQ_LENGTH, 1)

# Weight initializer for GANs
init = RandomNormal(stddev=0.02)

def downsample(filters, size, apply_instancenorm=True):
    """Build a stride-2 1D encoder block: Conv1D -> (InstanceNorm) -> LeakyReLU.

    Args:
        filters: Number of convolution filters.
        size: Convolution kernel size.
        apply_instancenorm: Whether to insert instance normalization after
            the convolution.

    Returns:
        A ``tf.keras.Sequential`` holding the block's layers.
    """
    conv = Conv1D(
        filters,
        size,
        strides=2,
        padding='same',
        kernel_initializer=init,
        use_bias=False,
    )
    block_layers = [conv]
    if apply_instancenorm:
        block_layers.append(InstanceNorm())
    block_layers.append(LeakyReLU())
    return tf.keras.Sequential(block_layers)

def upsample(filters, size, apply_dropout=False):
    """Build a stride-2 1D decoder block.

    Layer order: Conv1DTranspose -> InstanceNorm -> (Dropout 0.5) -> LeakyReLU.

    Args:
        filters: Number of transposed-convolution filters.
        size: Convolution kernel size.
        apply_dropout: Whether to insert a ``Dropout(0.5)`` layer after the
            normalization.

    Returns:
        A ``tf.keras.Sequential`` holding the block's layers.
    """
    deconv = Conv1DTranspose(
        filters,
        size,
        strides=2,
        padding='same',
        kernel_initializer=init,
        use_bias=False,
    )
    block_layers = [deconv, InstanceNorm()]
    if apply_dropout:
        block_layers.append(Dropout(0.5))
    block_layers.append(LeakyReLU())
    return tf.keras.Sequential(block_layers)

def build_generator(input_shape=INPUT_SHAPE, name="generator", output_activation='tanh'):
    """Build the 1D U-Net generator.

    The encoder halves the sequence length five times (four ``downsample``
    blocks plus the bottleneck convolution); the decoder doubles it five
    times, concatenating the matching encoder output after each of the first
    four ``upsample`` blocks (skip connections).

    Args:
        input_shape: ``(sequence_length, channels)`` of the input signal.
            A known sequence length must be divisible by 32, otherwise the
            skip connections would join tensors of different lengths.
        name: Name given to the Keras model.
        output_activation: Activation of the final layer. The default
            ``'tanh'`` bounds outputs to [-1, 1]; pass ``None`` for an
            unbounded (linear) output.
            NOTE(review): create_sequences_ppg_to_ecg z-scores the signals, so
            targets can fall outside [-1, 1] -- confirm that tanh is intended.

    Returns:
        A ``tf.keras.Model`` mapping ``input_shape`` to
        ``(sequence_length, OUTPUT_CHANNELS)``.

    Raises:
        ValueError: If the sequence length is known and not divisible by 32.
    """
    n_halvings = 5  # four downsample blocks + the stride-2 bottleneck
    seq_len = input_shape[0]
    if seq_len is not None and seq_len % (2 ** n_halvings) != 0:
        raise ValueError(
            f"Sequence length must be divisible by {2 ** n_halvings} "
            f"for the U-Net skip connections, got {seq_len}."
        )

    inputs = Input(shape=input_shape)

    # Downsampling path; each output is kept for its skip connection.
    # For a 256-sample input the lengths are 128, 64, 32, 16.
    skips = []
    x = inputs
    for idx, filters in enumerate((64, 128, 256, 512)):
        # The first block has no normalization.
        x = downsample(filters, 4, apply_instancenorm=(idx != 0))(x)
        skips.append(x)

    # Bottleneck (length 8 for a 256-sample input)
    x = Conv1D(512, 4, strides=2, padding='same', kernel_initializer=init)(x)
    x = LeakyReLU()(x)

    # Upsampling path; skips are consumed deepest-first.
    for filters, skip in zip((512, 256, 128, 64), reversed(skips)):
        x = upsample(filters, 4)(x)
        x = Concatenate()([x, skip])  # Skip connection

    # Final layer restores the original sequence length.
    last = Conv1DTranspose(OUTPUT_CHANNELS, 4, strides=2, padding='same',
                           kernel_initializer=init, activation=output_activation)(x)

    return Model(inputs, last, name=name)

def build_discriminator(input_shape=INPUT_SHAPE, name="discriminator"):
    """Build the 1D PatchGAN discriminator.

    Three stride-2 convolutions shrink the sequence by a factor of 8
    (256 -> 32 for the default input); a final stride-1 convolution emits one
    un-activated score (logit) per remaining position.

    Args:
        input_shape: ``(sequence_length, channels)`` of the input signal.
        name: Name given to the Keras model.

    Returns:
        A ``tf.keras.Model`` producing a sequence of per-patch logits.
    """
    inputs = Input(shape=input_shape)

    # First block: biased convolution, no normalization.
    x = Conv1D(64, 4, strides=2, padding='same', kernel_initializer=init)(inputs)
    x = LeakyReLU(0.2)(x)

    # Remaining strided blocks: Conv -> InstanceNorm -> LeakyReLU.
    for n_filters in (128, 256):
        x = Conv1D(n_filters, 4, strides=2, padding='same',
                   kernel_initializer=init, use_bias=False)(x)
        x = InstanceNorm()(x)
        x = LeakyReLU(0.2)(x)

    # Patch output: one logit per position.
    logits = Conv1D(1, 4, strides=1, padding='same', kernel_initializer=init)(x)

    return Model(inputs, logits, name=name)

This is the most complex part of the CycleGAN. We define:

Four Optimizers: One for each of our four networks.

Loss Objects: BinaryCrossentropy for the adversarial (GAN) loss and MeanAbsoluteError (L1) for the cycle and identity losses.

Loss Functions:

discriminator_loss: Tries to make the discriminator output 1 for real signals and 0 for fake signals.

generator_loss: Tries to make the discriminator output 1 for the generator's fake signals.

calc_cycle_loss: The L1 error between a real signal and the reconstructed signal.

calc_identity_loss: (Optional but good) The L1 error when a generator receives a signal from its target domain (e.g., the PPG->ECG generator gets a real ECG). It should learn to do nothing.

In [None]:
# === 5. Loss Functions & Optimizers ===

# --- Loss Weights ---
# Weight applied to the L1 cycle-consistency (reconstruction) loss.
LAMBDA_CYCLE = 10.0
# Weight applied to the L1 identity loss (a generator fed a signal from its
# own target domain should return it unchanged); half the cycle weight.
LAMBDA_IDENTITY = 0.5 * LAMBDA_CYCLE

# --- Optimizers ---
# We need four separate optimizers, one per network, all with the same
# Adam settings (learning rate 2e-4, beta_1 0.5).
generator_g_optimizer = Adam(2e-4, beta_1=0.5) # G: PPG -> ECG
generator_f_optimizer = Adam(2e-4, beta_1=0.5) # F: ECG -> PPG
discriminator_x_optimizer = Adam(2e-4, beta_1=0.5) # D_X: Distinguishes real/fake PPG
discriminator_y_optimizer = Adam(2e-4, beta_1=0.5) # D_Y: Distinguishes real/fake ECG

# --- Loss Objects ---
# from_logits=True: the discriminator's final layer has no activation, so its
# outputs are raw logits.
loss_obj = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Mean absolute error, used for the cycle and identity losses.
L1_loss = tf.keras.losses.MeanAbsoluteError()

def discriminator_loss(real, fake):
    """Return the discriminator's adversarial loss.

    Args:
        real: Discriminator logits for real signals (target label 1).
        fake: Discriminator logits for generated signals (target label 0).

    Returns:
        Half the sum of the two binary cross-entropy terms.
    """
    real_targets = tf.ones_like(real)
    fake_targets = tf.zeros_like(fake)
    loss_on_real = loss_obj(real_targets, real)
    loss_on_fake = loss_obj(fake_targets, fake)
    combined = loss_on_real + loss_on_fake
    return combined * 0.5

def generator_loss(generated):
    """Return the generator's adversarial loss.

    Args:
        generated: Discriminator logits for the generator's output.

    Returns:
        Binary cross-entropy of the logits against an all-ones target, i.e.
        the loss is low when the discriminator rates the fakes as real.
    """
    wants_real = tf.ones_like(generated)
    return loss_obj(wants_real, generated)

def calc_cycle_loss(real_image, cycled_image):
    """Return the weighted cycle-consistency loss.

    Args:
        real_image: The original signal.
        cycled_image: The signal after a round trip through both generators.

    Returns:
        ``LAMBDA_CYCLE`` times the mean absolute error between the two.
    """
    return LAMBDA_CYCLE * L1_loss(real_image, cycled_image)

def calc_identity_loss(real_image, same_image):
    """Return the weighted identity loss.

    Args:
        real_image: A real signal from a generator's target domain.
        same_image: That generator's output when fed ``real_image``.

    Returns:
        ``LAMBDA_IDENTITY`` times the mean absolute error between the two.
    """
    return LAMBDA_IDENTITY * L1_loss(real_image, same_image)

This block instantiates all four of our models and creates Checkpoint objects to save our progress during training.

In [None]:
# === 6. Build Models & Checkpoints ===

# Nothing to build unless the data pipeline cell produced a training set.
if 'train_dataset' not in locals():
    print("Skipping Section 6: train_dataset not found.")
else:
    # --- Instantiate Models ---
    generator_g = build_generator(name="generator_g")              # G: PPG -> ECG
    generator_f = build_generator(name="generator_f")              # F: ECG -> PPG
    discriminator_x = build_discriminator(name="discriminator_x")  # D_X: judges PPG
    discriminator_y = build_discriminator(name="discriminator_y")  # D_Y: judges ECG

    print("--- Models Built ---")

    # --- Checkpoint Saver ---
    # Track all four networks together with their optimizers.
    checkpoint_path = "./checkpoints/train"
    tracked_objects = {
        'generator_g': generator_g,
        'generator_f': generator_f,
        'discriminator_x': discriminator_x,
        'discriminator_y': discriminator_y,
        'generator_g_optimizer': generator_g_optimizer,
        'generator_f_optimizer': generator_f_optimizer,
        'discriminator_x_optimizer': discriminator_x_optimizer,
        'discriminator_y_optimizer': discriminator_y_optimizer,
    }
    ckpt = tf.train.Checkpoint(**tracked_objects)
    # Only the five most recent checkpoints are kept on disk.
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    print(f"Checkpoints will be saved to {checkpoint_path}")

This is the most important function. We wrap it in @tf.function to compile it into a high-performance TensorFlow graph.

This function performs one full step of training for all four networks:

Forward Pass: Generates fake signals, cycled signals, and identity signals.

Loss Calculation: Calculates all losses (GAN, cycle, and identity) for both generators.

Discriminator Loss: Calculates the losses for both discriminators.

Gradients: Calculates the gradients for all four networks based on their respective losses.

Apply Gradients: Applies the gradients to update the weights of all four networks using their optimizers.

In [None]:
# === 7. The Custom train_step ===

@tf.function
def train_step(real_x, real_y):
    """Run one optimization step for both generators and both discriminators.

    Args:
        real_x: Batch of real PPG windows.
        real_y: Batch of real ECG windows.

    Returns:
        A tuple ``(total_gen_g_loss, total_gen_f_loss, disc_x_loss,
        disc_y_loss, total_cycle_loss)`` of loss tensors for this step.
    """
    # real_x is PPG, real_y is ECG
    # persistent=True because tape.gradient is called four times below,
    # once per network.
    with tf.GradientTape(persistent=True) as tape:

        # --- Generator G (PPG -> ECG) ---
        fake_y = generator_g(real_x, training=True)
        cycled_x = generator_f(fake_y, training=True)

        # --- Generator F (ECG -> PPG) ---
        fake_x = generator_f(real_y, training=True)
        cycled_y = generator_g(fake_x, training=True)

        # --- Identity mapping ---
        # G should not change an ECG signal
        same_y = generator_g(real_y, training=True)
        # F should not change a PPG signal
        same_x = generator_f(real_x, training=True)

        # --- Discriminator decisions ---
        disc_real_x = discriminator_x(real_x, training=True) # D_X on real PPG
        disc_real_y = discriminator_y(real_y, training=True) # D_Y on real ECG
        disc_fake_x = discriminator_x(fake_x, training=True) # D_X on fake PPG
        disc_fake_y = discriminator_y(fake_y, training=True) # D_Y on fake ECG

        # --- Generator Losses ---
        gen_g_loss = generator_loss(disc_fake_y) # G wants D_Y to think fake_y is real
        gen_f_loss = generator_loss(disc_fake_x) # F wants D_X to think fake_x is real

        # --- Cycle Losses ---
        # Sum of both round trips: PPG -> ECG -> PPG and ECG -> PPG -> ECG.
        total_cycle_loss = calc_cycle_loss(real_x, cycled_x) + calc_cycle_loss(real_y, cycled_y)

        # --- Total Generator Loss ---
        # Each generator gets its adversarial term, the shared cycle term and
        # its own identity term.
        total_gen_g_loss = gen_g_loss + total_cycle_loss + calc_identity_loss(real_y, same_y)
        total_gen_f_loss = gen_f_loss + total_cycle_loss + calc_identity_loss(real_x, same_x)

        # --- Discriminator Losses ---
        disc_x_loss = discriminator_loss(disc_real_x, disc_fake_x)
        disc_y_loss = discriminator_loss(disc_real_y, disc_fake_y)

    # --- Calculate Gradients ---
    # Each loss is differentiated only with respect to its own network's
    # variables.
    generator_g_gradients = tape.gradient(total_gen_g_loss, generator_g.trainable_variables)
    generator_f_gradients = tape.gradient(total_gen_f_loss, generator_f.trainable_variables)
    discriminator_x_gradients = tape.gradient(disc_x_loss, discriminator_x.trainable_variables)
    discriminator_y_gradients = tape.gradient(disc_y_loss, discriminator_y.trainable_variables)

    # Apply gradients
    generator_g_optimizer.apply_gradients(zip(generator_g_gradients, generator_g.trainable_variables))
    generator_f_optimizer.apply_gradients(zip(generator_f_gradients, generator_f.trainable_variables))
    discriminator_x_optimizer.apply_gradients(zip(discriminator_x_gradients, discriminator_x.trainable_variables))
    discriminator_y_optimizer.apply_gradients(zip(discriminator_y_gradients, discriminator_y.trainable_variables))

    return total_gen_g_loss, total_gen_f_loss, disc_x_loss, disc_y_loss, total_cycle_loss

print("--- train_step function defined ---")

This is the main loop where we run the training. It iterates for a set number of epochs.

Warning: GANs take a long time to train. 20 epochs might take several hours. You may want to start with 1 or 2 epochs to ensure it works.

We iterate through our train_dataset, calling train_step for each batch.

We print the losses every 200 steps.

At the end of each epoch, we save a checkpoint.

In [None]:
# === 8. The Training Loop ===

# Training needs the dataset built by the data pipeline cell.
if 'train_dataset' not in locals():
    print("Skipping Section 8: train_dataset not found.")
else:
    print(f"--- Starting Training for {EPOCHS} epochs ---")
    print(f"Batch size: {BATCH_SIZE}, Steps per epoch: {len(X_train_ppg) // BATCH_SIZE}")

    for epoch in range(EPOCHS):
        start = time.time()
        print(f"Epoch {epoch + 1}/{EPOCHS}")

        # n counts the steps taken within the current epoch.
        n = 0
        for real_ppg, real_ecg in tqdm(train_dataset, desc="Epoch Progress"):
            # One update of all four networks on this batch.
            g_loss, f_loss, dx_loss, dy_loss, cycle_loss = train_step(real_ppg, real_ecg)

            # Report the losses on every 200th step (including step 0).
            if not n % 200:
                print(f"  Step {n}: G_loss={g_loss:.4f}, F_loss={f_loss:.4f}, D_X_loss={dx_loss:.4f}, D_Y_loss={dy_loss:.4f}, Cycle_loss={cycle_loss:.4f}")
            n += 1

        # Persist the networks and optimizers once per epoch.
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
        print(f'Time for epoch {epoch + 1} is {time.time()-start:.2f} sec\n')

    print("--- Training Finished ---")

After training, you can use this block to evaluate your model.

We run the generator_g (PPG -> ECG) on the entire test set.

We calculate the Mean Absolute Error (MAE) and Mean Squared Error (MSE) between the generated ECG and the real ECG.

This gives you the final quantitative metrics for your report.

(You can also add code here to plot a few examples using matplotlib.)

In [None]:
# === 9. Evaluation & Inference ===

if 'test_dataset' in locals():
    print("--- Evaluating model on Test Set ---")

    # Build the MSE loss object once instead of once per batch.
    mse_loss = tf.keras.losses.MeanSquaredError()

    all_maes = []
    all_mses = []
    batch_sizes = []  # Used to weight the per-batch means correctly

    # Iterate through the test dataset
    for test_ppg, test_ecg in tqdm(test_dataset, desc="Evaluating"):
        # Generate a fake ECG from the test PPG
        # training=False is important
        pred_ecg = generator_g(test_ppg, training=False)

        # Calculate errors (each is the mean over the batch)
        mae = L1_loss(test_ecg, pred_ecg)
        mse = mse_loss(test_ecg, pred_ecg)

        all_maes.append(mae.numpy())
        all_mses.append(mse.numpy())
        batch_sizes.append(int(test_ppg.shape[0]))

    if not all_maes:
        # Averaging an empty list would silently yield NaN metrics.
        final_mae = final_mse = final_rmse = float('nan')
        print("--- WARNING: test_dataset is empty; no metrics to report. ---")
    else:
        # --- Report Final Metrics ---
        # Weight by batch size so a smaller final batch does not bias the
        # average of the per-batch means.
        final_mae = np.average(all_maes, weights=batch_sizes)
        final_mse = np.average(all_mses, weights=batch_sizes)
        final_rmse = np.sqrt(final_mse)

        print("\n--- CycleGAN Model Test Results ---")
        print(f"Test Set MAE:   {final_mae:.4f}")
        print(f"Test Set MSE:   {final_mse:.4f}")
        print(f"Test Set RMSE:  {final_rmse:.4f}")
        print("-----------------------------------")

    # You can add plotting code here
    # import matplotlib.pyplot as plt
    # ...

else:
    print("Skipping Section 9: test_dataset not found.")