# BiTimelyGPT for PhysioNet 2012 Challenge

This Google Colab notebook trains and validates BiTimelyGPT on the PhysioNet 2012 Challenge task of predicting in-hospital mortality.

In [None]:
# import libraries
import os
import torch
import numpy as np
import pandas as pd
from torch import nn, optim
from sklearn.metrics import roc_auc_score, average_precision_score


In [None]:
# Colab-only setup: this cell needs the google.colab package
import sys
from google.colab import drive

# expose Google Drive under /content/drive
drive.mount('/content/drive')

# let Python resolve the BiTimelyGPT sources (data/, models/, layers/) kept on Drive
sys.path.append('/content/drive/MyDrive/BiTimelyGPT-main/BiTimelyGPT')

Mounted at /content/drive


In [None]:
# import BiTimelyGPT modules
from data.data_pipeline import physionet_data_pipeline
from models.BiTimelyGPT import BiTimelyGPT
from layers.optimization import get_linear_schedule_with_warmup, AdamW

In [None]:
# define paths to PhysioNet 2012 data in Google Drive
# os.path.join is used because the base folder already ends with a slash;
# plain string formatting would yield a doubled separator
# (".../BiTimelyGPT-main//set-a") in every derived path
google_drive_folder = '/content/drive/MyDrive/BiTimelyGPT-main/'
set_a_directory = os.path.join(google_drive_folder, "set-a")
set_b_directory = os.path.join(google_drive_folder, "set-b")
# outcome labels are expected inside the corresponding set directory
outcomes_a_file = os.path.join(set_a_directory, "Outcomes-a.txt")
outcomes_b_file = os.path.join(set_b_directory, "Outcomes-b.txt")

In [None]:
# define PhysioNet configuration
class PhysioNetConfig:
    """Hyperparameters and runtime switches for BiTimelyGPT on PhysioNet 2012.

    Every setting is a plain instance attribute, so the object can be handed
    to the model constructor and the data pipeline as a config namespace.
    """

    def __init__(self):
        # define standard features from PhysioNet
        self.feature_list = [
            'Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol',
            'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT',
            'HR', 'K', 'Lactate', 'Mg', 'MAP', 'MechVent', 'Na', 'NIDiasABP',
            'NIMAP', 'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate',
            'SaO2', 'SysABP', 'Temp', 'TropI', 'TropT', 'Urine', 'WBC', 'Weight'
        ]

        # model parameters
        self.num_layers = 4
        self.num_heads = 4
        self.d_model = 144
        self.qk_dim = 144
        self.v_dim = 288
        self.ffn_proj_size = 576
        self.d_ff = 288
        self.dropout = 0.1
        # NOTE(review): n_output equals the number of features (37) although
        # head_type below is 'clf' for a binary task. If the classification
        # head sizes its output layer from n_output it produces 37 logits; the
        # recorded epoch-1 training loss (~3.58) is close to ln(37) ~ 3.61,
        # which would be consistent with that. Confirm against the model code.
        self.n_output = len(self.feature_list)

        # training parameters
        self.batch_size = 32
        self.learning_rate = 3e-4
        self.num_epochs = 50
        self.warmup_steps = 1000
        self.gradient_clip = 1.0

        # BiTimelyGPT specific
        self.use_bias_in_msr = False
        self.use_bias_in_mlp = True
        self.use_bias_in_msr_out = False
        self.use_default_gamma = False
        self.forward_impl = 'chunkwise'
        self.seq_len = 2880  # 48 hours * 60 minutes
        # chunk_size used to be assigned twice (12, then 60); only the final
        # value was ever visible to callers, so the dead first store is gone
        self.chunk_size = 60  # process in 1-hour chunks
        self.activation = 'gelu'
        self.head_type = 'clf'  # using (binary) classification head

        # GPU and optimization settings
        self.use_gpu = torch.cuda.is_available()
        self.use_grad_ckp = False
        self.use_grad_accum = True
        self.accum_steps = 4
        self.use_amp = False
        self.use_multi_gpu = False
        self.devices = '0'

        self.output_retentions = False

# init config
config = PhysioNetConfig()


In [None]:

# reproducibility: seed PyTorch and NumPy before any model weights are created
torch.manual_seed(42)
np.random.seed(42)

# prefer the GPU when the runtime exposes one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# build the model with the classification head, then move it to the chosen device
model = BiTimelyGPT(configs=config, head_type='clf')
model = model.to(device)


In [None]:

# load training data (set-a); the pipeline result is unpacked as (dataset, loader)
train_data_path = set_a_directory
train_outcomes_path = outcomes_a_file
train_dataset, train_loader = physionet_data_pipeline(
    train_data_path,
    train_outcomes_path,
    config,
    split='train'
)

# load validation data (set-b)
val_data_path = set_b_directory
val_outcomes_path = outcomes_b_file
val_dataset, val_loader = physionet_data_pipeline(
    val_data_path,
    val_outcomes_path,
    config,
    split='val'
)

# training setup
optimizer = AdamW(
    model.parameters(),
    lr=config.learning_rate,
    weight_decay=0.01
)

# calculate total steps for scheduler
# The training loop calls scheduler.step() once per optimizer update, i.e. once
# every `config.accum_steps` batches plus once for a trailing partial group.
# The schedule length therefore has to be counted in optimizer updates
# (ceil(batches / accum_steps) per epoch), not in batches; counting batches
# makes the schedule accum_steps times too long, so the decay never completes.
optimizer_steps_per_epoch = (len(train_loader) + config.accum_steps - 1) // config.accum_steps
num_training_steps = optimizer_steps_per_epoch * config.num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    config.warmup_steps,
    num_training_steps
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Time'] = df['Time'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Time'] = df['Time'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Time'] = df['Time'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

In [None]:


# Google Drive locations for the cached artifacts (all tagged "_v3").
# Note: these use the "My Drive" spelling, whereas the data paths earlier in
# the notebook use "MyDrive".
save_paths = {
    artifact: f"/content/drive/My Drive/{file_stem}_v3.pt"
    for artifact, file_stem in (
        ("train_dataset", "train_dataset"),
        ("train_loader", "train_loader"),
        ("val_dataset", "val_dataset"),
        ("val_loader", "val_loader"),
        ("optimizer", "optimizer_state"),
        ("scheduler", "scheduler_state"),
    )
}


In [None]:

# persist the two datasets to Drive (the DataLoaders themselves are not saved;
# the "train_loader"/"val_loader" entries of save_paths stay unused here)
torch.save(obj=train_dataset, f=save_paths["train_dataset"])
torch.save(obj=val_dataset, f=save_paths["val_dataset"])

# persist the optimizer and scheduler as state dicts
optimizer_state = optimizer.state_dict()
torch.save(obj=optimizer_state, f=save_paths["optimizer"])
scheduler_state = scheduler.state_dict()
torch.save(obj=scheduler_state, f=save_paths["scheduler"])


In [None]:
from torch.utils.data import DataLoader
from data.data_pipeline import custom_collate

# reload the cached datasets from Drive
# NOTE: weights_only=False unpickles arbitrary Python objects; only use it on
# files this notebook wrote itself, never on files from an untrusted source
train_dataset = torch.load(save_paths["train_dataset"], weights_only=False)
val_dataset   = torch.load(save_paths["val_dataset"], weights_only=False)

# reconstruct DataLoaders (using custom_collate)
train_loader = DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    collate_fn=custom_collate
)
val_loader = DataLoader(
    val_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    collate_fn=custom_collate
)

# training setup
optimizer = AdamW(
    model.parameters(),
    lr=config.learning_rate,
    weight_decay=0.01
)

# calculate total steps for scheduler
# The training loop calls scheduler.step() once per optimizer update, i.e. once
# every `config.accum_steps` batches plus once for a trailing partial group.
# The schedule length therefore has to be counted in optimizer updates
# (ceil(batches / accum_steps) per epoch), not in batches; counting batches
# makes the schedule accum_steps times too long, so the decay never completes.
optimizer_steps_per_epoch = (len(train_loader) + config.accum_steps - 1) // config.accum_steps
num_training_steps = optimizer_steps_per_epoch * config.num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    config.warmup_steps,
    num_training_steps
)

# reload optimizer and scheduler states
optimizer.load_state_dict(torch.load(save_paths["optimizer"]))
scheduler.load_state_dict(torch.load(save_paths["scheduler"]))

In [None]:
# save paths for best model
best_model_path = os.path.join("/content/drive/My Drive/", "best_model_v3.pt")

criterion = nn.CrossEntropyLoss()  # loss function

# checkpoints are only written from this 0-indexed epoch onwards (the 6th epoch)
first_checkpoint_epoch = 5

# init training metrics
best_val_loss = float('inf')
best_val_acc = 0.0


def forward_logits(batch_x, attention_mask):
    """Run one batch through the model and return the head's logits.

    Shared by the training and validation phases so both use exactly the same
    forward path. Gradient tracking is controlled by the caller (validation
    wraps the call in torch.no_grad()).

    Args:
        batch_x: input batch, already on `device`.
        attention_mask: mask for the batch, already on `device`; its second
            dimension is shortened here when it is longer than the hidden
            sequence produced by the convolutional front end.

    Returns:
        The output of model.head for the first position of the final block's
        hidden states.
    """
    hidden_states = model.conv_subsampling(batch_x)[0]
    hidden_states = model.input_projection(hidden_states)

    # The conv layer shortens the sequence (by roughly a factor of 4), so the
    # mask is subsampled with stride 4 and then trimmed to the exact hidden
    # length. Both steps happen only when the mask is longer than the hidden
    # sequence; otherwise the mask is passed through unchanged.
    # TODO: this is only an approximation of the conv's effect on the mask.
    if attention_mask.shape[1] > hidden_states.shape[1]:
        attention_mask = attention_mask[:, ::4]
        attention_mask = attention_mask[:, :hidden_states.shape[1]]

    for layer_idx, block in enumerate(model.blocks):
        block_outputs = block(
            hidden_states,
            retention_mask=attention_mask,  # pass the mask to each block
            forward_impl=config.forward_impl,
            chunk_size=config.chunk_size
        )
        hidden_states = block_outputs[0]
        if (layer_idx + 1) == model.n_layers:
            final_hidden_states = hidden_states

    # classify from the first sequence position of the final layer
    pooled = model.ln_f(final_hidden_states[:, 0, :])
    pooled = pooled.unsqueeze(1)
    return model.head(pooled)


for epoch in range(config.num_epochs):
    # ------------------
    # Training Phase
    # ------------------
    model.train()
    train_losses = []
    train_accs = []

    for i, batch_data in enumerate(train_loader):
        # unpack batch data including attention mask
        batch_x, batch_y, attention_mask = [b.to(device) for b in batch_data]

        # clear gradients every "accum_steps" steps
        if i % config.accum_steps == 0:
            optimizer.zero_grad()

        logits = forward_logits(batch_x, attention_mask)

        # compute loss and accuracy
        raw_loss = criterion(logits, batch_y)
        with torch.no_grad():
            # NOTE(review): despite its name, the value returned by
            # compute_cls_loss is logged as an accuracy percentage below —
            # confirm against the model implementation
            accuracy = model.compute_cls_loss(logits, batch_y)

        # scale loss and backprop
        scaled_loss = raw_loss / config.accum_steps
        scaled_loss.backward()

        # update weights when required
        if (i + 1) % config.accum_steps == 0 or (i + 1) == len(train_loader):
            # apply the clipping threshold declared in the config; it was
            # previously defined but never used
            nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
            optimizer.step()
            scheduler.step()

        train_losses.append(raw_loss.item())
        train_accs.append(accuracy.item())

    avg_train_loss = np.mean(train_losses)
    avg_train_acc = np.mean(train_accs)
    print(f'Epoch {epoch+1}/{config.num_epochs} -- Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_acc:.2f}%')

    # ------------------
    # Validation Phase
    # ------------------
    model.eval()
    val_losses = []
    val_accs = []

    with torch.no_grad():
        for batch_data in val_loader:
            batch_x, batch_y, attention_mask = [b.to(device) for b in batch_data]

            logits_val = forward_logits(batch_x, attention_mask)

            loss_val = criterion(logits_val, batch_y)
            accuracy_val = model.compute_cls_loss(logits_val, batch_y)

            val_losses.append(loss_val.item())
            val_accs.append(accuracy_val.item())

    avg_val_loss = np.mean(val_losses)
    avg_val_acc = np.mean(val_accs)
    print(f'Epoch {epoch+1}/{config.num_epochs} -- Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_acc:.2f}%')

    # save best model
    if epoch >= first_checkpoint_epoch and avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_val_acc = avg_val_acc
        torch.save(model.state_dict(), best_model_path)
        print("==> Best model updated/saved based on validation loss.\n")

Epoch 1/50 -- Train Loss: 3.5769, Train Accuracy: 1.93%
Epoch 1/50 -- Validation Loss: 3.2507, Validation Accuracy: 7.83%
Epoch 2/50 -- Train Loss: 2.8193, Train Accuracy: 39.10%
Epoch 2/50 -- Validation Loss: 2.0575, Validation Accuracy: 80.42%
Epoch 3/50 -- Train Loss: 1.5997, Train Accuracy: 83.28%
Epoch 3/50 -- Validation Loss: 0.9912, Validation Accuracy: 85.67%
Epoch 4/50 -- Train Loss: 0.8766, Train Accuracy: 85.95%
Epoch 4/50 -- Validation Loss: 0.6776, Validation Accuracy: 85.80%
Epoch 5/50 -- Train Loss: 0.6359, Train Accuracy: 86.08%
Epoch 5/50 -- Validation Loss: 0.5441, Validation Accuracy: 85.80%
Epoch 6/50 -- Train Loss: 0.5319, Train Accuracy: 86.15%
Epoch 6/50 -- Validation Loss: 0.4932, Validation Accuracy: 85.80%
==> Best model updated (saved to best_model.pt) based on validation loss.

Epoch 7/50 -- Train Loss: 0.4832, Train Accuracy: 86.15%
Epoch 7/50 -- Validation Loss: 0.4670, Validation Accuracy: 85.80%
==> Best model updated (saved to best_model.pt) based on va

In [None]:
# load the best model for evaluation

if os.path.exists(best_model_path):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU runtime still loads on a CPU-only runtime
    best_state_dict = torch.load(best_model_path, map_location=device)
    model.load_state_dict(best_state_dict)
    model.eval()
    print("Best model loaded from", best_model_path)
else:
    print("No best model found at", best_model_path)

TODO: insert evaluation metrics, ROC curve, calibration plot