In [1]:
# install PyPOTS
!pip install pypots==0.8.1 # note: broken in current pypots=

from google.colab import drive
import os

import pandas as pd


Collecting pypots==0.8.1
  Downloading pypots-0.8.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting tsdb>=0.6.1 (from pypots==0.8.1)
  Downloading tsdb-0.7.1-py3-none-any.whl.metadata (13 kB)
Collecting pygrinder>=0.6.4 (from pypots==0.8.1)
  Downloading pygrinder-0.7-py3-none-any.whl.metadata (10 kB)
Collecting benchpots>=0.3 (from pypots==0.8.1)
  Downloading benchpots-0.3.2-py3-none-any.whl.metadata (9.5 kB)
Collecting ai4ts (from pypots==0.8.1)
  Downloading ai4ts-0.0.3-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10.0->pypots==0.8.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10.0->pypots==0.8

In [2]:
# mount Google Drive
drive.mount('/content/drive')

# paths to PhysioNet 2012 data in Google Drive
google_drive_folder = '/content/drive/MyDrive/BiTimelyGPT-main/'
set_a_directory = f"{google_drive_folder}/set-a"
set_b_directory = f"{google_drive_folder}/set-b"
outcomes_a_file = f"{set_a_directory}/Outcomes-a.txt"
outcomes_b_file = f"{set_b_directory}/Outcomes-b.txt"

print("Set A Directory:", set_a_directory)
print("Set B Directory:", set_b_directory)
print("Outcomes A File:", outcomes_a_file)
print("Outcomes B File:", outcomes_b_file)

Mounted at /content/drive
Set A Directory: /content/drive/MyDrive/BiTimelyGPT-main//set-a
Set B Directory: /content/drive/MyDrive/BiTimelyGPT-main//set-b
Outcomes A File: /content/drive/MyDrive/BiTimelyGPT-main//set-a/Outcomes-a.txt
Outcomes B File: /content/drive/MyDrive/BiTimelyGPT-main//set-b/Outcomes-b.txt


In [3]:
# load outcomes
outcomes_a = pd.read_csv(outcomes_a_file)
outcomes_b = pd.read_csv(outcomes_b_file)

# inspect data head
print("Outcomes-a:")
print(outcomes_a.head())
print("\nOutcomes-b:")
print(outcomes_b.head())

Outcomes-a:
   RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death
0    132539       6     1               5        -1                  0
1    132540      16     8               8        -1                  0
2    132541      21    11              19        -1                  0
3    132543       7     1               9       575                  0
4    132545      17     2               4       918                  0

Outcomes-b:
   RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death
0    142675      27    14               9         7                  1
1    142676      12     1              31       468                  0
2    142680      12     7              17        16                  1
3    142683      19    15              17        -1                  0
4    142688       3     0               9        -1                  0


In [4]:
# add a 'set' column (so we can distinguish between set-a and set-b)
outcomes_a['set'] = 'a'
outcomes_b['set'] = 'b'

# combine outcomes into a single df
combined_outcomes = pd.concat([outcomes_a, outcomes_b], ignore_index=True)

#verify object
print("\nCombined Outcomes:")
print(combined_outcomes.head())


Combined Outcomes:
   RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death set
0    132539       6     1               5        -1                  0   a
1    132540      16     8               8        -1                  0   a
2    132541      21    11              19        -1                  0   a
3    132543       7     1               9       575                  0   a
4    132545      17     2               4       918                  0   a


In [5]:
# for binary classification we use 'In-hospital_death' as the target
# remove records with 'In-hospital_death' == -1
filtered_outcomes = combined_outcomes[combined_outcomes['In-hospital_death'] != -1].reset_index(drop=True)

# verify filtering
print("\nFiltered Outcomes (no -1 in 'In-hospital_death'):")
print(filtered_outcomes.head())
print("\nNumber of records after filtering:", len(filtered_outcomes))


Filtered Outcomes (no -1 in 'In-hospital_death'):
   RecordID  SAPS-I  SOFA  Length_of_stay  Survival  In-hospital_death set
0    132539       6     1               5        -1                  0   a
1    132540      16     8               8        -1                  0   a
2    132541      21    11              19        -1                  0   a
3    132543       7     1               9       575                  0   a
4    132545      17     2               4       918                  0   a

Number of records after filtering: 8000


In [6]:
def parse_patient_file(file_path):
    """
    Parses an individual patient file and returns a DataFrame with time steps and features.

    Args:
        file_path (str): Path to the patient file.

    Returns:
        pd.DataFrame: DataFrame containing time steps and features.
    """
    # read data file
    df = pd.read_csv(file_path)

    # init dictionaries for static and dynamic features
    static_features = {}
    dynamic_features = {}

    # iterate over rows
    for index, row in df.iterrows():
        time = row['Time']
        param = row['Parameter']
        value = row['Value']

        # convert time to minutes
        if ':' in time:
            hours, minutes = map(int, time.split(':'))
            total_minutes = hours * 60 + minutes
        else:
            try:
                total_minutes = float(time)
            except:
                total_minutes = 0  # this sets default to 0 if conversion fails

        # static features are at Time == 0
        if total_minutes == 0:
            static_features[param] = value
        else:
            if total_minutes not in dynamic_features:
                dynamic_features[total_minutes] = {}
            dynamic_features[total_minutes][param] = value

    # convert dynamic_features to df
    if dynamic_features:
        dynamic_df = pd.DataFrame.from_dict(dynamic_features, orient='index').sort_index()
        dynamic_df.index.name = 'Time'
        dynamic_df.reset_index(inplace=True)
    else:
        dynamic_df = pd.DataFrame(columns=['Time'])

    # add static features to df as columns
    for key, val in static_features.items():
        dynamic_df[key] = val

    return dynamic_df

In [7]:
from tqdm import tqdm

def load_all_patients(set_path, outcomes_df):
    """
    Loads and parses all patient files in a given set (set-a or set-b).

    Args:
        set_path (str): Path to the set directory.
        outcomes_df (pd.DataFrame): DataFrame containing outcomes for the set.

    Returns:
        list: List of patient DataFrames with features and time steps.
        list: List of corresponding labels.
    """
    patient_data = []
    labels = []

    # get list of per-patient files (excluding outcomes in Outcomes-a.txt and Outcomes-b.txt)
    patient_files = [f for f in os.listdir(set_path) if f.endswith('.txt') and not f.startswith('Outcomes')]

    for patient_file in tqdm(patient_files, desc=f"Loading patients from {set_path}"):
        try:
            record_id = int(patient_file.replace('.txt', ''))
        except:
            continue  # this skips files that don't have numeric RecordID

        file_path = os.path.join(set_path, patient_file)

        # parse patient file
        patient_df = parse_patient_file(file_path)

        # add RecordID for merge
        patient_df['RecordID'] = record_id

        # merge w/ outcomes to get the label
        outcome = outcomes_df[outcomes_df['RecordID'] == record_id]
        if outcome.empty:
            continue  # this skips if no outcome found
        label = outcome['In-hospital_death'].values[0]
        patient_data.append(patient_df)
        labels.append(label)

    return patient_data, labels

# load set-a (training)
train_data, train_labels = load_all_patients(set_a_directory, outcomes_a)

# load set-b (validation)
val_data, val_labels = load_all_patients(set_b_directory, outcomes_b)

# verify data opperations
print(f"\nNumber of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(val_data)}")

Loading patients from /content/drive/MyDrive/BiTimelyGPT-main//set-a: 100%|██████████| 4000/4000 [05:00<00:00, 13.30it/s]
Loading patients from /content/drive/MyDrive/BiTimelyGPT-main//set-b: 100%|██████████| 4000/4000 [05:05<00:00, 13.09it/s]


Number of training samples: 4000
Number of validation samples: 4000





In [8]:
# identify static features (Physionet 2012)
static_features = ['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight']

# dynamic features are all columns except 'Time' and static features
dynamic_features = ['Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol',
            'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT',
            'HR', 'K', 'Lactate', 'Mg', 'MAP', 'MechVent', 'Na', 'NIDiasABP',
            'NIMAP', 'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate',
            'SaO2', 'SysABP', 'Temp', 'TropI', 'TropT', 'Urine', 'WBC', 'Weight'
        ]

# verify static and dynamic variables
print("Static Features:", static_features)
print("Dynamic Features:", dynamic_features)

Static Features: ['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight']
Dynamic Features: ['Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'Mg', 'MAP', 'MechVent', 'Na', 'NIDiasABP', 'NIMAP', 'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate', 'SaO2', 'SysABP', 'Temp', 'TropI', 'TropT', 'Urine', 'WBC', 'Weight']


In [9]:
import numpy as np
from tqdm import tqdm

def create_sequences(patient_data, static_features, dynamic_features, labels, max_time_steps=None):
    """
    Creates sequences for GRU-D from individual patient data.

    Args:
        patient_data (list): List of patient DataFrames.
        static_features (list): List of static feature names.
        dynamic_features (list): List of dynamic feature names.
        labels (list): List of labels corresponding to each patient.
        max_time_steps (int, optional): Maximum number of time steps. If None, use the maximum length.

    Returns:
        dict: Dictionary containing 'X', 'mask', 'delta', and 'y'.
    """
    X = []
    mask = []
    delta = []
    y = []

    num_features = len(dynamic_features)

    for df, label in tqdm(zip(patient_data, labels), total=len(patient_data), desc="Creating sequences"):
        # sort df by time
        df = df.sort_values('Time').reset_index(drop=True)

        # check if static features are present
        if df[static_features].isnull().all().all():
            continue  # this skips patient if all static features are missing

        # compute time differences (delta)
        time = df['Time'].values
        delta_time = np.diff(time, prepend=0)
        delta_time = np.where(delta_time == 0, 1, delta_time)  # avoid zero delta, i.e. no duplicate measurments

        # repeat delta_time for each feature to match (time_steps, num_features)
        delta_repeated = np.tile(delta_time.reshape(-1, 1), (1, num_features))

        # extract dynamic features, ensure all dynamic_features are present
        dynamic = df.reindex(columns=dynamic_features).values.astype(float)  # note shape is: (time_steps, num_features)

        # missing values: assuming missing if not present in the record
        # create mask where 1 indicates observed and 0 indicates missing
        dynamic_mask = ~np.isnan(dynamic)
        dynamic_mask = dynamic_mask.astype(float)

        # fill missing values with zero (note: PyPOTS GRU-D expects missing values to be filled)
        dynamic_filled = np.nan_to_num(dynamic, nan=0.0)

        # append to lists
        X.append(dynamic_filled)
        mask.append(dynamic_mask)
        delta.append(delta_repeated)

        # append label
        y.append(label)

    # determine maximum time steps (need this to limit memory issues for now)
    if not max_time_steps:
        max_time_steps = max([seq.shape[0] for seq in X])

    # pad sequences
    num_features = len(dynamic_features)

    X_padded = np.zeros((len(X), max_time_steps, num_features))
    mask_padded = np.zeros((len(mask), max_time_steps, num_features))
    delta_padded = np.zeros((len(delta), max_time_steps, num_features))
    y_array = np.array(y)

    for i in range(len(X)):
        seq_len = X[i].shape[0]
        if seq_len > max_time_steps:
            # truncate sequences longer than max_time_steps
            X_padded[i, :max_time_steps, :] = X[i][:max_time_steps, :]
            mask_padded[i, :max_time_steps, :] = mask[i][:max_time_steps, :]
            delta_padded[i, :max_time_steps, :] = delta[i][:max_time_steps, :]
        else:
            # pad sequences shorter than max_time_steps
            X_padded[i, :seq_len, :] = X[i]
            mask_padded[i, :seq_len, :] = mask[i]
            delta_padded[i, :seq_len, :] = delta[i]

    return {
        'X': X_padded,
        'mask': mask_padded,
        'delta': delta_padded,
        'y': y_array
    }

# create sequences for training data
train_sequences = create_sequences(train_data, static_features, dynamic_features, train_labels, max_time_steps=100)

# create sequences for validation data
val_sequences = create_sequences(val_data, static_features, dynamic_features, val_labels, max_time_steps=100)

# ensure correct dims
print("\nTraining Data Shapes:")
for key in train_sequences:
    print(f"{key}: {train_sequences[key].shape}")

print("\nValidation Data Shapes:")
for key in val_sequences:
    print(f"{key}: {val_sequences[key].shape}")

Creating sequences: 100%|██████████| 4000/4000 [00:10<00:00, 370.99it/s]
Creating sequences: 100%|██████████| 4000/4000 [00:09<00:00, 442.80it/s]



Training Data Shapes:
X: (3997, 100, 37)
mask: (3997, 100, 37)
delta: (3997, 100, 37)
y: (3997,)

Validation Data Shapes:
X: (3993, 100, 37)
mask: (3993, 100, 37)
delta: (3993, 100, 37)
y: (3993,)


In [10]:
# verify that labels are binary
unique_train_labels = np.unique(train_sequences['y'])
unique_val_labels = np.unique(val_sequences['y'])

print("Unique labels in training set:", unique_train_labels)
print("Unique labels in validation set:", unique_val_labels)

# function to force binary labeling if extrenous values found
if set(unique_train_labels) == {-1, 1}:
    train_sequences['y'] = (train_sequences['y'] == 1).astype(int)
    val_sequences['y'] = (val_sequences['y'] == 1).astype(int)
    print("Mapped labels from {-1, 1} to {0, 1}")
elif set(unique_train_labels) == {0, 1}:
    print("Labels are already in binary format (0 and 1)")
else:
    raise ValueError("Unexpected label values. Please ensure labels are binary (0 and 1).")

Unique labels in training set: [0 1]
Unique labels in validation set: [0 1]
Labels are already in binary format (0 and 1)


In [11]:
# prepare datasets in dictionary format (expected input for PyPOTS GRU-D)
dataset_for_training = {
    "X": train_sequences['X'],
    "mask": train_sequences['mask'],
    "delta": train_sequences['delta'],
    "y": train_sequences['y']
}

dataset_for_validating = {
    "X": val_sequences['X'],
    "mask": val_sequences['mask'],
    "delta": val_sequences['delta'],
    "y": val_sequences['y']
}

print("Training dataset keys:", dataset_for_training.keys())
print("Validation dataset keys:", dataset_for_validating.keys())

Training dataset keys: dict_keys(['X', 'mask', 'delta', 'y'])
Validation dataset keys: dict_keys(['X', 'mask', 'delta', 'y'])


In [12]:
from pypots.optim import Adam
from pypots.classification import GRUD

# define model parameters
n_steps = train_sequences['X'].shape[1] # max number of time steps in input data
n_features = train_sequences['X'].shape[2] # num of dynamic features
n_classes = 2 # 2 for binary classification

# init the GRU-D model
grud = GRUD(
    n_steps=n_steps,
    n_features=n_features,
    n_classes=n_classes,
    rnn_hidden_size=32,
    batch_size=32,
    epochs=100,
    patience=10,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path="drive/MyDrive/classification/grud",
    model_saving_strategy="best",
)

print(grud)

2025-03-19 05:45:55 [INFO]: Wrote new configs to config.ini successfully.
2025-03-19 05:45:55 [INFO]: 💫 Initialized PyPOTS Ecosystem configuration file /root/.pypots/config.ini successfully.
  @autocast(enabled=False)
  @autocast(enabled=False)
2025-03-19 05:46:09 [INFO]: No given device, using default device: cpu


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



2025-03-19 05:46:09 [INFO]: Model files will be saved to drive/MyDrive/classification/grud/20250319_T054609
2025-03-19 05:46:09 [INFO]: Tensorboard file will be saved to drive/MyDrive/classification/grud/20250319_T054609/tensorboard
2025-03-19 05:46:10 [INFO]: GRUD initialized with the given hyperparameters, the number of trainable parameters: 16,128


<pypots.classification.grud.model.GRUD object at 0x7ccb26f10690>


In [13]:
# train on training set, validate on validation set
grud.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

2025-03-19 05:47:10 [INFO]: Epoch 001 - training loss: 0.4104, validation loss: 0.4036
2025-03-19 05:47:35 [INFO]: Epoch 002 - training loss: 0.3927, validation loss: 0.3920
2025-03-19 05:48:02 [INFO]: Epoch 003 - training loss: 0.3817, validation loss: 0.3882
2025-03-19 05:48:26 [INFO]: Epoch 004 - training loss: 0.3750, validation loss: 0.3842
2025-03-19 05:48:51 [INFO]: Epoch 005 - training loss: 0.3710, validation loss: 0.3798
2025-03-19 05:49:16 [INFO]: Epoch 006 - training loss: 0.3645, validation loss: 0.3798
2025-03-19 05:49:41 [INFO]: Epoch 007 - training loss: 0.3606, validation loss: 0.3739
2025-03-19 05:50:05 [INFO]: Epoch 008 - training loss: 0.3566, validation loss: 0.3801
2025-03-19 05:50:30 [INFO]: Epoch 009 - training loss: 0.3496, validation loss: 0.3787
2025-03-19 05:50:58 [INFO]: Epoch 010 - training loss: 0.3475, validation loss: 0.3683
2025-03-19 05:51:23 [INFO]: Epoch 011 - training loss: 0.3401, validation loss: 0.3777
2025-03-19 05:51:48 [INFO]: Epoch 012 - tra

In [14]:
# load the best model
grud.load('drive/MyDrive/classification/grud/20250319_T054609/GRUD.pypots')

# Prepare test dataset in dictionary format
dataset_for_testing = {
    "X": val_sequences['X'],
    "mask": val_sequences['mask'],
    "delta": val_sequences['delta'],
    "y": val_sequences['y']
}


# Predict on the testing set
grud_results = grud.predict(dataset_for_testing)
grud_prediction = grud_results["classification"]

from pypots.utils.metrics import calc_binary_classification_metrics

# Calculate binary classification metrics
metrics = calc_binary_classification_metrics(grud_prediction, dataset_for_testing["y"])

print("Testing classification metrics: \n"
      f'ROC_AUC: {metrics["roc_auc"]}, \n'
      f'PR_AUC: {metrics["pr_auc"]},\n'
      f'F1: {metrics["f1"]},\n'
      f'Precision: {metrics["precision"]},\n'
      f'Recall: {metrics["recall"]},\n')

Testing classification metrics: 
ROC_AUC: 0.7344001233679449, 
PR_AUC: 0.3148298294203433,
F1: 0.19241192411924118,
Precision: 0.4176470588235294,
Recall: 0.125,

