In [None]:
################################################################################
# GRU-D - Factorial Experiment (Simulated Data, no threshold sweep)
################################################################################

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Imports and Setup
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!pip install rpy2==3.5.1 --quiet

import itertools
import time
import torch
import numpy as np
import pandas as pd
import os
import math

from sklearn.metrics import roc_curve, auc

from google.colab import drive
drive.mount('/content/drive')

%load_ext rpy2.ipython
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
pandas2ri.activate()

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import copy

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/201.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m194.6/201.7 kB[0m [31m60.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m194.6/201.7 kB[0m [31m60.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.7/201.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rpy2 (setup.py) ... [?25l[?25hdone
Mounted at /content/drive


In [None]:
################################################################################
#  DGP script path and load
################################################################################
%%R
source("drive/MyDrive/sim_dgp.R")

In [None]:
import numpy as np

def create_grud_format_pivot(df_vars, df_outcomes, feature_list, max_time_steps=300):
    """
    Build (X, mask, delta, y) arrays for GRU-D from long-format simulated data.

    Each subject's rows are pivoted from (time, variable -> value) into one row per
    distinct time with one column per feature, sorted by time. Sequences are then
    zero-padded at the end, or truncated to their first `final_time_steps` rows, so
    that all subjects share one length.

    Notes on the conventions used here:
      * Features are processed in sorted-name order, so the last axis of the outputs
        follows ``sorted(feature_list)``, not the order of the argument.
      * `delta` is the gap between consecutive time rows of a subject and is shared
        by all features of that row; it does NOT accumulate over steps where a given
        feature is missing. Zero gaps (which includes the first row of every
        subject) are replaced by 1.0.
      * Missing feature values are stored as 0.0 in X and flagged by mask == 0.
      * Padded steps have X = mask = delta = 0.
      * Subjects without any row in `df_outcomes`, or without any usable time row,
        are skipped. The label is the `Y` of the subject's first outcome row.

    :param df_vars:       DataFrame with columns [ID, time, variable, value]
    :param df_outcomes:   DataFrame with columns [ID, time, Y] (plus possibly 'true_prob')
    :param feature_list:  List of feature names, e.g. ["X1", "X2", "X3", ...]
    :param max_time_steps: The absolute maximum sequence length we allow to keep;
                           if the actual largest sequence among all subjects is smaller,
                           we use the smaller length to avoid excess zero-padding.
    :return: X, mask, delta, y (all np.ndarrays), with shapes:
             - X.shape     = (N, final_time_steps, p)
             - mask.shape  = (N, final_time_steps, p)
             - delta.shape = (N, final_time_steps, p)
             - y.shape     = (N,)
             When no subject is usable, N = final_time_steps = 0.
    """
    feature_list = sorted(feature_list)
    p = len(feature_list)

    # Look up each subject's label once instead of filtering df_outcomes per subject.
    first_outcomes = df_outcomes.drop_duplicates(subset='ID', keep='first')
    y_by_id = dict(zip(first_outcomes['ID'].tolist(), first_outcomes['Y'].tolist()))

    X_list = []
    mask_list = []
    delta_list = []
    y_list = []

    # First pass: one groupby split (sorted by ID) rather than a boolean filter of
    # the whole frame for every subject.
    for sid, df_sub in df_vars.groupby('ID', sort=True):
        if sid not in y_by_id:
            continue

        wide_df = df_sub.pivot_table(index='time', columns='variable', values='value')
        wide_df = wide_df.sort_index().reset_index()

        # Guarantee every requested feature has a column (all-NaN if never observed)
        # and drop variables that were not requested.
        wide_df = wide_df.reindex(columns=['time'] + feature_list)

        time_vals = wide_df['time'].values
        feature_vals = wide_df[feature_list].values

        if len(time_vals) == 0:
            continue

        # Gap to the previous row; the first row's gap is 0 and is mapped to 1.0.
        delta_time = np.diff(time_vals, prepend=time_vals[0])
        delta_time = np.where(delta_time == 0, 1.0, delta_time)

        mask_vals = (~np.isnan(feature_vals)).astype(float)
        feature_vals = np.nan_to_num(feature_vals, nan=0.0)

        # Same gap for every feature of a row.
        delta_vals = np.tile(delta_time.reshape(-1, 1), (1, p))

        X_list.append(feature_vals)
        mask_list.append(mask_vals)
        delta_list.append(delta_vals)
        y_list.append(y_by_id[sid])

    # No usable subject: return empty arrays that still have the documented rank.
    if not X_list:
        return (
            np.zeros((0, 0, p), dtype=float),
            np.zeros((0, 0, p), dtype=float),
            np.zeros((0, 0, p), dtype=float),
            np.zeros((0,), dtype=int),
        )

    # Second pass: find the actual max sequence length across all subjects
    largest_actual_len = max(x.shape[0] for x in X_list)
    final_time_steps = min(largest_actual_len, max_time_steps)

    # Third pass: allocate arrays with final_time_steps, then pad/truncate
    N = len(X_list)
    X_out = np.zeros((N, final_time_steps, p), dtype=float)
    M_out = np.zeros((N, final_time_steps, p), dtype=float)
    D_out = np.zeros((N, final_time_steps, p), dtype=float)
    Y_out = np.array(y_list).astype(int)

    for i, (x_seq, m_seq, d_seq) in enumerate(zip(X_list, mask_list, delta_list)):
        use_len = min(x_seq.shape[0], final_time_steps)

        X_out[i, :use_len, :] = x_seq[:use_len, :]
        M_out[i, :use_len, :] = m_seq[:use_len, :]
        D_out[i, :use_len, :] = d_seq[:use_len, :]

    return X_out, M_out, D_out, Y_out

In [None]:
import math

class GRUD(nn.Module):
    """
    GRU-D style recurrent classifier for sequences with missing values.

    At every step the input is imputed by decaying the previous imputed value
    towards a per-feature mean, the hidden state is decayed as a function of
    Delta, and a GRU update with additional mask terms is applied. The final
    hidden state is mapped through a linear layer and a sigmoid.
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        output_size=1,
        x_mean=None,
        dropout=0.0,
        dropout_type='mloss',
    ):
        """
        input_size: Dimensionality of the features (D).
        hidden_size: Dimensionality of the hidden state (H).
        output_size: Usually 1 for binary classification.
        x_mean: Optional per-feature means of shape (D,), given as a tensor,
            numpy array or sequence; stored as a float32 buffer. Defaults to zeros.
        dropout: Dropout probability applied inside the recurrence (0 disables it).
        dropout_type: 'mloss' drops the candidate state h_tilde, 'Gal' drops the
            decayed hidden state h; any other value applies no dropout.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.dropout_type = dropout_type

        if x_mean is None:
            x_mean = torch.zeros(input_size)
        else:
            # Cast to float32 so the imputed input matches the weight dtype, and
            # copy so the buffer never aliases the caller's array.
            x_mean = torch.as_tensor(x_mean, dtype=torch.float32).detach().clone()
        self.register_buffer("x_mean", x_mean)

        # --------------------------
        # Define the "decay" linear layers
        #    W_dg_x:  (D -> D),   W_dg_h: (D -> H).
        #    For compute gamma_x(t) and gamma_h(t) from Delta_t
        # --------------------------
        self.W_dg_x = nn.Linear(input_size, input_size, bias=True)
        self.W_dg_h = nn.Linear(input_size, hidden_size, bias=True)

        # --------------------------
        # GRU gates: z, r, h_tilde
        #    We keep the "m_t" gate terms as in Che et al.
        #    Tensors are allocated uninitialised and filled by reset_parameters().
        # --------------------------
        self.w_xz = nn.Parameter(torch.empty(input_size, hidden_size))
        self.w_hz = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.w_mz = nn.Parameter(torch.empty(input_size, hidden_size))
        self.b_z  = nn.Parameter(torch.empty(hidden_size))

        self.w_xr = nn.Parameter(torch.empty(input_size, hidden_size))
        self.w_hr = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.w_mr = nn.Parameter(torch.empty(input_size, hidden_size))
        self.b_r  = nn.Parameter(torch.empty(hidden_size))

        self.w_xh = nn.Parameter(torch.empty(input_size, hidden_size))
        self.w_hh = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.w_mh = nn.Parameter(torch.empty(input_size, hidden_size))
        self.b_h  = nn.Parameter(torch.empty(hidden_size))

        self.w_hy = nn.Parameter(torch.empty(hidden_size, output_size))
        self.b_y  = nn.Parameter(torch.empty(output_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Draw every parameter (decay layers included) from U(-1/sqrt(H), 1/sqrt(H))."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for p in self.parameters():
            nn.init.uniform_(p, -stdv, stdv)

    def forward(self, X, M, Delta):
        """
        X:     (batch_size, T, D)   - raw features
        M:     (batch_size, T, D)   - mask (1 if observed, 0 if missing)
        Delta: (batch_size, T, D)   - time gap fed to the decay layers
        Returns:
            Sigmoid probabilities of shape (batch_size,) when output_size == 1,
            otherwise (batch_size, output_size).
        """
        batch_size, T, D = X.size()
        H = self.hidden_size

        h = X.new_zeros(batch_size, H)
        x_hat = X.new_zeros(batch_size, D)

        # Loop-invariant: resolve the mean buffer's device once.
        current_x_mean = self.x_mean.to(X.device)

        for t in range(T):
            x_t = X[:, t, :]
            m_t = M[:, t, :]
            d_t = Delta[:, t, :]

            # ----------------------------------------
            # Compute gamma_x(t) and gamma_h(t)
            #    gamma_x: (batch_size, D)
            #    gamma_h: (batch_size, H)
            # ----------------------------------------
            gamma_x_t = torch.exp(-F.relu(self.W_dg_x(d_t)))
            gamma_h_t = torch.exp(-F.relu(self.W_dg_h(d_t)))

            # ----------------------------------------
            # Update x_hat(t)
            #    x_hat(t) = m_t * x_t  + (1-m_t)* [ gamma_x_t*x_hat(t-1) + (1-gamma_x_t)* x_mean ]
            # ----------------------------------------
            x_hat = m_t * x_t + (1 - m_t) * (gamma_x_t * x_hat + (1 - gamma_x_t) * current_x_mean)

            # ----------------------------------------
            # Decay hidden state: h(t-1) -> gamma_h(t) * h(t-1)
            # ----------------------------------------
            h = gamma_h_t * h

            # ----------------------------------------
            # Standard GRU gating on the imputed input x_hat(t)
            # ----------------------------------------
            z = torch.sigmoid(
                x_hat @ self.w_xz + h @ self.w_hz + m_t @ self.w_mz + self.b_z
            )

            r = torch.sigmoid(
                x_hat @ self.w_xr + h @ self.w_hr + m_t @ self.w_mr + self.b_r
            )

            h_tilde = torch.tanh(
                x_hat @ self.w_xh + (r * h) @ self.w_hh + m_t @ self.w_mh + self.b_h
            )

            # Functional dropout tied to self.training: a Dropout module built
            # inside forward() is always in training mode, so model.eval() would
            # not switch it off and evaluation would be stochastic.
            if self.dropout > 0:
                if self.dropout_type == 'mloss':
                    h_tilde = F.dropout(h_tilde, p=self.dropout, training=self.training)
                elif self.dropout_type == 'Gal':
                    h = F.dropout(h, p=self.dropout, training=self.training)

            h = (1 - z) * h + z * h_tilde

        y_out = h @ self.w_hy + self.b_y
        y_out = torch.sigmoid(y_out)
        return y_out.squeeze(-1)

In [None]:
class SimDataset(Dataset):
    """Torch dataset that serves (X, mask, delta, y) items as float32 tensors."""

    @staticmethod
    def _as_float32(values):
        # torch.tensor always copies, so the dataset owns its own storage.
        return torch.tensor(values, dtype=torch.float32)

    def __init__(self, X, mask, delta, y):
        self.X, self.M, self.D, self.y = (
            self._as_float32(values) for values in (X, mask, delta, y)
        )

    def __len__(self):
        # Number of items is the number of labels.
        return len(self.y)

    def __getitem__(self, idx):
        # One item is the idx-th slice of each stored tensor, in (X, M, D, y) order.
        return tuple(store[idx] for store in (self.X, self.M, self.D, self.y))


In [None]:
################################################################################
# Configuration for the sim data
################################################################################
class SimDataConfig:
    """Feature names plus GRU-D training hyperparameters for a p-feature simulation."""

    def __init__(self, p):
        # Features are named X1 .. Xp.
        self.feature_list = ["X%d" % k for k in range(1, p + 1)]

        # GRU-D hyperparams
        self.rnn_hidden_size = 32
        self.batch_size = 32
        self.epochs = 60
        self.patience = 15
        self.learning_rate = 1e-3
        self.max_t_steps = 50


In [None]:
def _simulate_scenario_frames(
    N, p, regularity, missing_prop, num_sync_groups, process_types,
    use_lags, block_missing, block_length, outcome_time, master_seed
):
    """Run the R simulator for one scenario and return (df_vars, df_outcomes)."""
    # A single process type is broadcast to all p features.
    if not isinstance(process_types, list):
        process_types = [process_types] * p

    r_inputs = {
        "N": N,
        "p": p,
        "regularity": regularity,
        "missing_prop": missing_prop,
        "num_sync_groups": num_sync_groups,
        "process_types": ro.StrVector(process_types),
        "use_lags": use_lags,
        "block_missing": block_missing,
        "block_length": block_length,
        "outcome_time": outcome_time,
        "master_seed": master_seed,
    }
    for r_name, r_value in r_inputs.items():
        ro.r.assign(r_name, r_value)

    ro.r('''
        sim_dataset <- simulate_experiment_dataset(
          N            = N,
          p            = p,
          freq_min     = 0.1,
          freq_max     = 50.0,
          regularity   = regularity,
          num_sync_groups = num_sync_groups,
          process_types = process_types,
          missing_prop = missing_prop,
          block_missing = block_missing,
          block_length  = block_length,
          outcome_time  = outcome_time,
          beta          = c(0, seq(-1, 1, length.out = 10)),
          use_lags      = use_lags,
          lag_hours     = ifelse(use_lags, 2, 0),
          master_seed   = master_seed
        )
    ''')

    # The results are indexed like pandas DataFrames below, which presumably
    # relies on the rpy2 pandas conversion being active - confirm in the setup cell.
    df_vars = ro.r('sim_dataset$vars_long')
    df_outcomes = ro.r('sim_dataset$outcome_df')

    df_vars["ID"] = df_vars["ID"].astype(int)
    df_vars["time"] = df_vars["time"].astype(float)
    df_outcomes["ID"] = df_outcomes["ID"].astype(int)
    df_outcomes["time"] = df_outcomes["time"].astype(float)
    return df_vars, df_outcomes


def _observed_feature_means(X, mask):
    """Per-feature mean over entries with mask == 1; 0.0 for never-observed features."""
    n_features = X.shape[2]
    flat_mask = mask.reshape(-1, n_features)
    flat_X = X.reshape(-1, n_features)
    means = np.zeros(n_features)
    for feat_idx in range(n_features):
        observed_vals = flat_X[flat_mask[:, feat_idx].astype(bool), feat_idx]
        if len(observed_vals) > 0:
            means[feat_idx] = np.mean(observed_vals)
    return means


def _auc_or_default(y_true, y_prob, default):
    """ROC AUC, or `default` when it is undefined (single class, invalid scores)."""
    if np.unique(y_true).size < 2:
        return default
    try:
        return roc_auc_score(y_true, y_prob)
    except ValueError:
        return default


def _predict_with_loss(model, loader, criterion, device):
    """Evaluate in eval mode; return (sample-weighted mean loss, probabilities, labels)."""
    model.eval()
    loss_sum = 0.0
    n_seen = 0
    preds, truths = [], []
    with torch.no_grad():
        for Xb, Mb, Db, yb in loader:
            Xb, Mb, Db, yb = Xb.to(device), Mb.to(device), Db.to(device), yb.to(device)
            outputs = model(Xb, Mb, Db)
            batch_n = yb.size(0)
            loss_sum += criterion(outputs, yb).item() * batch_n
            n_seen += batch_n
            preds.append(outputs.cpu().numpy())
            truths.append(yb.cpu().numpy())
    return loss_sum / n_seen, np.concatenate(preds), np.concatenate(truths)


def run_single_scenario(
    N, p,
    regularity,
    missing_prop,
    num_sync_groups,
    process_types,
    use_lags,
    block_missing=True,
    block_length=1.5,
    outcome_time=10,
    master_seed=511
):
    """
    Simulate one scenario, train the custom GRU-D on it and report test metrics.

    1) Simulate data in R.
    2) Shuffle subjects with `master_seed` and split 70/15/15 => train/val/test.
    3) Train GRUD with Adam + BCE, early-stopping on validation AUC and restoring
       the best-validation weights.
    4) Evaluate on the test set at a fixed 0.5 threshold (no threshold sweep).

    Returns a dict with keys: test_loss, auc, accuracy, precision, recall, f1,
    mean_true_prob, prop_positive. `auc` is NaN when the test split contains a
    single class.

    Raises ValueError when the simulated data leaves any of the three splits empty.
    """
    df_vars, df_outcomes = _simulate_scenario_frames(
        N, p, regularity, missing_prop, num_sync_groups, process_types,
        use_lags, block_missing, block_length, outcome_time, master_seed
    )

    mean_true_prob = df_outcomes["true_prob"].mean()
    prop_positive = df_outcomes["Y"].mean()

    config = SimDataConfig(p)

    X_all, mask_all, delta_all, y_all = create_grud_format_pivot(
        df_vars, df_outcomes, config.feature_list, max_time_steps=config.max_t_steps
    )

    # 70/15/15 split
    N_total = len(X_all)
    idx = np.arange(N_total)
    np.random.seed(master_seed)
    np.random.shuffle(idx)
    # Seed torch as well so weight init and batch shuffling are reproducible.
    torch.manual_seed(master_seed)

    N_train = int(0.70 * N_total)
    N_val = int(0.15 * N_total)

    train_idx = idx[:N_train]
    val_idx = idx[N_train:N_train + N_val]
    test_idx = idx[N_train + N_val:]

    if len(train_idx) == 0 or len(val_idx) == 0 or len(test_idx) == 0:
        raise ValueError(
            f"Not enough usable subjects ({N_total}) for a 70/15/15 train/val/test split."
        )

    X_train, mask_train = X_all[train_idx], mask_all[train_idx]
    delta_train, y_train = delta_all[train_idx], y_all[train_idx]

    train_dataset = SimDataset(X_train, mask_train, delta_train, y_train)
    val_dataset = SimDataset(X_all[val_idx], mask_all[val_idx], delta_all[val_idx], y_all[val_idx])
    test_dataset = SimDataset(X_all[test_idx], mask_all[test_idx], delta_all[test_idx], y_all[test_idx])

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=0)

    # train Custom GRU-D w/ Manual Early Stopping
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    n_features = X_train.shape[2]

    # Imputation target: means of observed training values only (no val/test leakage).
    x_train_means = _observed_feature_means(X_train, mask_train)

    model = GRUD(
        input_size=n_features,
        hidden_size=config.rnn_hidden_size,
        output_size=1,
        x_mean=x_train_means,
        dropout=0.05,
        dropout_type='mloss'
    ).to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    epochs = config.epochs
    patience = config.patience
    best_val_auc = 0.0
    epochs_no_improve = 0
    best_model_weights = copy.deepcopy(model.state_dict())

    print(f"Starting training for {epochs} epochs with patience {patience}...")
    for epoch in range(1, epochs + 1):
        #  Training Phase
        model.train()
        train_loss_sum = 0.0
        train_seen = 0
        train_preds_list = []
        train_truth_list = []

        for Xb, Mb, Db, yb in train_loader:
            Xb, Mb, Db, yb = Xb.to(device), Mb.to(device), Db.to(device), yb.to(device)

            optimizer.zero_grad()
            outputs = model(Xb, Mb, Db)
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short last batch is not over-counted.
            batch_n = yb.size(0)
            train_loss_sum += loss.item() * batch_n
            train_seen += batch_n
            train_preds_list.append(outputs.detach().cpu().numpy())
            train_truth_list.append(yb.detach().cpu().numpy())

        train_loss = train_loss_sum / train_seen
        train_auc = _auc_or_default(
            np.concatenate(train_truth_list), np.concatenate(train_preds_list), 0.5
        )

        #  Validation Phase
        val_loss, val_preds_epoch, val_truth_epoch = _predict_with_loss(
            model, val_loader, criterion, device
        )
        val_auc = _auc_or_default(val_truth_epoch, val_preds_epoch, 0.5)

        print(f"Epoch [{epoch}/{epochs}] Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f} | Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}")

        #  Early Stopping Check
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_model_weights = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
            print(f"  -> New best validation AUC: {best_val_auc:.4f}")
        else:
            epochs_no_improve += 1
            print(f"  -> Validation AUC did not improve. ({epochs_no_improve}/{patience})")

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after epoch {epoch}.")
            break

    print(f"Finished training. Loading best model with Val AUC: {best_val_auc:.4f}")
    model.load_state_dict(best_model_weights)

    #Evaluate on test set
    test_loss, y_pred_prob_test, y_true_test = _predict_with_loss(
        model, test_loader, criterion, device
    )

    # A single-class test split makes AUC undefined; report NaN rather than
    # aborting the whole factorial run or inventing a chance-level value.
    test_auc = _auc_or_default(y_true_test, y_pred_prob_test, float('nan'))

    threshold = 0.5
    y_pred_test = (y_pred_prob_test >= threshold).astype(int)

    test_accuracy = accuracy_score(y_true_test, y_pred_test)
    test_prec = precision_score(y_true_test, y_pred_test, zero_division=0)
    test_rec = recall_score(y_true_test, y_pred_test, zero_division=0)
    test_f1 = f1_score(y_true_test, y_pred_test, zero_division=0)

    results_dict = {
        "test_loss":      float(test_loss),
        "auc":            test_auc,
        "accuracy":       test_accuracy,
        "precision":      test_prec,
        "recall":         test_rec,
        "f1":             test_f1,
        "mean_true_prob": float(mean_true_prob),
        "prop_positive":  float(prop_positive)
    }

    return results_dict

In [None]:
################################################################################
#Main Factorial Loop (new version)
################################################################################

import math

# Levels of the six two-level design factors.
Ns = [200, 1000]
regularities = [0.2, 0.8]
missing_props = [0.1, 0.4]
sync_pattern_values = [1, "p"]
process_type_values = ["homogeneous", "mixed"]
outcome_dep_values = ["direct", "lagged"]

# Full crossing of all factor levels, in the order listed above.
scenario_grid = list(
    itertools.product(
        Ns,
        regularities,
        missing_props,
        sync_pattern_values,
        process_type_values,
        outcome_dep_values,
    )
)

all_results = []
overall_start = time.time()

for N, regularity, missing_prop, sync_pattern, process_type_flag, outcome_dep in scenario_grid:
    p = 10

    # "homogeneous": every feature is ar1; otherwise cycle through the three types.
    if process_type_flag == "homogeneous":
        process_types = ["ar1" for _ in range(p)]
    else:
        process_types = list(itertools.islice(itertools.cycle(["ar1", "rw", "seasonal"]), p))

    # The "p" level uses round(sqrt(p)) groups (at least 1); the other level uses one group.
    num_sync_groups = max(1, int(round(math.sqrt(p)))) if sync_pattern == "p" else 1

    use_lags = outcome_dep == "lagged"

    print("===================================================================")
    print(f"Scenario => N={N}, reg={regularity}, missing={missing_prop}, "
          f"sync={sync_pattern}, process={process_type_flag}, outcome={outcome_dep}")
    print("===================================================================")

    start_time = time.time()
    results_dict = run_single_scenario(
        N=N,
        p=p,
        regularity=regularity,
        missing_prop=missing_prop,
        num_sync_groups=num_sync_groups,
        process_types=process_types,
        use_lags=use_lags,
        block_missing=True,
        block_length=1.5,
        outcome_time=10,
        master_seed=511,
    )
    runtime = time.time() - start_time

    # Design factors first, then metrics, runtime and outcome summaries
    # (this key order is the column order of the saved CSV).
    scenario_result = {
        "N": N,
        "regularity": regularity,
        "missing_prop": missing_prop,
        "sync_pattern": sync_pattern,
        "process_type": process_type_flag,
        "outcome_dep": outcome_dep,
        **{
            metric: results_dict[metric]
            for metric in ("test_loss", "auc", "accuracy", "precision", "recall", "f1")
        },
        "runtime_sec": runtime,
        "mean_true_prob": results_dict["mean_true_prob"],
        "prop_positive": results_dict["prop_positive"],
    }
    all_results.append(scenario_result)

overall_runtime = time.time() - overall_start
print(f"\nAll scenarios completed in {overall_runtime/60:.2f} minutes.\n")

df_results = pd.DataFrame(all_results).sort_values(by=["N", "regularity", "missing_prop"])
print(df_results)

df_results.to_csv("drive/MyDrive/grud_factorial_results_long.csv", index=False)
print("\nResults saved to grud_factorial_results_long.csv")

Scenario => N=200, reg=0.2, missing=0.1, sync=1, process=homogeneous, outcome=direct
Using device: cuda
Starting training for 60 epochs with patience 15...
Epoch [1/60] Train Loss: 0.7141, Train AUC: 0.4595 | Val Loss: 0.7043, Val AUC: 0.5600
  -> New best validation AUC: 0.5600
Epoch [2/60] Train Loss: 0.7013, Train AUC: 0.4949 | Val Loss: 0.6880, Val AUC: 0.5600
  -> Validation AUC did not improve. (1/15)
Epoch [3/60] Train Loss: 0.6931, Train AUC: 0.5277 | Val Loss: 0.6801, Val AUC: 0.5500
  -> Validation AUC did not improve. (2/15)
Epoch [4/60] Train Loss: 0.6831, Train AUC: 0.5498 | Val Loss: 0.6821, Val AUC: 0.4600
  -> Validation AUC did not improve. (3/15)
Epoch [5/60] Train Loss: 0.6844, Train AUC: 0.5714 | Val Loss: 0.6699, Val AUC: 0.5250
  -> Validation AUC did not improve. (4/15)
Epoch [6/60] Train Loss: 0.6800, Train AUC: 0.5988 | Val Loss: 0.6726, Val AUC: 0.4550
  -> Validation AUC did not improve. (5/15)
Epoch [7/60] Train Loss: 0.6792, Train AUC: 0.6337 | Val Loss: 0.