In [1]:
import os
import copy
import glob
import time
import pytz
import torch
import random
import shutil
import psutil
import numpy as np
import pandas as pd
import polars as pl
import torch.nn as nn
import kaggle_evaluation.jane_street_inference_server

from tqdm import tqdm
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset, Dataset

# Define EST timezone
est = pytz.timezone('US/Eastern')
notebook_start_time = datetime.now(est)
time_to_minutes = notebook_start_time.strftime("%Y-%m-%d %H:%M")
print("NoteBook Start Time", time_to_minutes)

# Set the random seed for reproducibility
seed = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NoteBook Start Time 2025-01-12 22:36


In [2]:
# Define the R² metric as a loss function (inverted for minimization)
def weighted_r2_loss(y_true, y_pred, weights):
    """
    Calculate the weighted R² metric as a loss for minimization.

    Args:
        y_true (torch.Tensor): Ground truth values, shape (batch_size,).
        y_pred (torch.Tensor): Predicted values, shape (batch_size,).
        weights (torch.Tensor): Sample weights, shape (batch_size,).

    Returns:
        torch.Tensor: Weighted R² value (negative for minimization).
    """
    numerator = torch.sum(weights * (y_true - y_pred) ** 2)
    denominator = torch.sum(weights * y_true ** 2) + 1e-38
    r2 = 1 - numerator / denominator
    return -r2  # Negative for minimization

# Define normalization functions
def z_score(col, mean_val, std_val):
    """Return z-scored column: (x - mean) / std."""
    return (col - mean_val) / std_val

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self, 
                 input_dim = 81,
                 hidden_dims = [256,256,256],
                ):
        super().__init__()
        self.in_dim = input_dim
        self.epoch = 0

        layers = []
        in_dim = self.in_dim
    
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.Linear(in_dim, hidden_dim, bias=True))
            layers.append(nn.ReLU())
            in_dim = hidden_dim

        layers.append(nn.Linear(in_dim, 1))  # Output layer
        layers.append(nn.Tanh())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        '''
        x: (batch_size, input_dim)
        '''
        return 5 * self.model(x[:, :self.in_dim]).squeeze(-1)

class RMFDataset(Dataset):
    """
    Holds X_list, y_list, w_list, each item is a tensor already.
    Avoids deep copying to save memory.
    """
    def __init__(self, X_list, y_list, w_list):
        self.X_list = X_list
        self.y_list = y_list
        self.w_list = w_list

    def __len__(self):
        return len(self.X_list)

    def __getitem__(self, idx):
        return self.X_list[idx], self.y_list[idx], self.w_list[idx]


In [4]:
dirpath_initial_parquet1 = '/kaggle/input/js-rmf-0-999'
dirpath_initial_parquet2 = '/kaggle/input/js-rmf-1000-1698'
dirpath_stats = '/kaggle/input/js-rmf-data-stats'
global_stats = (pl.read_csv(os.path.join(dirpath_stats, "global_stats_with_lags.csv")))

## Configuration
num_train_days = 500
num_gap_days = 200
num_total_days = 1699
num_splits = 10

# Training
batch_size = 1
lr = 1e-4
lr_finetune = 1e-4
lambda_reg = 1e-4
num_epochs = 9

# Define feature groups
ORIGINAL_FEATURES = [f"feature_{i:02d}" for i in range(79)]
EXTRA_FEATURES = ['weight', 'time_id']
ALL_FEATURES = ORIGINAL_FEATURES + EXTRA_FEATURES

In [5]:
global_stats_dict = {}
for row in global_stats.to_dicts():
    # row["feature"] might look like "feature_00", "weight", etc.
    global_stats_dict[row["feature"]] = {
        "mean": row["mean"],
        "std": row["std"],
        "min": row["min"],
        "max": row["max"]
    }

weight_mean = global_stats_dict['weight']['mean']
weight_std = global_stats_dict['weight']['std']
weight_min = global_stats_dict['weight']['min']
weight_max = global_stats_dict['weight']['max']
time_mean = global_stats_dict['time_id']['mean']
time_std = global_stats_dict['time_id']['std']
time_min = global_stats_dict['time_id']['min']
time_max = global_stats_dict['time_id']['max']
print(f"Weight mean: {weight_mean}, std: {weight_std}, min: {weight_min}, max: {weight_max}")
print(f"TimeID mean: {time_mean}, std: {time_std}, min: {time_min}, max: {time_max}")

def process_and_update(file_paths, num_splits=num_splits, X_list=None, y_list=None, w_list=None):
    # Avoid using mutable default arguments
    if X_list is None:
        X_list = []
    if y_list is None:
        y_list = []
    if w_list is None:
        w_list = []

    for file_path in tqdm(file_paths):
        day_df = pl.read_parquet(file_path)
        day_df = day_df.select([day_df[col].cast(pl.Float32).alias(col) for col in day_df.columns])

        for feature in ALL_FEATURES:
            stats = global_stats_dict[feature]
            mean_val = stats["mean"]
            std_val  = stats["std"]
            day_df = day_df.with_columns(
                z_score(day_df[feature], mean_val, std_val).alias(feature)
            )

        day_df = day_df.fill_null(0)
        day_df = day_df.sort("time_id")
        unique_time_ids = day_df['time_id'].unique()
        num_time_ids = len(unique_time_ids)
        split_size = num_time_ids // num_splits

        for i in range(num_splits):
            start = i * split_size
            end = (i + 1) * split_size if i < num_splits - 1 else num_time_ids
            if start >= num_time_ids:
                break
            subset_time_ids = unique_time_ids[start:end]
            subset_df = day_df.filter(pl.col("time_id").is_in(subset_time_ids))
            
            X = torch.tensor(subset_df.select(ALL_FEATURES).to_numpy(), dtype=torch.float32)
            y = torch.tensor(subset_df.select('responder_6').to_numpy(), dtype=torch.float32).squeeze()
            w = torch.tensor(subset_df.select('weight').to_numpy(), dtype=torch.float32).squeeze() * weight_std + weight_mean

            X_list.append(X)
            y_list.append(y)
            w_list.append(w)

    return X_list, y_list, w_list

initial_file_paths = [os.path.join(dirpath_initial_parquet1, f"{i}.parquet") for i in range(1000)] + [os.path.join(dirpath_initial_parquet2, f"{i}.parquet") for i in range(1000, 1699)]
initial_file_paths = initial_file_paths[num_total_days-num_train_days:num_total_days]
X_list, y_list, w_list = process_and_update(initial_file_paths)

Weight mean: 2.00944066, std: 1.129388213, min: 0.149966657, max: 10.24041939
TimeID mean: 468.7057222, std: 272.5186966, min: 0.0, max: 967.0


100%|██████████| 500/500 [00:56<00:00,  8.82it/s]


In [6]:
num_epochs = 9
input_dim = len(ALL_FEATURES)
model = MLP(input_dim=input_dim)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

X_list_train, y_list_train, w_list_train = X_list[:num_train_days * num_splits], y_list[:num_train_days * num_splits], w_list[:num_train_days * num_splits]
assert len(X_list_train) == len(y_list_train) == len(w_list_train) == num_train_days * num_splits
train_dataset = RMFDataset(X_list_train, y_list_train, w_list_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model.train()
for epoch in tqdm(range(num_epochs)):

    total_loss = 0
    for batch, (X_batch, y_batch, w_batch) in enumerate(train_loader, 1):
        X_batch, y_batch, w_batch = X_batch[0].to(device), y_batch[0].to(device), w_batch[0].to(device)
        
        optimizer.zero_grad()
        y_pred = model(X_batch)
        l2_reg = sum(param.pow(2).sum() for param in model.parameters())
        loss = weighted_r2_loss(y_batch, y_pred, w_batch) + lambda_reg * l2_reg
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    train_R2 = -total_loss / len(train_loader)
        
    process=psutil.Process()
    mem=process.memory_info().rss/(1024**3); 
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} | Epoch {epoch + 1}: train_R² = {train_R2:.6f}, cpu memory usage={mem:.6f} GB")
            
predictor1 = copy.deepcopy(model)
predictor1.to(device)
refiner1 = torch.optim.Adam(predictor1.parameters(), lr=lr_finetune) # Optimizer for find-tuning

 11%|█         | 1/9 [00:18<02:25, 18.13s/it]

Time: 2025-01-13 03:37:33 | Epoch 1: train_R² = -0.019330, cpu memory usage=6.649105 GB


 22%|██▏       | 2/9 [00:36<02:06, 18.01s/it]

Time: 2025-01-13 03:37:51 | Epoch 2: train_R² = -0.006947, cpu memory usage=6.649334 GB


 33%|███▎      | 3/9 [00:53<01:47, 17.89s/it]

Time: 2025-01-13 03:38:09 | Epoch 3: train_R² = -0.000621, cpu memory usage=6.648209 GB


 44%|████▍     | 4/9 [01:11<01:28, 17.73s/it]

Time: 2025-01-13 03:38:27 | Epoch 4: train_R² = 0.003240, cpu memory usage=6.648235 GB


 56%|█████▌    | 5/9 [01:28<01:10, 17.69s/it]

Time: 2025-01-13 03:38:44 | Epoch 5: train_R² = 0.005579, cpu memory usage=6.649399 GB


 67%|██████▋   | 6/9 [01:46<00:52, 17.61s/it]

Time: 2025-01-13 03:39:02 | Epoch 6: train_R² = 0.007482, cpu memory usage=6.648193 GB


 78%|███████▊  | 7/9 [02:04<00:35, 17.63s/it]

Time: 2025-01-13 03:39:19 | Epoch 7: train_R² = 0.009121, cpu memory usage=6.649170 GB


 89%|████████▉ | 8/9 [02:21<00:17, 17.58s/it]

Time: 2025-01-13 03:39:37 | Epoch 8: train_R² = 0.010041, cpu memory usage=6.648239 GB


100%|██████████| 9/9 [02:39<00:00, 17.68s/it]

Time: 2025-01-13 03:39:54 | Epoch 9: train_R² = 0.011133, cpu memory usage=6.649120 GB





In [7]:
num_epochs = 10
input_dim = len(ALL_FEATURES)
model = MLP(input_dim=input_dim)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

X_list_train, y_list_train, w_list_train = X_list[:num_train_days * num_splits], y_list[:num_train_days * num_splits], w_list[:num_train_days * num_splits]
assert len(X_list_train) == len(y_list_train) == len(w_list_train) == num_train_days * num_splits
train_dataset = RMFDataset(X_list_train, y_list_train, w_list_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model.train()
for epoch in tqdm(range(num_epochs)):

    total_loss = 0
    for batch, (X_batch, y_batch, w_batch) in enumerate(train_loader, 1):
        X_batch, y_batch, w_batch = X_batch[0].to(device), y_batch[0].to(device), w_batch[0].to(device)
        
        optimizer.zero_grad()
        y_pred = model(X_batch)
        l2_reg = sum(param.pow(2).sum() for param in model.parameters())
        loss = weighted_r2_loss(y_batch, y_pred, w_batch) + lambda_reg * l2_reg
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    train_R2 = -total_loss / len(train_loader)
        
    process=psutil.Process()
    mem=process.memory_info().rss/(1024**3); 
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} | Epoch {epoch + 1}: train_R² = {train_R2:.6f}, cpu memory usage={mem:.6f} GB")
            
predictor2 = copy.deepcopy(model)
predictor2.to(device)
refiner2 = torch.optim.Adam(predictor2.parameters(), lr=lr_finetune) # Optimizer for find-tuning

 10%|█         | 1/10 [00:17<02:37, 17.45s/it]

Time: 2025-01-13 03:40:12 | Epoch 1: train_R² = -0.018912, cpu memory usage=6.649242 GB


 20%|██        | 2/10 [00:35<02:21, 17.74s/it]

Time: 2025-01-13 03:40:30 | Epoch 2: train_R² = -0.006921, cpu memory usage=6.649242 GB


 30%|███       | 3/10 [00:53<02:04, 17.72s/it]

Time: 2025-01-13 03:40:48 | Epoch 3: train_R² = -0.000830, cpu memory usage=6.649487 GB


 40%|████      | 4/10 [01:10<01:46, 17.74s/it]

Time: 2025-01-13 03:41:05 | Epoch 4: train_R² = 0.003008, cpu memory usage=6.649487 GB


 50%|█████     | 5/10 [01:28<01:28, 17.79s/it]

Time: 2025-01-13 03:41:23 | Epoch 5: train_R² = 0.005643, cpu memory usage=6.649208 GB


 60%|██████    | 6/10 [01:46<01:10, 17.75s/it]

Time: 2025-01-13 03:41:41 | Epoch 6: train_R² = 0.007585, cpu memory usage=6.649208 GB


 70%|███████   | 7/10 [02:04<00:53, 17.79s/it]

Time: 2025-01-13 03:41:59 | Epoch 7: train_R² = 0.008984, cpu memory usage=6.648338 GB


 80%|████████  | 8/10 [02:22<00:35, 17.80s/it]

Time: 2025-01-13 03:42:17 | Epoch 8: train_R² = 0.010209, cpu memory usage=6.649315 GB


 90%|█████████ | 9/10 [02:39<00:17, 17.82s/it]

Time: 2025-01-13 03:42:35 | Epoch 9: train_R² = 0.011263, cpu memory usage=6.649437 GB


100%|██████████| 10/10 [02:57<00:00, 17.72s/it]

Time: 2025-01-13 03:42:52 | Epoch 10: train_R² = 0.012144, cpu memory usage=6.649437 GB





In [8]:
num_epochs = 11
input_dim = len(ALL_FEATURES)
model = MLP(input_dim=input_dim)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

X_list_train, y_list_train, w_list_train = X_list[:num_train_days * num_splits], y_list[:num_train_days * num_splits], w_list[:num_train_days * num_splits]
assert len(X_list_train) == len(y_list_train) == len(w_list_train) == num_train_days * num_splits
train_dataset = RMFDataset(X_list_train, y_list_train, w_list_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model.train()
for epoch in tqdm(range(num_epochs)):

    total_loss = 0
    for batch, (X_batch, y_batch, w_batch) in enumerate(train_loader, 1):
        X_batch, y_batch, w_batch = X_batch[0].to(device), y_batch[0].to(device), w_batch[0].to(device)
        
        optimizer.zero_grad()
        y_pred = model(X_batch)
        l2_reg = sum(param.pow(2).sum() for param in model.parameters())
        loss = weighted_r2_loss(y_batch, y_pred, w_batch) + lambda_reg * l2_reg
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    train_R2 = -total_loss / len(train_loader)
        
    process=psutil.Process()
    mem=process.memory_info().rss/(1024**3); 
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} | Epoch {epoch + 1}: train_R² = {train_R2:.6f}, cpu memory usage={mem:.6f} GB")
            
predictor3 = copy.deepcopy(model)
predictor3.to(device)
refiner3 = torch.optim.Adam(predictor3.parameters(), lr=lr_finetune) # Optimizer for find-tuning

  9%|▉         | 1/11 [00:17<02:56, 17.61s/it]

Time: 2025-01-13 03:43:09 | Epoch 1: train_R² = -0.017902, cpu memory usage=6.652992 GB


 18%|█▊        | 2/11 [00:35<02:38, 17.66s/it]

Time: 2025-01-13 03:43:27 | Epoch 2: train_R² = -0.006543, cpu memory usage=6.650295 GB


 27%|██▋       | 3/11 [00:52<02:21, 17.63s/it]

Time: 2025-01-13 03:43:45 | Epoch 3: train_R² = -0.000394, cpu memory usage=6.649364 GB


 36%|███▋      | 4/11 [01:11<02:05, 17.87s/it]

Time: 2025-01-13 03:44:03 | Epoch 4: train_R² = 0.003032, cpu memory usage=6.651665 GB


 45%|████▌     | 5/11 [01:28<01:47, 17.84s/it]

Time: 2025-01-13 03:44:21 | Epoch 5: train_R² = 0.005521, cpu memory usage=6.649212 GB


 55%|█████▍    | 6/11 [01:46<01:29, 17.83s/it]

Time: 2025-01-13 03:44:39 | Epoch 6: train_R² = 0.007510, cpu memory usage=6.650585 GB


 64%|██████▎   | 7/11 [02:04<01:11, 17.86s/it]

Time: 2025-01-13 03:44:56 | Epoch 7: train_R² = 0.009056, cpu memory usage=6.649544 GB


 73%|███████▎  | 8/11 [02:23<00:54, 18.01s/it]

Time: 2025-01-13 03:45:15 | Epoch 8: train_R² = 0.010154, cpu memory usage=6.651478 GB


 82%|████████▏ | 9/11 [02:40<00:35, 17.88s/it]

Time: 2025-01-13 03:45:32 | Epoch 9: train_R² = 0.011151, cpu memory usage=6.650494 GB


 91%|█████████ | 10/11 [02:58<00:17, 17.86s/it]

Time: 2025-01-13 03:45:50 | Epoch 10: train_R² = 0.012322, cpu memory usage=6.648247 GB


100%|██████████| 11/11 [03:16<00:00, 17.84s/it]

Time: 2025-01-13 03:46:08 | Epoch 11: train_R² = 0.013187, cpu memory usage=6.651562 GB





In [9]:
df_last_day = pl.read_parquet(os.path.join(dirpath_initial_parquet2, '1698.parquet')) # Load the last day's Parquet file
df_last_day = df_last_day.select([df_last_day[col].cast(pl.Float32).alias(col) for col in df_last_day.columns])
    
for feature in ALL_FEATURES:
    stats = global_stats_dict[feature]
    mean_val = stats["mean"]
    std_val = stats["std"]
    df_last_day = df_last_day.with_columns(
        z_score(df_last_day[feature], mean_val, std_val).alias(feature)
    )

df_last_day = df_last_day.fill_null(0.0)  # Replace missing values with 0
df_last_day = df_last_day.select([df_last_day[col].cast(pl.Float32).alias(col) for col in df_last_day.columns])
df_last_day = df_last_day.sort("time_id")

In [10]:
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_, df_last_day, X_list, y_list, w_list
    global predictor1, predictor2, predictor3
    global refiner1, refiner2, refiner3
    
    start_time = time.time()

    time_id = int(test[0, "time_id"])
    date_id = int(test[0, "date_id"])
    
    not_scored = (test["is_scored"] == False).all()
    test = test.select([test[col].cast(pl.Float32).alias(col) for col in test.columns])

    if lags is not None:
        lags = lags.select([lags[col].cast(pl.Float32).alias(col) for col in lags.columns])
        lags = lags.filter(
            (lags['time_id'].is_not_null()) &
            (lags['symbol_id'].is_not_null()) &
            (lags['responder_6_lag_1'].is_not_null())
        )

        lags_ = lags
        lags_ = lags_.drop("date_id")
        lags_ = lags_.rename({col: col.replace("_lag_1", "") for col in lags_.columns if "responder" in col})    
        lags_ = lags_.with_columns(z_score(lags_['time_id'], time_mean, time_std).alias('time_id'))
        lags_ = lags_.select([lags_[col].cast(pl.Float32).alias(col) for col in lags_.columns])

    # ========== Processing for time_id == 0 ==========
    if time_id == 0:
        print('-------- Date:', date_id)
        
        # ===== At the start of each date, finalize the previous day's data, split it, and add it to the list
        merged = (df_last_day.join(lags_, on=["time_id", "symbol_id"], how="inner") if lags_ is not None else df_last_day)
        merged = merged.fill_null(0.0)
        merged = merged.sort("time_id")
        
        unique_time_ids = merged['time_id'].unique()
        num_time_ids = len(unique_time_ids)
        split_size = num_time_ids // num_splits

        for i in range(num_splits):
            start = i * split_size
            end = (i + 1) * split_size if i < num_splits - 1 else num_time_ids
            if start >= num_time_ids:
                break
            subset_time_ids = unique_time_ids[start:end]
            subset_df = merged.filter(pl.col("time_id").is_in(subset_time_ids))
            
            X = torch.tensor(subset_df.select(ALL_FEATURES).to_numpy(), dtype=torch.float32)
            y = torch.tensor(subset_df.select('responder_6').to_numpy(), dtype=torch.float32).squeeze()
            w = torch.tensor(subset_df.select('weight').to_numpy(), dtype=torch.float32).squeeze() * weight_std + weight_mean

            X_list.append(X)
            y_list.append(y)
            w_list.append(w)
                
    # ========== Process test batch ==========
    for feature in ALL_FEATURES:
        stats = global_stats_dict[feature]
        mean_val = stats["mean"]
        std_val = stats["std"]
        test = test.with_columns(
            z_score(test[feature], mean_val, std_val).alias(feature)
        )

    test = test.fill_null(0.0)
    test = test.sort("time_id")

    # Collect today's feature data and set up for the next day
    if time_id == 0:
        # Assign directly for the first time_id
        test = test.select([test[col].cast(pl.Float32).alias(col) for col in test.columns])
        df_last_day = test
        df_last_day = df_last_day.select([df_last_day[col].cast(pl.Float32).alias(col) for col in df_last_day.columns])
    else:
        # Append new_chunk to existing df_last_day
        test = test.select([test[col].cast(pl.Float32).alias(col) for col in test.columns])
        df_last_day = pl.concat([df_last_day, test], how="vertical")
        df_last_day = df_last_day.select([df_last_day[col].cast(pl.Float32).alias(col) for col in df_last_day.columns])

    # ========== Training if have time ==========
    if time_id == 0:
        X_list_train, y_list_train, w_list_train = X_list[-num_splits:], y_list[-num_splits:], w_list[-num_splits:]
        train_dataset = RMFDataset(X_list_train, y_list_train, w_list_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
        
        predictor1.train()
        for batch, (X_batch, y_batch, w_batch) in enumerate(train_loader, 1):
            X_batch, y_batch, w_batch = X_batch[0].to(device), y_batch[0].to(device), w_batch[0].to(device)
            
            refiner1.zero_grad()
            y_pred = predictor1(X_batch)
            l2_reg = sum(param.pow(2).sum() for param in predictor1.parameters()) # L2 regularization
            loss = weighted_r2_loss(y_batch, y_pred, w_batch) + lambda_reg * l2_reg
            loss.backward()
            refiner1.step()
            
        print(f"Model 1 Fine tuned at date {date_id}")
        predictor1.eval()

        predictor2.train()
        for batch, (X_batch, y_batch, w_batch) in enumerate(train_loader, 1):
            X_batch, y_batch, w_batch = X_batch[0].to(device), y_batch[0].to(device), w_batch[0].to(device)
            
            refiner2.zero_grad()
            y_pred = predictor2(X_batch)
            l2_reg = sum(param.pow(2).sum() for param in predictor2.parameters()) # L2 regularization
            loss = weighted_r2_loss(y_batch, y_pred, w_batch) + lambda_reg * l2_reg
            loss.backward()
            refiner2.step()
            
        print(f"Model 2 Fine tuned at date {date_id}")
        predictor2.eval()

        predictor3.train()
        for batch, (X_batch, y_batch, w_batch) in enumerate(train_loader, 1):
            X_batch, y_batch, w_batch = X_batch[0].to(device), y_batch[0].to(device), w_batch[0].to(device)
            
            refiner3.zero_grad()
            y_pred = predictor3(X_batch)
            l2_reg = sum(param.pow(2).sum() for param in predictor3.parameters()) # L2 regularization
            loss = weighted_r2_loss(y_batch, y_pred, w_batch) + lambda_reg * l2_reg
            loss.backward()
            refiner3.step()
            
        print(f"Model 3 Fine tuned at date {date_id}")
        predictor3.eval()
            
    # ========== Inference ==========    
    if not_scored: # no need to score
        predictions = test.select(
            'row_id',
            pl.lit(0.0).alias('responder_6'),
        )
        
        predictions = predictions.with_columns(
            pl.col("row_id").cast(pl.Int64).alias("row_id"),  # Ensure 'row_id' is Int64
            pl.col("responder_6").cast(pl.Float64).alias("responder_6")  # Ensure 'responder_6' is Float64
        )

    else:
        predictor1.eval()
        predictor2.eval()
        predictor3.eval()
        with torch.no_grad():
            X_test = torch.tensor(test.select(ALL_FEATURES).to_numpy(), dtype=torch.float32).to(device)
            y_test1 = predictor1(X_test).detach().cpu().numpy().flatten()
            y_test2 = predictor2(X_test).detach().cpu().numpy().flatten()
            y_test3 = predictor3(X_test).detach().cpu().numpy().flatten()

        y_test = (y_test1 + y_test2 + y_test3) / 3.0        
        y_test = np.nan_to_num(y_test, nan=0.0, posinf=0.0, neginf=-0.0)
        
        # Create predictions DataFrame
        predictions = pl.DataFrame({
            "row_id": test["row_id"].to_numpy(),  # Convert 'row_id' to numpy for compatibility
            "responder_6": y_test.astype('float64')    # Use flattened dummy output
        })
        
        # predictions = predictions.with_columns(pl.col("responder_6").fill_null(0.0).alias("responder_6"))
        predictions = predictions.with_columns(
            pl.col("row_id").cast(pl.Int64).alias("row_id"),  # Ensure 'row_id' is Int64
            pl.col("responder_6").cast(pl.Float64).alias("responder_6")  # Ensure 'responder_6' is Float64
        )
    
    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
        
    assert len(predictions) == len(test) # Confirm has as many rows as the test data.
        
    # ========== Post Assertion ==========
    # if predictions["row_id"].null_count() > 0 or predictions["responder_6"].null_count() > 0:
    #     raise AssertionError(
    #         f"Found {total_nulls} null values in the predictions DataFrame for date_id: {date_id}, time_id: {time_id}."
    #     )

    # nan_present = predictions['responder_6'].is_nan().any()
    # if nan_present:
    #     raise AssertionError(
    #         f"Found NaN values in the 'responder_6' column of the predictions DataFrame for date_id: {date_id}, time_id: {time_id}."
    #     )
    
    # print(predictions)
    return predictions

In [11]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            # '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            # '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
            '/kaggle/input/synthetic-data/synthetic_test.parquet',
            '/kaggle/input/synthetic-data/synthetic_lag.parquet',
            
        )
    )

-------- Date: 0
Model 1 Fine tuned at date 0
Model 2 Fine tuned at date 0
Model 3 Fine tuned at date 0
-------- Date: 1
Model 1 Fine tuned at date 1
Model 2 Fine tuned at date 1
Model 3 Fine tuned at date 1
-------- Date: 2
Model 1 Fine tuned at date 2
Model 2 Fine tuned at date 2
Model 3 Fine tuned at date 2
-------- Date: 3
Model 1 Fine tuned at date 3
Model 2 Fine tuned at date 3
Model 3 Fine tuned at date 3
-------- Date: 4
Model 1 Fine tuned at date 4
Model 2 Fine tuned at date 4
Model 3 Fine tuned at date 4
-------- Date: 5
Model 1 Fine tuned at date 5
Model 2 Fine tuned at date 5
Model 3 Fine tuned at date 5
-------- Date: 6
Model 1 Fine tuned at date 6
Model 2 Fine tuned at date 6
Model 3 Fine tuned at date 6
-------- Date: 7
Model 1 Fine tuned at date 7
Model 2 Fine tuned at date 7
Model 3 Fine tuned at date 7
-------- Date: 8
Model 1 Fine tuned at date 8
Model 2 Fine tuned at date 8
Model 3 Fine tuned at date 8
