In [17]:
import math
import random
import warnings
from typing import Literal, NamedTuple
import os
import numpy as np
import rtdl_num_embeddings  # https://github.com/yandex-research/rtdl-num-embeddings
import scipy.special
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
from torch import Tensor
from tqdm import tqdm

warnings.simplefilter('ignore')
from tabm_reference import Model, make_parameter_groups
import polars as pl
import pandas as pd 
import gc
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import torch
import matplotlib.pyplot as plt
import seaborn as sns 
warnings.resetwarnings()
import time
import wandb
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb

warnings.filterwarnings("ignore", category=ResourceWarning)

In [2]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    # Seed the current CUDA device first, then every visible CUDA device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [3]:
def calculate_weighted_r2_by_periods(df, y_true_col, y_pred_col, weight_col, date_col='date_id', period_days=15):
    """Compute the sample-weighted R² over consecutive blocks of distinct dates.

    The distinct values of ``date_col`` are sorted and cut into consecutive
    windows of ``period_days`` dates. Only complete windows are scored; a
    trailing window with fewer than ``period_days`` dates is dropped.

    Args:
        df: pandas DataFrame holding the four referenced columns.
        y_true_col: Column with the ground-truth values.
        y_pred_col: Column with the predictions.
        weight_col: Column with per-row weights (normalised per window).
        date_col: Column used to define the windows.
        period_days: Number of distinct dates per window.

    Returns:
        pandas DataFrame with one row per scored window and the columns
        ``period_start``, ``period_end``, ``weighted_r2`` and ``n_samples``.
    """
    ordered = df.sort_values(date_col)
    unique_dates = np.sort(ordered[date_col].unique())

    rows = []
    # Stepping up to `len - period_days` visits exactly the complete windows.
    for window_start in range(0, len(unique_dates) - period_days + 1, period_days):
        window_dates = unique_dates[window_start:window_start + period_days]
        window = ordered[ordered[date_col].isin(window_dates)]
        if len(window) == 0:
            continue

        actual = window[y_true_col].values
        predicted = window[y_pred_col].values
        w = window[weight_col].values
        w = w / np.sum(w)  # normalise so the weights sum to one

        weighted_mean = np.sum(w * actual)
        residual_ss = np.sum(w * (actual - predicted) ** 2)
        total_ss = np.sum(w * (actual - weighted_mean) ** 2)

        rows.append({
            'period_start': window_dates[0],
            'period_end': window_dates[-1],
            'weighted_r2': 1 - (residual_ss / total_ss),
            'n_samples': len(window)
        })

    return pd.DataFrame(rows)

In [4]:
def create_daily_stats(train_df, target_col):
    """Join previous-day summary statistics of ``target_col`` onto every row.

    For each ``(symbol_id, date_id)`` group the function aggregates
    ``target_col`` (mean, std, min, max, median, skew, kurtosis, last, first,
    range, sum, count), shifts those aggregates by one available day within
    each symbol, and left-joins the shifted values back onto ``train_df``.
    The first day of every symbol therefore gets nulls.

    ``first_value`` / ``last_value`` follow the row order of ``train_df``
    inside each group, so the caller is expected to have sorted by time.

    Args:
        train_df: polars DataFrame with ``symbol_id``, ``date_id`` and
            ``target_col``.
        target_col: Name of the column to summarise.

    Returns:
        ``train_df`` with the added columns ``days_since_lag_1`` (gap in
        ``date_id`` to the day the lagged statistics come from) and
        ``lag_1_<stat>_<target_col>`` for every statistic.
    """
    target = pl.col(target_col)
    daily_avg = (
        train_df
        .group_by(["symbol_id", "date_id"])
        .agg([
            target.mean().alias("daily_avg"),
            target.std().alias("daily_std"),
            target.min().alias("daily_min"),
            target.max().alias("daily_max"),
            target.median().alias("daily_median"),
            target.skew().alias("daily_skew"),
            target.kurtosis().alias("daily_kurtosis"),
            target.last().alias("last_value"),
            target.first().alias("first_value"),
            (target.max() - target.min()).alias("daily_range"),
            target.sum().alias("target_sum"),
            target.count().alias("daily_count"),
        ])
        # shift(1) below relies on chronological order within each symbol.
        .sort(["symbol_id", "date_id"])
    )

    columns_to_shift = [
        "daily_avg", "daily_std", "daily_min", "daily_max", "daily_median",
        "daily_skew", "daily_kurtosis", "last_value", "first_value",
        "daily_range", "target_sum", "date_id", "daily_count",
    ]

    daily_avg = daily_avg.with_columns([
        pl.col(col_name)
        .shift(1)
        .over("symbol_id")
        .alias(f"lag_1_{col_name}_{target_col}")
        for col_name in columns_to_shift
    ])

    # The lagged date column is named after target_col; deriving the name here
    # (instead of hard-coding "responder_6") keeps the function usable for any
    # target column.
    lag_date_col = f"lag_1_date_id_{target_col}"
    daily_avg = daily_avg.with_columns(
        (pl.col("date_id") - pl.col(lag_date_col)).alias("days_since_lag_1")
    )

    # The lagged date itself is only needed for the gap feature, so drop it.
    lag_cols = [
        f"lag_1_{col_name}_{target_col}"
        for col_name in columns_to_shift
        if col_name != "date_id"
    ]
    daily_avg = daily_avg.select(["symbol_id", "date_id", "days_since_lag_1"] + lag_cols)

    return train_df.join(daily_avg, on=["symbol_id", "date_id"], how="left")


def create_daily_stats2(train_df, target_col):
    """Join a reduced set of previous-day statistics of ``target_col``.

    Aggregates ``target_col`` per ``(symbol_id, date_id)`` (mean, std, min,
    max, median, last, range, sum), shifts the aggregates by one available day
    within each symbol and left-joins them onto ``train_df`` as
    ``lag_1_<stat>_<target_col>`` columns.

    Args:
        train_df: polars DataFrame with ``symbol_id``, ``date_id``,
            ``time_id`` and ``target_col``.
        target_col: Name of the column to summarise.

    Returns:
        The joined frame, sorted by ``date_id``, ``time_id``, ``symbol_id``.
    """
    group_keys = ["symbol_id", "date_id"]
    target = pl.col(target_col)

    aggregations = [
        target.mean().alias("daily_avg"),
        target.std().alias("daily_std"),
        target.min().alias("daily_min"),
        target.max().alias("daily_max"),
        target.median().alias("daily_median"),
        target.last().alias("last_value"),
        (target.max() - target.min()).alias("daily_range"),
        target.sum().alias("target_sum"),
    ]
    stat_names = (
        "daily_avg", "daily_std", "daily_min", "daily_max",
        "daily_median", "last_value", "daily_range", "target_sum",
    )

    # Sort so that shift(1) within a symbol picks up the previous day.
    per_day = train_df.group_by(group_keys).agg(aggregations).sort(group_keys)

    lagged = [
        pl.col(stat).shift(1).over("symbol_id").alias(f"lag_1_{stat}_{target_col}")
        for stat in stat_names
    ]
    per_day = per_day.with_columns(lagged)
    per_day = per_day.select(
        group_keys + [f"lag_1_{stat}_{target_col}" for stat in stat_names]
    )

    joined = train_df.join(per_day, on=group_keys, how="left")
    joined = joined.sort(["date_id", "time_id", "symbol_id"])
    gc.collect()
    return joined


In [5]:
%%time
# Read training partitions 6-9 and stack them into a single frame.
df_list = [
    pl.read_parquet(f"train.parquet/partition_id={part_id}/part-0.parquet")
    for part_id in ("6", "7", "8", "9")
]
train_df = pl.concat(df_list)
del df_list
gc.collect()

# Order rows by symbol and time before the per-day aggregations are computed.
train_df = train_df.sort(["symbol_id", "date_id", "time_id"])
responder_cols = [col for col in train_df.columns if col.startswith('responder')]

CPU times: user 1min 57s, sys: 24.1 s, total: 2min 21s
Wall time: 9.27 s


In [6]:
%%time
# Full set of lagged daily statistics for responder_6.
# NOTE(review): this cell rebinds train_df, so re-running it without reloading
# the data would presumably join the lag columns a second time.
for col_name in ["responder_6"]:
    train_df = create_daily_stats(train_df, col_name)
gc.collect()

# Reduced set of lagged daily statistics for the remaining responders.
other_responders = [
    "responder_0", "responder_1", "responder_2", "responder_3",
    "responder_4", "responder_5", "responder_7", "responder_8",
]
for col_name in other_responders:
    train_df = create_daily_stats2(train_df, col_name)
gc.collect()

# Number of missing raw features per row.
default_features = [f"feature_{idx:02d}" for idx in range(79)]
train_df = train_df.with_columns(
    null_count=pl.sum_horizontal([pl.col(col).is_null() for col in default_features])
)

# Cyclical encodings of time_id with periods 967 and 483.
train_df = train_df.with_columns([
    (2 * np.pi * pl.col("time_id") / 967).sin().cast(pl.Float32).alias("sin_time_id"),
    (2 * np.pi * pl.col("time_id") / 967).cos().cast(pl.Float32).alias("cos_time_id"),
    (2 * np.pi * pl.col("time_id") / 483).sin().cast(pl.Float32).alias("sin_time_id_halfday"),
    (2 * np.pi * pl.col("time_id") / 483).cos().cast(pl.Float32).alias("cos_time_id_halfday"),
])
gc.collect()


CPU times: user 2min 59s, sys: 1min 59s, total: 4min 59s
Wall time: 19.3 s


0

In [7]:
def prepare_data_for_tabm(df, feature_cols, cat_cols, add_noise = False, noise_scale = 0.01, split_date_id = 1550):
    """Build the standardised hold-out arrays consumed by the TabM model.

    NaNs and nulls are replaced with 0. Rows with ``date_id < split_date_id``
    are used only to fit a ``StandardScaler``; rows with
    ``date_id >= split_date_id`` are returned as the ``'test'`` part.

    Args:
        df: polars DataFrame containing ``date_id``, ``responder_6``,
            ``weight`` and every column in ``feature_cols``.
        feature_cols: All model input columns (numeric and categorical).
        cat_cols: Subset of ``feature_cols`` treated as categorical; these are
            cast to int64 and not scaled.
        add_noise: If true, fit the scaler on the training features plus
            Gaussian noise drawn from a generator seeded with 0.
        noise_scale: Standard deviation of that noise.
        split_date_id: First ``date_id`` of the hold-out part. The default
            (1550) reproduces the previously hard-coded split.

    Returns:
        ``(data_numpy, cat_cardinalities)`` where ``data_numpy`` is
        ``{'test': {'x_cont', 'x_cat', 'y', 'weights'}}`` and ``x_cont`` has
        been standardised with the scaler fitted on the training rows.
    """
    df = df.fill_nan(None).fill_null(0)
    numeric_cols = [x for x in feature_cols if x not in cat_cols]
    # NOTE(review): hard-coded rather than derived from cat_cols or the data;
    # confirm it matches the cardinalities the model was trained with.
    cat_cardinalities = [39, 13]

    # Filter the hold-out rows once instead of re-filtering for every array.
    test_df = df.filter(pl.col("date_id") >= split_date_id)
    X_cont_train = (
        df.filter(pl.col("date_id") < split_date_id)
        .select(numeric_cols)
        .to_numpy()
        .astype(np.float32)
    )

    data_numpy = {
        'test': {
            'x_cont': test_df.select(numeric_cols).to_numpy().astype(np.float32),
            'x_cat': test_df.select(cat_cols).to_numpy().astype(np.int64),
            'y': test_df.select("responder_6").to_numpy().astype(np.float32),
            'weights': test_df.select("weight").to_numpy().astype(np.float32),
        },
    }

    if add_noise:
        noise = (
            np.random.default_rng(0)
            .normal(0.0, noise_scale, X_cont_train.shape)
            .astype(X_cont_train.dtype)
        )
        preprocessing = sklearn.preprocessing.StandardScaler().fit(X_cont_train + noise)
    else:
        preprocessing = sklearn.preprocessing.StandardScaler().fit(X_cont_train)

    # Apply the training-set statistics to every returned part.
    for part in data_numpy:
        data_numpy[part]['x_cont'] = preprocessing.transform(data_numpy[part]['x_cont']).astype(np.float32)

    return data_numpy, cat_cardinalities

# TabM Model Loading

In [8]:
# Training-time configuration saved next to the TabM model.
# NOTE(review): joblib.load unpickles the file - only load trusted configs.
TABM_CONFIG = joblib.load("tabm_model_folder/tabm_v14_model_results/tabm_v14_model_config.pkl")

tabm_feature_cols, cat_cols = TABM_CONFIG["feature_cols"], TABM_CONFIG["cat_features"]
len(tabm_feature_cols)

115

In [9]:
# Standardised hold-out arrays for TabM.
data_numpy, cat_cardinalities = prepare_data_for_tabm(
    df=train_df,
    feature_cols=tabm_feature_cols,
    cat_cols=cat_cols,
)

In [10]:
# Reads the same config file as the "TabM Model Loading" cell; presumably
# repeated so this cell can be run on its own.
# NOTE(review): joblib.load unpickles the file - only load trusted configs.
TABM_CONFIG = joblib.load("tabm_model_folder/tabm_v14_model_results/tabm_v14_model_config.pkl")

task_type = 'regression'
n_classes = None


class RegressionLabelStats(NamedTuple):
    """Mean and standard deviation of the regression target."""

    mean: float
    std: float


regression_label_stats = RegressionLabelStats(
    mean=-0.0014552467036992311, std=0.8571650385856628
)

# Autocast dtype: bfloat16 when the GPU supports it, float16 on other GPUs,
# and no mixed precision at all on CPU.
if not torch.cuda.is_available():
    amp_dtype = None
elif torch.cuda.is_bf16_supported():
    amp_dtype = torch.bfloat16
else:
    amp_dtype = torch.float16
amp_enabled = amp_dtype is not None

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# No binning is used, so the model gets the 3-block MLP backbone and no
# numerical embeddings.
bins = None
tab_model = Model(
    n_num_features=TABM_CONFIG["n_cont_features"],
    cat_cardinalities=TABM_CONFIG["cat_cardinalities"],
    n_classes=n_classes,
    backbone=dict(
        type='MLP',
        n_blocks=2 if bins is not None else 3,
        d_block=TABM_CONFIG["d_block"],
        dropout=TABM_CONFIG["drop_rate"],
    ),
    bins=bins,
    num_embeddings=(
        dict(
            type='PiecewiseLinearEmbeddings',
            d_embedding=16,
            activation=False,
            version='B',
        )
        if bins is not None
        else None
    ),
    arch_type=TABM_CONFIG["model_arch"],
    k=TABM_CONFIG["model_k"],
)

@torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype)  # type: ignore[code]
def apply_model(model, data, part: str, idx: Tensor) -> Tensor:
    """Run ``model`` on the rows ``idx`` of ``data[part]`` under autocast.

    Args:
        model: Callable taking ``(x_cont, x_cat)``.
        data: Mapping ``part -> {'x_cont': ..., optional 'x_cat': ...}``.
        part: Key of the split to read from.
        idx: Index tensor selecting the rows of the batch.

    Returns:
        The model output with a trailing singleton dimension squeezed away,
        cast to float32.
    """
    split = data[part]
    x_cont = split['x_cont'][idx]
    # Categorical inputs are optional; the model receives None without them.
    x_cat = split['x_cat'][idx] if 'x_cat' in split else None
    output = model(x_cont, x_cat)
    return output.squeeze(-1).float()

def inference_step(model, batch_size, device, data_numpy):
    """Predict the ``'test'`` part of ``data_numpy`` in batches.

    Model outputs are mapped back to the target scale with
    ``regression_label_stats``, averaged over axis 1 (presumably the ensemble
    members of the model - confirm against the model definition) and clipped
    to ``[-5, 5]``.

    Args:
        model: The network to evaluate; it is switched to eval mode and moved
            to ``device``.
        batch_size: Number of rows per forward pass.
        device: Torch device used for the model and the data.
        data_numpy: Mapping ``part -> {name: numpy array}``; must contain
            ``data_numpy['test']['x_cont']``.

    Returns:
        1-D numpy array with one prediction per test row (empty when the test
        part has no rows).
    """
    model.eval()
    model.to(device)
    data = {
        part: {k: torch.as_tensor(v, device=device) for k, v in data_numpy[part].items()}
        for part in data_numpy
    }

    total_samples = len(data_numpy["test"]["x_cont"])
    if total_samples == 0:
        # np.concatenate raises on an empty list, so return early instead.
        return np.empty(0, dtype=np.float32)

    y_preds_list = []
    with torch.no_grad():
        with tqdm(
            torch.arange(total_samples, device=device).split(batch_size),
            desc="Testing",
            total=math.ceil(total_samples / batch_size),
            leave=True,
        ) as pbar:
            for batch_idx in pbar:
                # Use the model passed in, not the notebook-global `tab_model`.
                y_pred = apply_model(model, data, 'test', batch_idx)
                y_preds_cpu = y_pred.detach().cpu().numpy()
                # Undo the target standardisation.
                y_preds_cpu = y_preds_cpu * regression_label_stats.std + regression_label_stats.mean
                y_preds_cpu = y_preds_cpu.mean(axis=1)
                y_preds_cpu = np.clip(y_preds_cpu, -5, 5)
                y_preds_list.append(y_preds_cpu)

    return np.concatenate(y_preds_list)

In [11]:
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs, and avoids the torch.load FutureWarning.
tab_model.load_state_dict(
    torch.load(
        "tabm_experiment_models/tabm_73564_best.pt",
        map_location=device,
        weights_only=True,
    )
)
tab_model.to(device)

  tab_model.load_state_dict(torch.load("tabm_experiment_models/tabm_73564_best.pt", map_location = device))


Model(
  (cat_module): OneHotEncoding0d()
  (backbone): MLP(
    (blocks): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=170, out_features=128, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
      )
      (1-2): 2 x Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (minimal_ensemble_adapter): ScaleEnsemble()
  (output): NLinear()
)

In [12]:
# TabM predictions for the hold-out split.
y_preds_tabm = inference_step(
    model=tab_model,
    batch_size=8096,
    device=device,
    data_numpy=data_numpy,
)

# Weighted R² of TabM on the full hold-out split.
r2_score(
    data_numpy["test"]["y"],
    y_preds_tabm,
    sample_weight=data_numpy["test"]["weights"],
)

Testing: 100%|██████████| 684/684 [00:01<00:00, 390.08it/s]


0.009126475088499397

## CatBoost Model

In [13]:
# Fetch the CatBoost model artefact and its config from the W&B run.
api = wandb.Api()

run = api.run("turkenm/js_catboost/my5jxd8c")

config = run.config

summary = run.summary

# Reset first so a stale `model_name` from an earlier execution of this cell
# cannot hide a run that has no .cbm file.
model_name = None
files = run.files()
for file in files:
    if file.name.endswith(".cbm"):
        model_name = file.name
        file.download(root = "cb_train_models",replace = False, exist_ok = True)
        print(f"{file.name} downloaded")

if model_name is None:
    raise FileNotFoundError(
        "No .cbm model file found in W&B run turkenm/js_catboost/my5jxd8c"
    )

cb_model = CatBoostRegressor()
cb_model.load_model(f"cb_train_models/{model_name}")
cb_feature_cols = config["feature_cols"]

cb_81156.cbm downloaded


In [14]:
# Hold-out rows (date_id >= 1550) converted to pandas for CatBoost.
is_holdout = pl.col("date_id") >= 1550
X_test = train_df.filter(is_holdout).select(cb_feature_cols).to_pandas()
y_test = train_df.filter(is_holdout).select("responder_6").to_pandas()
test_weights = train_df.filter(is_holdout).select("weight").to_pandas()

test_data = Pool(
    X_test,
    y_test,
    weight=test_weights,
    cat_features=["symbol_id", "feature_09", "feature_11"],
)

In [15]:
# CatBoost predictions and weighted R² on the full hold-out split.
y_preds_cb = cb_model.predict(test_data)
r2_score(y_true=y_test, y_pred=y_preds_cb, sample_weight=test_weights)

0.008764811167079056

## LightGBM Model

In [18]:
# Version tag shared by the LightGBM model file and its column list.
lgb_version = "v8"
lgb_model_cols = joblib.load(f"model_cols/lgb_{lgb_version}_model_cols.pkl")
lgb_model = lgb.Booster(model_file=f"lgb_model_results/lgb_{lgb_version}_train.txt")

In [25]:
# Hold-out features for LightGBM, with the categorical inputs cast to the
# pandas "category" dtype.
X_test2 = train_df.filter(pl.col("date_id") >= 1550).select(lgb_model_cols).to_pandas()
X_test2 = X_test2.astype(
    {name: "category" for name in ("symbol_id", "feature_09", "feature_10", "feature_11")}
)
y_preds_lgb = lgb_model.predict(X_test2[lgb_model_cols])

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])


In [27]:
# Hold-out keys, weights and target, plus each model's predictions and two
# equal-weight blends.
tmp_df = (
    train_df
    .filter(pl.col("date_id") >= 1550)
    .select(["date_id","time_id", "symbol_id","weight","responder_6"])
    .to_pandas()
    .assign(
        cb_preds=y_preds_cb,
        tabm_preds=y_preds_tabm,
        lgb_preds=y_preds_lgb,
        tabm_cb=y_preds_cb * 0.5 + y_preds_tabm * 0.5,
        tabm_cb_lgb=(y_preds_tabm + y_preds_cb + y_preds_lgb) / 3,
    )
)

In [34]:
# Two trailing evaluation windows of the hold-out frame.
# NOTE(review): the variable names imply 122 and 40 remaining dates - confirm
# against the maximum date_id in the data.
last_122_days = tmp_df.query(expr="date_id > 1576")
last_40_days = tmp_df.query(expr="date_id > 1658")

In [31]:
# Weighted R² of the CatBoost predictions on rows with date_id > 1576.
r2_score(
    last_122_days["responder_6"],
    last_122_days["cb_preds"],
    sample_weight=last_122_days["weight"],
)

0.00903990588059611

In [32]:
# Weighted R² of the TabM predictions on rows with date_id > 1576.
r2_score(
    last_122_days["responder_6"],
    last_122_days["tabm_preds"],
    sample_weight=last_122_days["weight"],
)

0.010110281111514907

In [35]:
# Weighted R² of the 50/50 TabM + CatBoost blend on rows with date_id > 1576.
r2_score(
    last_122_days["responder_6"],
    last_122_days["tabm_cb"],
    sample_weight=last_122_days["weight"],
)

0.010635510678499771

In [36]:
# Weighted R² of the LightGBM predictions on rows with date_id > 1576.
r2_score(
    last_122_days["responder_6"],
    last_122_days["lgb_preds"],
    sample_weight=last_122_days["weight"],
)

0.0076311550928545024

In [37]:
# Weighted R² of the three-model average on rows with date_id > 1576.
r2_score(
    last_122_days["responder_6"],
    last_122_days["tabm_cb_lgb"],
    sample_weight=last_122_days["weight"],
)

0.010079655363787854

In [38]:
# Weighted R² of the CatBoost predictions on rows with date_id > 1658.
r2_score(
    last_40_days["responder_6"],
    last_40_days["cb_preds"],
    sample_weight=last_40_days["weight"],
)

0.007758484005299882

In [39]:
# Weighted R² of the TabM predictions on rows with date_id > 1658.
r2_score(
    last_40_days["responder_6"],
    last_40_days["tabm_preds"],
    sample_weight=last_40_days["weight"],
)

0.008220072440771498

In [40]:
# Weighted R² of the LightGBM predictions on rows with date_id > 1658.
r2_score(
    last_40_days["responder_6"],
    last_40_days["lgb_preds"],
    sample_weight=last_40_days["weight"],
)

0.007381033884956034

In [41]:
# Weighted R² of the three-model average on rows with date_id > 1658.
r2_score(
    last_40_days["responder_6"],
    last_40_days["tabm_cb_lgb"],
    sample_weight=last_40_days["weight"],
)

0.008761453858847412

In [42]:
# Weighted R² of the 50/50 TabM + CatBoost blend on rows with date_id > 1658.
r2_score(
    last_40_days["responder_6"],
    last_40_days["tabm_cb"],
    sample_weight=last_40_days["weight"],
)

0.008871022450225863

In [43]:
# Weighted R² of the TabM predictions on rows with date_id > 1658
# (repeats the TabM last-40-days cell earlier in this notebook).
r2_score(
    last_40_days["responder_6"],
    last_40_days["tabm_preds"],
    sample_weight=last_40_days["weight"],
)

0.008220072440771498

In [22]:
# Weighted R² of TabM per block of 30 distinct dates; only complete blocks
# are reported.
calculate_weighted_r2_by_periods(
    tmp_df,
    y_true_col="responder_6",
    y_pred_col="tabm_preds",
    weight_col="weight",
    period_days=30,
)

Unnamed: 0,period_start,period_end,weighted_r2,n_samples
0,1550,1579,0.00487,1116104
1,1580,1609,0.015151,1109328
2,1610,1639,0.006937,1111264
3,1640,1669,0.007288,1116104
