# Data Split

In [None]:
from datetime import timedelta
import polars as pl
import os
DATA_DIR = 'data'
SAVE_PATH = os.path.join(DATA_DIR, 'val')
os.makedirs(os.path.join(DATA_DIR, 'val'), exist_ok=True)
EVAL_DAYS_TRESHOLD = 14

df_test_users = pl.read_parquet(os.path.join(DATA_DIR, 'test_users.pq'))
df_clickstream = pl.read_parquet(os.path.join(DATA_DIR, 'clickstream.pq'))

#df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq'))
#df_text_features = pl.read_parquet(os.path.join(DATA_DIR, 'clickstream.pq'))
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

In [6]:
treshhold = df_clickstream['event_date'].max() - timedelta(days=EVAL_DAYS_TRESHOLD)

df_train = df_clickstream.filter(df_clickstream['event_date']<= treshhold)
df_eval = df_clickstream.filter(df_clickstream['event_date']> treshhold)[['cookie', 'node', 'event']]

df_eval = (
        df_eval
        .join(df_train, on=['cookie', 'node'], how='anti')
        .filter(
                pl.col('event').is_in(
                    df_event.filter(pl.col('is_contact')==1)['event'].unique()
                )
            )
        .filter(
        pl.col('cookie').is_in(df_train['cookie'].unique())
        ).filter(
            pl.col('node').is_in(df_train['node'].unique())
        )
)
df_eval = df_eval.unique(['cookie', 'node'])

In [13]:
df_train.write_parquet(os.path.join(SAVE_PATH, 'clickstream.pq'))
df_eval.write_parquet(os.path.join(SAVE_PATH, 'gt.pq'))

# Retrieval Train

## Autoencoders

### EASE_DAN

In [1]:
from autoencoders.model import EASE_DAN
from utils import Enc, convert_to_sparse, process_in_batches, recall_at
import polars as pl
import os
import numpy as np

DATA_DIR = 'data'
VAL_PATH = os.path.join(DATA_DIR, 'val')
PREDICTION_PATH = 'predictions'
MODEL_NAME = 'EASE_DAN'
EVAL_DAYS_TRESHOLD = 14
N_ITEMS = 30_000

df_clickstream = pl.read_parquet(os.path.join(VAL_PATH, 'clickstream.pq'))
df_eval = pl.read_parquet(os.path.join(VAL_PATH, 'gt.pq')).join(df_clickstream, on='cookie', how='semi')
df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq')) 
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

df_train = df_clickstream
eval_users = df_eval['cookie'].unique().to_list()

In [None]:
enc = Enc(item_key='node', user_key='cookie')
enc_eval_users = [enc.user_id_dict.get(i) for i in eval_users]

df_train = df_train.join(df_train.unique(subset=['node', 'cookie']).select('node').group_by('node').len().sort('len').tail(N_ITEMS).drop('len'),
                    on='node')


df_train = df_train.with_columns(
        (pl.lit(1)).alias("event_weight")
    )
df_eval = df_eval.join(df_train, on='cookie', how='semi')
df_eval = df_eval.with_columns(pl.col('node').cast(pl.Int64))
result = enc.fit(train_df=df_train, event_weight='event_weight')


X = (convert_to_sparse(result, enc) > 0).astype(np.float32)
ease = EASE_DAN(num_items=N_ITEMS)
ease.fit(X)
recommendations_df = process_in_batches(
    enc_eval_users=enc_eval_users,
    X=X,
    G=ease.W,
    k=300, # top_k
    batch_size=1000,
    fill_value=-1000
)
recs = enc.inverse_transform(recommendations_df) 
recs = recs.with_columns(
    pl.col('score').rank(descending=True).over('cookie').alias(f'rank_rd'),
    pl.col('cookie').cast(pl.Int64),
    pl.col('node').cast(pl.Int64)
)
print('UNSEEN-RECALL@40', recall_at(df_eval, recs, k=40))
print('UNSEEN-RECALL@100', recall_at(df_eval, recs, k=100))

UNSEEN-RECALL@40 0.17348943701276365
UNSEEN-RECALL@100 0.2819826797115451


In [None]:
os.makedirs(os.path.join(PREDICTION_PATH, MODEL_NAME), exist_ok=True)
recs.write_parquet(os.path.join(PREDICTION_PATH, MODEL_NAME, 'EASE_DAN_VAL.pq'))

### RDLAE

In [None]:
from autoencoders.model import RDLAE
from utils import truncate, process_batch_w_weight
from utils import Enc, convert_to_sparse, process_in_batches, recall_at
import polars as pl
import os
import numpy as np
from utils import convert

DATA_DIR = 'data'
VAL_PATH = os.path.join(DATA_DIR, 'val')
PREDICTION_PATH = 'predictions'
MODEL_NAME = 'RDLAE'
N_ITEMS = 50_000
DECAY_RATE_POS = 0.01
DECAY_RATE_TIME = 0.05
BAYESSIAN_C = 100
SMOOTHED_ALPHA = 1
SMOOTHED_BETA = 2
NOISE_INJECTION = 0.2
RATIO_COLUMN = 'bayesian_ratio_C'

df_clickstream = pl.read_parquet(os.path.join(VAL_PATH, 'clickstream.pq'))
df_eval = pl.read_parquet(os.path.join(VAL_PATH, 'gt.pq')).join(df_clickstream, on='cookie', how='semi')
df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq')) 
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

df_train = df_clickstream.join(df_event, on='event', how='left')
df_train = df_train.join(df_train.filter(pl.col('is_contact')==1).unique(subset=['node', 'cookie']).select('node').group_by('node').len().sort('len').tail(N_ITEMS).drop('len'),
                            on='node')
eval_users = df_eval['cookie'].unique().to_list()

#### Making bayessian columns to boost contact ration info into collaborative filtering

In [2]:
time_diff = (df_train.select('cookie', 'node', 'event_date', 'is_contact')
                           .with_columns(pl.col('event_date').dt.truncate("1d").alias('date'))
                           .group_by('cookie', 'node').agg(pl.col('event_date').max().alias('node_last_visit'),
                                                           pl.col('event_date').min().alias('node_first_visit'),
                                                           (pl.col('date').n_unique()-1).alias('n_days_clicks'),
                                                           pl.col('date').filter(pl.col('is_contact') > 0).n_unique().alias('n_days_contacts'),
                                                           pl.col('is_contact').sum().cast(pl.Int32),
                                                           pl.len().alias('cnt'),
                                                           ))
time_diff = time_diff.with_columns(
    ( -DECAY_RATE_POS * (pl.col('node_last_visit').rank(descending=True).over('cookie')-1)).exp().cast(pl.Float32).alias(f'exp_pos'),
    ( DECAY_RATE_TIME * ( pl.col('node_last_visit') - df_train.select('event_date').max() ).dt.total_days().cast(pl.Int64)).exp().cast(pl.Float32).alias('exp_time')
)

In [3]:
enc = Enc(item_key='node', user_key='cookie')
train = enc.fit(train_df=df_train.with_columns(event_weight=1.), event_weight='event_weight')
num_users, num_items = enc.get_num()
enc_eval_users = [enc.user_id_dict.get(i) for i in eval_users]

X = convert_to_sparse(train, enc)

n2n = pl.DataFrame({'node':enc.item_id_dict.keys(),   'le_node':enc.item_id_dict.values()})
c2c = pl.DataFrame({'cookie':enc.user_id_dict.keys(), 'le_cookie':enc.user_id_dict.values()})

train_sum = time_diff.join(n2n, on='node').join(c2c, on='cookie').drop('node_last_visit', 'node_first_visit')
train_sum.filter(pl.col('cookie')==1).tail(2)

cookie,node,n_days_clicks,n_days_contacts,is_contact,cnt,exp_pos,exp_time,le_node,le_cookie
i64,u32,u32,u32,i32,u32,f32,f32,i64,i64
1,262019,0,0,0,2,0.103312,0.740818,36530,1
1,214338,1,0,0,4,0.177284,0.778801,28726,1


In [4]:
extra_user_features = time_diff.group_by('cookie').agg(pl.col('n_days_clicks').sum().alias('sum_n_days_clicks'),
                                 pl.col('n_days_clicks').max().alias('max_n_days_clicks'),
                                 pl.col('n_days_contacts').sum().alias('sum_n_days_contacts'),
                                 pl.col('n_days_contacts').max().alias('max_n_days_contacts'),
                                 pl.col('is_contact').sum().alias('sum_is_contact'),
                                (pl.col('is_contact').sum() / pl.col('cnt').sum()).alias('user_contact_ratio'),
                                (pl.col('exp_pos') * pl.col('is_contact')).sum().alias('exp_pos_contact'),
                                (pl.col('exp_time') * pl.col('is_contact')).sum().alias('exp_time_contact'),
                                )
extra_user_features = extra_user_features.with_columns(
    pl.col('user_contact_ratio').cast(pl.Float32),
    pl.col('exp_pos_contact').cast(pl.Float32),
    pl.col('exp_time_contact').cast(pl.Float32),
)

In [None]:
default_contact_ratio =(df_train
                .select('cookie', 'node', 'is_contact', 'event_date').sort('is_contact')
                .unique(subset=['cookie', 'node', 'is_contact'], keep='last')
                .select('node', 'is_contact').with_columns(value=pl.lit(1))
                .pivot(
                    values="value",
                    index="node",
                    columns="is_contact",
                    aggregate_function="sum",
                ).fill_null(0)).with_columns(pl.col('0').cast(pl.Int32).alias('node_contacts_0'),
                                            pl.col('1').cast(pl.Int32).alias('node_contacts_1'),
                                            ).drop('0', '1')

train_nodes = train_sum.select('node').unique()
node_ratio = default_contact_ratio.join(train_nodes, on='node').with_columns(
    bayesian_ratio = (BAYESSIAN_C * (pl.col('node_contacts_1').sum() / pl.col('node_contacts_0').sum()) + pl.col('node_contacts_1')) / (BAYESSIAN_C + pl.col('node_contacts_0')),
    bayesian_ratio_C = (pl.col('node_contacts_0').mean() * (pl.col('node_contacts_1').sum() / pl.col('node_contacts_0').sum()) + pl.col('node_contacts_1')) / (pl.col('node_contacts_0').mean() + pl.col('node_contacts_0')),
    smoothed_ratio=(pl.col('node_contacts_1') + SMOOTHED_ALPHA) / (pl.col('node_contacts_0') + SMOOTHED_BETA),
    noisy_contacts_1 = (pl.Series(np.random.normal(0, NOISE_INJECTION, len(default_contact_ratio.join(train_nodes, on='node')))) *\
          pl.col('node_contacts_1').sqrt() + pl.col('node_contacts_1')).clip(0).round(),
).with_columns(
    noisy_bayesian_ratio_C = (pl.col('node_contacts_0').mean() * (pl.col('noisy_contacts_1').sum() / pl.col('node_contacts_0').sum()) + pl.col('noisy_contacts_1')) / (pl.col('node_contacts_0').mean() + pl.col('node_contacts_0')),
    noisy_bayesian_ratio = (BAYESSIAN_C * (pl.col('noisy_contacts_1').sum() / pl.col('node_contacts_0').sum()) + pl.col('noisy_contacts_1')) / (BAYESSIAN_C + pl.col('node_contacts_0')),
    noisy_smoothed_ratio = (pl.col('noisy_contacts_1') + SMOOTHED_ALPHA) / (pl.col('node_contacts_0') + SMOOTHED_BETA),
)

try:
    node_ratio = node_ratio.join(pl.DataFrame(
        {'node':list(enc.item_id_dict.keys()),
        'le_node':list(enc.item_id_dict.values())}),
        on='node')
except Exception:
    node_ratioc = node_ratio.drop('le_node').join(pl.DataFrame(
        {'node':list(enc.item_id_dict.keys()),
        'le_node':list(enc.item_id_dict.values())}),
        on='node')

  default_contact_ratio =(df_train


In [6]:
train_sum = train_sum.join(node_ratio.select('le_node', RATIO_COLUMN).rename({RATIO_COLUMN:'ratio_column'}) , on='le_node')

In [7]:
x_features = ['cnt','is_contact', 'n_days_clicks','n_days_contacts', 'exp_pos', 'exp_time', 'ratio_column']

x_dict = {}
for feature in x_features:
    x_dict[feature] = convert(train_sum, col=feature, enc=enc)

In [9]:
rdlae = RDLAE()
rdlae.fit((X>0).astype(np.float32))

In [10]:
weights = node_ratio.select(RATIO_COLUMN).to_numpy().reshape(-1)
weights = np.log1p(weights).astype(np.float32)

In [None]:
Gt = truncate(rdlae.G.T, k=300) 

In [None]:
recommendations_df = process_batch_w_weight(
    enc_eval_users=enc_eval_users,
    G = (rdlae.G),
    X = ((X>0) + 2*(x_dict['is_contact']>0)).astype(np.float32),
    Gt = Gt,
    features_dict=x_dict,
    weights=(weights).astype(np.float32),
    k=300,
    batch_size=10_000,
    fill_value=-1000 ,
    use_torch = True
)

In [24]:
rdlae_recs = enc.inverse_transform(recommendations_df) 
print('UNSEEN-RECALL@40', recall_at(df_eval, rdlae_recs.with_columns(pl.col('cookie').cast(pl.Int64), pl.col('node').cast(pl.UInt32)), k=40))
print('UNSEEN-RECALL@100', recall_at(df_eval, rdlae_recs.with_columns(pl.col('cookie').cast(pl.Int64), pl.col('node').cast(pl.UInt32)), k=100))

UNSEEN-RECALL@40 0.19937169213163403
UNSEEN-RECALL@100 0.31355099952500437


In [None]:
os.makedirs(os.path.join(PREDICTION_PATH, MODEL_NAME), exist_ok=True)
rdlae_recs.write_parquet(os.path.join(PREDICTION_PATH, MODEL_NAME, f'{MODEL_NAME}_VAL.pq'))

## SasRec Replay

### SasRec over node

In [1]:
import polars as pl
import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
import torch
import os

from replay.metrics import OfflineMetrics, Recall, Precision, MAP, NDCG, HitRate, MRR
from replay.metrics.torch_metrics_builder import metrics_to_df
from replay.splitters import LastNSplitter
from replay.data import (
    FeatureHint,
    FeatureInfo,
    FeatureSchema,
    FeatureSource,
    FeatureType,
    Dataset,
)
from replay.models.nn.optimizer_utils import FatOptimizerFactory
from replay.models.nn.sequential.callbacks import (
    ValidationMetricsCallback,
    SparkPredictionCallback,
    PandasPredictionCallback,
    TorchPredictionCallback,
    QueryEmbeddingsPredictionCallback,
)
from replay.models.nn.sequential.postprocessors import RemoveSeenItems
from replay.data.nn import SequenceTokenizer, SequentialDataset, TensorFeatureSource, TensorSchema, TensorFeatureInfo
from replay.models.nn.sequential import SasRec
from replay.models.nn.sequential.sasrec import (
    SasRecPredictionDataset,
    SasRecTrainingDataset,
    SasRecValidationDataset,
    SasRecPredictionBatch,
    SasRecModel,
)
import pandas as pd
import polars as pl

DATA_DIR = 'data'
VAL_DIR = 'val'


df_clickstream = pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'clickstream.pq'))
df_eval = pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'gt.pq')).join(df_clickstream, on='cookie', how='semi')
df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq'))
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

df_train = df_clickstream
eval_users = df_eval['cookie'].unique().to_list()

n_nodes = 30_000
train_small = df_train.join(df_train.unique(subset=['node', 'cookie']).select('node').group_by('node').len().sort('len').tail(n_nodes).drop('len'),
                        on='node')
df_eval_small = df_eval.join(df_train, on='cookie', how='semi')
df_eval_small = df_eval_small.with_columns(pl.col('node').cast(pl.Int64))

In [2]:
def prepare_feature_schema(is_ground_truth: bool) -> FeatureSchema:
    base_features = FeatureSchema(
        [
            FeatureInfo(
                column="user_id",
                feature_hint=FeatureHint.QUERY_ID,
                feature_type=FeatureType.CATEGORICAL,
            ),
            FeatureInfo(
                column="item_id",
                feature_hint=FeatureHint.ITEM_ID,
                feature_type=FeatureType.CATEGORICAL,
            ),
        ]
    )
    if is_ground_truth:
        return base_features

    all_features = base_features + FeatureSchema(
        [
            FeatureInfo(
                column="timestamp",
                feature_type=FeatureType.NUMERICAL,
                feature_hint=FeatureHint.TIMESTAMP,
            ),
        ]
    )

    return all_features


def filter_n_count(df, column='item_id', n=5):
    to_save = df.group_by(column).agg(pl.count().alias('count')).filter(pl.col('count')>n)
    return df.join(to_save, on=column, how='semi')

def allign_gt(df, gt, item_col='item_id', user_col='user_id'):
    gt = gt.join(df, on=user_col, how='semi')
    gt = gt.join(df, on=item_col, how='semi')
    return gt

def make_replay_format(df, is_gt=False):
    df = df.with_columns(
        [
            pl.col('cookie').alias('user_id').cast(pl.Int64), 
            pl.col('node').alias('item_id').cast(pl.Int64)
        ])
    if not is_gt:
        df = df.with_columns(pl.col("event_date").dt.timestamp("ms").alias("timestamp") // 1000)
    return df

validation_gt = make_replay_format(df_eval_small, is_gt=True)['user_id', 'item_id']
train_events = make_replay_format(train_small)['user_id', 'item_id','timestamp']
item_before = train_events['item_id'].unique().len()
item_after = train_events['item_id'].unique().len()

print(f'Before: {item_before}')
print(f'After: {item_after}')
print(f'Items save {100 * item_after/item_before} %')
validation_gt = allign_gt(train_events, validation_gt)
user_features = train_events[['user_id']].unique()
item_features = train_events[['item_id']].unique()
validation_events = train_events

Before: 30000
After: 30000
Items save 100.0 %


In [None]:
MAX_SEQ_LEN = 100
BATCH_SIZE = 128
NUM_WORKERS = 9
MAX_EPOCHS = 10

train_dataset = Dataset(
    feature_schema=prepare_feature_schema(is_ground_truth=False),
    interactions=train_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)

validation_dataset = Dataset(
    feature_schema=prepare_feature_schema(is_ground_truth=False),
    interactions=validation_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)
validation_gt = Dataset(
    feature_schema=prepare_feature_schema(is_ground_truth=True),
    interactions=validation_gt,
    check_consistency=True,
    categorical_encoded=False,
)

ITEM_FEATURE_NAME = "item_id_seq"

tensor_schema = TensorSchema(
    [
        TensorFeatureInfo(
            name=ITEM_FEATURE_NAME,
            is_seq=True,
            feature_type=FeatureType.CATEGORICAL,
            feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, train_dataset.feature_schema.item_id_column)],
            feature_hint=FeatureHint.ITEM_ID,
        )
    ]
)

tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer.fit(train_dataset)

sequential_train_dataset = tokenizer.transform(train_dataset)

sequential_validation_dataset = tokenizer.transform(validation_dataset)
sequential_validation_gt = tokenizer.transform(validation_gt, [tensor_schema.item_id_feature_name])

sequential_validation_dataset, sequential_validation_gt = SequentialDataset.keep_common_query_ids(
    sequential_validation_dataset, sequential_validation_gt
)

model = SasRec(
    tensor_schema,
    block_count=2,
    head_count=2,
    max_seq_len=MAX_SEQ_LEN,
    hidden_size=128,
    dropout_rate=0.3,
    optimizer_factory=FatOptimizerFactory(learning_rate=0.001),
    loss_sample_count=4_000,
    negatives_sharing=True
)

csv_logger = CSVLogger(save_dir=".logs/train", name="SASRec")

checkpoint_callback = ModelCheckpoint(
    dirpath=".checkpoints/sasrec_replay",
    save_top_k=1,
    verbose=True,
    monitor="recall@40",
    mode="max",
)

train_dataloader = DataLoader(
    dataset=SasRecTrainingDataset(
        sequential_train_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_dataloader = DataLoader(
    dataset=SasRecValidationDataset(
        sequential_validation_dataset,
        sequential_validation_gt,
        sequential_train_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_metrics_callback = ValidationMetricsCallback(
    metrics=["recall", 
             # "ndcg", "map", "coverage"
             ],
    ks=[20, 40, 100],
    item_count=train_dataset.item_count,
    postprocessors=[RemoveSeenItems(sequential_validation_dataset)],
)

trainer = L.Trainer(
    max_epochs=MAX_EPOCHS,
    callbacks=[checkpoint_callback, validation_metrics_callback],
    logger=csv_logger,
    accelerator='cuda',
)

trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=validation_dataloader,
)

  dataset=SasRecTrainingDataset(
  self._inner = TorchSequentialDataset(
  dataset=SasRecValidationDataset(
  self._inner = TorchSequentialValidationDataset(
  self._inner = TorchSequentialDataset(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory C:\code\avito_hack_clear\.checkpoints\sasrec_replay exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type             | Params | Mode 
--------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 989: 'recall@40' reached 0.14474 (best 0.14474), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_replay\\epoch=0-step=989.ckpt' as top 1


k            100        20        40
recall  0.234696  0.097424  0.144743



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 1978: 'recall@40' reached 0.15222 (best 0.15222), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_replay\\epoch=1-step=1978.ckpt' as top 1


k            100        20        40
recall  0.246902  0.101545  0.152224



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 2967: 'recall@40' reached 0.15333 (best 0.15333), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_replay\\epoch=2-step=2967.ckpt' as top 1


k            100        20        40
recall  0.248557  0.102813  0.153327



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 3956: 'recall@40' reached 0.15561 (best 0.15561), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_replay\\epoch=3-step=3956-v1.ckpt' as top 1


k            100       20       40
recall  0.254591  0.10464  0.15561



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 4945: 'recall@40' reached 0.15818 (best 0.15818), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_replay\\epoch=4-step=4945.ckpt' as top 1


k            100        20        40
recall  0.258511  0.104967  0.158176



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 5934: 'recall@40' reached 0.16291 (best 0.16291), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_replay\\epoch=5-step=5934.ckpt' as top 1


k            100        20        40
recall  0.266703  0.108921  0.162912



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 6923: 'recall@40' was not in top 1


k           100        20        40
recall  0.26048  0.104784  0.158601



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 7912: 'recall@40' was not in top 1


k            100        20        40
recall  0.263805  0.107246  0.161621



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 8901: 'recall@40' was not in top 1


k            100       20        40
recall  0.264808  0.10683  0.161539



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 9890: 'recall@40' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


k            100        20        40
recall  0.261147  0.105352  0.159423



In [10]:
best_model = SasRec.load_from_checkpoint(checkpoint_callback.best_model_path).eval()

PREDICTION_PATH = 'predictions'
MODEL_NAME = 'SASREC_REPLAY_NODE'

prediction_dataloader = DataLoader(
    dataset=SasRecPredictionDataset(
        sequential_validation_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

csv_logger = CSVLogger(save_dir=".logs/test", name="SASRec")

TOPK = [300]

postprocessors = [RemoveSeenItems(sequential_validation_dataset)]

pandas_prediction_callback = PandasPredictionCallback(
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)


trainer = L.Trainer(
    callbacks=[
        pandas_prediction_callback,
    ],
    logger=csv_logger,
    inference_mode=True,
)
trainer.predict(best_model, dataloaders=prediction_dataloader, return_predictions=False)

pandas_res = pandas_prediction_callback.get_result()
recommendations = tokenizer.query_and_item_id_encoder.inverse_transform(pandas_res)
recommendations = pl.from_pandas(recommendations)

os.makedirs(os.path.join(PREDICTION_PATH, MODEL_NAME), exist_ok=True)
recommendations.rename({'user_id':'cookie', 'item_id':'node'}).write_parquet(os.path.join(PREDICTION_PATH, MODEL_NAME, 'SASREC_NODE_REPLAY_VAL.pq'))

  dataset=SasRecPredictionDataset(
  self._inner = TorchSequentialDataset(
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.


Predicting: |          | 0/? [00:00<?, ?it/s]

### SasRec Replay over category

In [1]:
import polars as pl
import lightning as L
import os
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
import torch

from replay.metrics import OfflineMetrics, Recall, Precision, MAP, NDCG, HitRate, MRR
from replay.metrics.torch_metrics_builder import metrics_to_df
from replay.splitters import LastNSplitter
from replay.data import (
    FeatureHint,
    FeatureInfo,
    FeatureSchema,
    FeatureSource,
    FeatureType,
    Dataset,
)
from replay.models.nn.optimizer_utils import FatOptimizerFactory
from replay.models.nn.sequential.callbacks import (
    ValidationMetricsCallback,
    SparkPredictionCallback,
    PandasPredictionCallback,
    TorchPredictionCallback,
    QueryEmbeddingsPredictionCallback,
)
from replay.models.nn.sequential.postprocessors import RemoveSeenItems
from replay.data.nn import SequenceTokenizer, SequentialDataset, TensorFeatureSource, TensorSchema, TensorFeatureInfo
from replay.models.nn.sequential import SasRec
from replay.models.nn.sequential.sasrec import (
    SasRecPredictionDataset,
    SasRecTrainingDataset,
    SasRecValidationDataset,
    SasRecPredictionBatch,
    SasRecModel,
)
import pandas as pd
import polars as pl

DATA_DIR = 'data'
VAL_DIR = 'val'


df_clickstream = pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'clickstream.pq'))
df_eval = pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'gt.pq')).join(df_clickstream, on='cookie', how='semi')
df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq'))
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

df_train = df_clickstream
eval_users = df_eval['cookie'].unique().to_list()

n_nodes = 30_000
train_small = df_train.join(df_train.unique(subset=['node', 'cookie']).select('node').group_by('node').len().sort('len').tail(n_nodes).drop('len'),
                        on='node')
df_eval_small = df_eval.join(df_train, on='cookie', how='semi')
df_eval_small = df_eval_small.with_columns(pl.col('node').cast(pl.Int64))

In [2]:
def prepare_feature_schema(is_ground_truth: bool) -> FeatureSchema:
    base_features = FeatureSchema(
        [
            FeatureInfo(
                column="user_id",
                feature_hint=FeatureHint.QUERY_ID,
                feature_type=FeatureType.CATEGORICAL,
            ),
            FeatureInfo(
                column="item_id",
                feature_hint=FeatureHint.ITEM_ID,
                feature_type=FeatureType.CATEGORICAL,
            ),
        ]
    )
    if is_ground_truth:
        return base_features

    all_features = base_features + FeatureSchema(
        [
            FeatureInfo(
                column="timestamp",
                feature_type=FeatureType.NUMERICAL,
                feature_hint=FeatureHint.TIMESTAMP,
            ),
        ]
    )

    return all_features


def filter_n_count(df, column='item_id', n=5):
    to_save = df.group_by(column).agg(pl.count().alias('count')).filter(pl.col('count')>n)
    return df.join(to_save, on=column, how='semi')

def allign_gt(df, gt, item_col='item_id', user_col='user_id'):
    gt = gt.join(df, on=user_col, how='semi')
    gt = gt.join(df, on=item_col, how='semi')
    return gt

def make_replay_format(df, is_gt=False):
    df = df.with_columns(
        [
            pl.col('cookie').alias('user_id').cast(pl.Int64), 
            pl.col('category').alias('item_id').cast(pl.Int64)
        ])
    if not is_gt:
        df = df.with_columns(pl.col("event_date").dt.timestamp("ms").alias("timestamp") // 1000)
    return df

validation_gt = make_replay_format(df_eval_small.join(df_cat_features.with_columns(pl.col('node').cast(pl.Int64)).drop_nulls().select('node', 'category').unique(), on='node', how='left'), is_gt=True)['user_id', 'item_id'].unique()
train_events = make_replay_format(train_small.join(df_cat_features.drop_nulls().select('node', 'category').unique(), on='node', how='left'))['user_id', 'item_id','timestamp']
item_before = train_events['item_id'].unique().len()
item_after = train_events['item_id'].unique().len()

print(f'Before: {item_before}')
print(f'After: {item_after}')
print(f'Items save {100 * item_after/item_before} %')
validation_gt = allign_gt(train_events, validation_gt)
item_features = train_events[['item_id']].unique()
validation_events = train_events

user_features = make_replay_format(df_clickstream.join(df_cat_features.drop_nulls().select('node', 'category').unique(), on='node', how='left'))[['user_id']].unique()

Before: 51
After: 51
Items save 100.0 %


In [3]:
MAX_SEQ_LEN = 100
BATCH_SIZE = 512
NUM_WORKERS = 9
MAX_EPOCHS = 5

train_dataset = Dataset(
    feature_schema=prepare_feature_schema(is_ground_truth=False),
    interactions=train_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)

validation_dataset = Dataset(
    feature_schema=prepare_feature_schema(is_ground_truth=False),
    interactions=validation_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)
validation_gt = Dataset(
    feature_schema=prepare_feature_schema(is_ground_truth=True),
    interactions=validation_gt,
    check_consistency=True,
    categorical_encoded=False,
)

ITEM_FEATURE_NAME = "item_id_seq"

tensor_schema = TensorSchema(
    [
        TensorFeatureInfo(
            name=ITEM_FEATURE_NAME,
            is_seq=True,
            feature_type=FeatureType.CATEGORICAL,
            feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, train_dataset.feature_schema.item_id_column)],
            feature_hint=FeatureHint.ITEM_ID,
        )
    ]
)

tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer.fit(train_dataset)

sequential_train_dataset = tokenizer.transform(train_dataset)

sequential_validation_dataset = tokenizer.transform(validation_dataset)
sequential_validation_gt = tokenizer.transform(validation_gt, [tensor_schema.item_id_feature_name])

sequential_validation_dataset, sequential_validation_gt = SequentialDataset.keep_common_query_ids(
    sequential_validation_dataset, sequential_validation_gt
)

model = SasRec(
    tensor_schema,
    block_count=2,
    head_count=2,
    max_seq_len=MAX_SEQ_LEN,
    hidden_size=128,
    dropout_rate=0.3,
    optimizer_factory=FatOptimizerFactory(learning_rate=0.001),
    loss_sample_count=50,
    negatives_sharing=True
)

csv_logger = CSVLogger(save_dir=".logs/train", name="SASRec_category")

checkpoint_callback = ModelCheckpoint(
    dirpath=".checkpoints/sasrec_category",
    save_top_k=1,
    verbose=True,
    monitor="ndcg@5",
    mode="max",
)

train_dataloader = DataLoader(
    dataset=SasRecTrainingDataset(
        sequential_train_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_dataloader = DataLoader(
    dataset=SasRecValidationDataset(
        sequential_validation_dataset,
        sequential_validation_gt,
        sequential_train_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_metrics_callback = ValidationMetricsCallback(
    metrics=["recall", 'map', 'ndcg'],
    ks=[1, 5, 10],
    item_count=train_dataset.item_count,
    postprocessors=[],
)

trainer = L.Trainer(
    max_epochs=MAX_EPOCHS,
    callbacks=[checkpoint_callback, validation_metrics_callback],
    logger=csv_logger,
    accelerator='cuda',
)

trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=validation_dataloader,
)

  dataset=SasRecTrainingDataset(
  self._inner = TorchSequentialDataset(
  dataset=SasRecValidationDataset(
  self._inner = TorchSequentialValidationDataset(
  self._inner = TorchSequentialDataset(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory C:\code\avito_hack_clear\.checkpoints\sasrec_category exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type             | Params | Mode 
------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 248: 'ndcg@5' reached 0.46160 (best 0.46160), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_category\\epoch=0-step=248-v1.ckpt' as top 1


k              1        10         5
map     0.298366  0.411104  0.381566
ndcg    0.298366  0.518872  0.461601
recall  0.212957  0.765572  0.609367



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 496: 'ndcg@5' reached 0.46626 (best 0.46626), saving model to 'C:\\code\\avito_hack_clear\\.checkpoints\\sasrec_category\\epoch=1-step=496.ckpt' as top 1


k              1        10         5
map     0.304509  0.415716  0.386477
ndcg    0.304509  0.522943  0.466262
recall  0.217589  0.767836  0.613223



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 744: 'ndcg@5' was not in top 1


k              1        10         5
map     0.304745  0.415012  0.385685
ndcg    0.304745  0.522484  0.465802
recall  0.217629  0.768077  0.613735



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 992: 'ndcg@5' was not in top 1


k              1        10         5
map     0.301401  0.412559  0.382953
ndcg    0.301401  0.520390  0.463110
recall  0.214911  0.767287  0.611289



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 1240: 'ndcg@5' was not in top 1
`Trainer.fit` stopped: `max_epochs=5` reached.


k              1        10         5
map     0.298312  0.410138  0.380102
ndcg    0.298312  0.518359  0.460306
recall  0.212801  0.766803  0.608767



In [None]:
best_model = SasRec.load_from_checkpoint(checkpoint_callback.best_model_path).eval()

PREDICTION_PATH = 'predictions'
MODEL_NAME = 'SASREC_REPLAY_CATEGORY'

prediction_dataloader = DataLoader(
    dataset=SasRecPredictionDataset(
        sequential_validation_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

csv_logger = CSVLogger(save_dir=".logs/test", name="SASRec_category")

TOPK = [50]

postprocessors = []

pandas_prediction_callback = PandasPredictionCallback(
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)


trainer = L.Trainer(
    callbacks=[
        pandas_prediction_callback,
    ],
    logger=csv_logger,
    inference_mode=True,
)
trainer.predict(best_model, dataloaders=prediction_dataloader, return_predictions=False)

pandas_res = pandas_prediction_callback.get_result()
recommendations = tokenizer.query_and_item_id_encoder.inverse_transform(pandas_res)
recommendations = pl.from_pandas(recommendations)

os.makedirs(os.path.join(PREDICTION_PATH, MODEL_NAME), exist_ok=True)
recommendations.rename({'user_id':'cookie', 'item_id':'category'}).write_parquet(os.path.join(PREDICTION_PATH, MODEL_NAME, 'SASREC_CATEGORY_REPLAY_VAL.pq'))

  dataset=SasRecPredictionDataset(
  self._inner = TorchSequentialDataset(
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Grig\miniconda3\envs\my_env_py3.9\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.


Predicting: |          | 0/? [00:00<?, ?it/s]

# Features

In [16]:
import polars as pl
import os
from utils import cast_dtypes, get_dataset
DATA_DIR = 'data'
VAL_DIR = 'val'
TRAIN_DIR = 'train'
df_clickstream = pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'clickstream.pq'))

In [2]:
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))
df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq'))

def make_user_user_category(df_clickstream):
    user_features = (df_clickstream
    .join(df_event, on='event')
    .select('cookie', 'is_contact').with_columns(value=pl.lit(1)).pivot(
        values="value",
        index="cookie",
        columns="is_contact",
        aggregate_function="sum",
    ).fill_null(0)).with_columns(pl.col('0').alias('is_contact_0'),
                                pl.col('1').alias('is_contact_1'),
                                contact_ratio = (pl.col('1') / pl.col('0'))
                                ).drop('0', '1').with_columns(
                                pl.when(pl.col("contact_ratio").is_infinite())
                                .then(0)
                                .otherwise(pl.col("contact_ratio"))
                                .alias("contact_ratio")
                            )

    user_last_category_contact = (df_clickstream
                                .join(df_event, on='event')
                                .join(df_cat_features.select('item', 'category'), on='item')
                                .filter(pl.col('is_contact')>0)
                                .sort('cookie', 'event_date')
                                .unique(subset=['cookie'], keep='last')
                                .select('cookie', pl.col('category').alias('last_contact_category'))
    )

    user_last_category_seen = (df_clickstream
                                .join(df_event, on='event')
                                .join(df_cat_features.select('item', 'category'), on='item')
                                .sort('cookie', 'event_date')
                                .unique(subset=['cookie'], keep='last')
                                .select('cookie', pl.col('category').alias('last_seen_category'))
    )
    
    user_features = user_features.join(user_last_category_contact, on='cookie', how='left').join(user_last_category_seen, on='cookie', how='left')

    user_category_total = (df_clickstream
                        .join(df_event, on='event')
                        .join(df_cat_features.select('item', 'category'), on='item', how='left')
                        .group_by('cookie', 'category')
                        .agg(pl.len().alias('user_total_category_count'))
    )

    user_category_pos = (df_clickstream
                        .join(df_event, on='event')
                        .filter(pl.col('is_contact')==1)
                        .join(df_cat_features.select('item', 'category'), on='item', how='left')
                        .group_by('cookie', 'category')
                        .agg(pl.len().alias('user_pos_category_count'))
    )

    user_category = user_category_total.join(user_category_pos, on=['cookie', 'category'], how='left').fill_null(0)
    user_category = user_category.join(df_clickstream.join(df_event, on='event', how='left').group_by('cookie').agg(pl.len().alias('user_total_rows'), pl.sum('is_contact').alias('total_contacts')), on='cookie', how='left')
    user_category = user_category.with_columns(
        (pl.col('user_pos_category_count') / pl.col('user_total_category_count')).alias('user_category_contact_ratio'),
        (pl.col('user_pos_category_count') / pl.col('total_contacts')).alias('ratio_of_this_category_in_all_user_contact'),
        (pl.col('user_total_category_count') / pl.col('user_total_rows')).alias('ratio_user_category_click'))

    return user_features, user_category

def make_item_features(df_clickstream):
    df_clickstream = df_clickstream.drop_nulls()
    item_features = (df_clickstream
    .join(df_event, on='event')
    .select('node', 'is_contact')
    .with_columns(value=pl.lit(1))
    .pivot(
        values="value",
        index="node",
        columns="is_contact",
        aggregate_function="sum",
    ).fill_null(0)
    .with_columns(pl.col('0').alias('node_contact_0'),
                pl.col('1').alias('node_contact_1'),
                contact_ratio = (pl.col('1') / pl.col('0'))
                )
    .drop('0', '1')
    .with_columns(
                pl.when(pl.col("contact_ratio").is_infinite())
                .then(0)
                .otherwise(pl.col("contact_ratio"))
                .alias("node_contact_ratio")
            )
    .drop('contact_ratio')
    )

    item_features_unique = (df_clickstream
    .join(df_event, on='event')
    .select('cookie', 'node', 'is_contact', 'event_date').sort('event_date')
    .unique(subset=['cookie', 'node', 'is_contact'], keep='last')
    .select('node', 'is_contact').with_columns(value=pl.lit(1))
    .pivot(
        values="value",
        index="node",
        columns="is_contact",
        aggregate_function="sum",
    ).fill_null(0)).with_columns(pl.col('0').alias('node_contacts_0_last'),
                                pl.col('1').alias('node_contacts_1_last'),
                                contact_ratio = (pl.col('1') / pl.col('0'))
                                ).drop('0', '1').with_columns(
                                pl.when(pl.col("contact_ratio").is_infinite())
                                .then(0)
                                .otherwise(pl.col("contact_ratio"))
                                .alias("node_contact_ratio_last")
                            ).drop('contact_ratio')

    item_features_category = (df_clickstream
    .join(df_event, on='event')
    .join(df_cat_features.select('item', 'category').unique(), on='item', how='left')
    .drop_nulls()
    .select('category', 'is_contact').with_columns(value=pl.lit(1)).pivot(
        values="value",
        index="category",
        columns="is_contact",
        aggregate_function="sum",
    ).fill_null(0)).with_columns(pl.col('0').alias('category_contact_0'),
                                pl.col('1').alias('category_contact_1'),
                                contact_ratio = (pl.col('1') / pl.col('0'))
                                ).drop('0', '1').with_columns(
                                pl.when(pl.col("contact_ratio").is_infinite())
                                .then(0)
                                .otherwise(pl.col("contact_ratio"))
                                .alias("category_contact_ratio")
                            ).drop('contact_ratio')
    
    item_features_category_unique = (df_clickstream
    .join(df_event, on='event')
    .join(df_cat_features.select('item', 'category').unique(), on='item', how='left')
    .unique(subset=['cookie', 'category', 'is_contact'], keep='last')
    .drop_nulls()
    .select('category', 'is_contact').with_columns(value=pl.lit(1)).pivot(
        values="value",
        index="category",
        columns="is_contact",
        aggregate_function="sum",
    ).fill_null(0)).with_columns(pl.col('0').alias('category_contact_0_last'),
                                pl.col('1').alias('category_contact_1_last'),
                                contact_ratio = (pl.col('1') / pl.col('0'))
                                ).drop('0', '1').with_columns(
                                pl.when(pl.col("contact_ratio").is_infinite())
                                .then(0)
                                .otherwise(pl.col("contact_ratio"))
                                .alias("category_contact_ratio_last")
                            ).drop('contact_ratio')

    item_features = item_features.join(item_features_unique, on='node', how='left')
    item_features = item_features.join(df_cat_features.drop_nulls().select('node', 'category').unique(), on='node').join(item_features_category, on='category', how='left').join(item_features_category_unique, on='category', how='left')

    return item_features

In [3]:
most_pop_location = (
        df_clickstream.drop('platform', 'surface')
        .join(df_cat_features.select('item','location', 'category'), how='left', on='item')
        .group_by(['cookie', 'location'])
        .agg(pl.len().alias('len'))
        .sort('len', descending=True)
        .unique(subset=['cookie'], keep='first')
        .sort('cookie')
        .drop('len')
)

item_location = df_cat_features.group_by('node', 'location').agg(pl.len().alias('location_in_node_count'))

In [4]:
user_features, user_category = make_user_user_category(df_clickstream)
item_features = make_item_features(df_clickstream)

  user_features = (df_clickstream
  item_features = (df_clickstream
  item_features_unique = (df_clickstream
  item_features_category = (df_clickstream
  item_features_category_unique = (df_clickstream


In [None]:
sasrec_scores = (pl.read_parquet(os.path.join('predictions', 'SASREC_REPLAY_NODE', 'SASREC_NODE_REPLAY_VAL.pq'))
                .with_columns( pl.col('score').rank(descending=True).over('cookie').alias('sasrec_rank'), 
                               pl.col('score').alias('sasrec_score')).drop('score')
)

sasrec_scores_category = (pl.read_parquet(os.path.join('predictions', 'SASREC_REPLAY_CATEGORY', 'SASREC_CATEGORY_REPLAY_VAL.pq'))
                        .with_columns(pl.col('score').rank(descending=True).over('cookie').alias('sasrec_category_rank'), 
                                        pl.col('score').alias('sasrec_category_score'))
                        .drop('score')
)

ease_preds = (pl
              .read_parquet(os.path.join('predictions', 'EASE_DAN', 'EASE_DAN_val.pq'))
              .select(pl.col('rank_rd').alias('ease_rank'), pl.col('score').alias('ease_score'), 'node', 'cookie')
)

In [9]:
recs = pl.read_parquet(os.path.join('predictions', 'RDLAE', 'RDLAE_VAL.pq'))

In [10]:
tmp = cast_dtypes(recs.with_columns(pl.col('score').rank(descending=True).over('cookie').alias('rank_rd')))
tmp = tmp.join(cast_dtypes(most_pop_location), how='left', on=['cookie'])
tmp = tmp.join(cast_dtypes(sasrec_scores), on=['cookie', 'node'], how='left')
tmp = tmp.join(cast_dtypes(ease_preds), on=['cookie', 'node'], how='left')
tmp = tmp.join(cast_dtypes(item_location), how='left', on=['node', 'location'])
tmp = tmp.join(cast_dtypes(user_features), how='left', on=['cookie'])
tmp = tmp.join(cast_dtypes(item_features), how='left', on=['node'])
tmp = tmp.join(cast_dtypes(user_category), how='left', on=['cookie', 'category'])
tmp = tmp.join(cast_dtypes(sasrec_scores_category), on=['cookie', 'category'], how='left')
tmp = tmp.fill_null(0).with_columns(
    (pl.col('category') == pl.col('last_contact_category')).alias('category_same_as_last_contact').cast(pl.Int16), 
    (pl.col('category') == pl.col('last_seen_category')).alias('category_same_as_last_seen').cast(pl.Int16),
    (pl.col('node_contact_ratio') * pl.col('contact_ratio')).alias('user_node_ratio_prob'),
    (pl.col('category_contact_ratio') * pl.col('contact_ratio')).alias('user_category_ratio_prob')
)

In [17]:
os.makedirs(os.path.join(DATA_DIR, TRAIN_DIR), exist_ok=True)

In [18]:
tmp.write_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'full_train.pq'))

In [20]:
gt = pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'gt.pq'))
sampled_train = get_dataset(tmp, df_eval=cast_dtypes(gt))
sampled_train.write_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'sampled_train.pq'))

In [None]:
nodes_to_inference = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, 'clickstream.pq')).group_by('node').len().sort('len').tail(30_000))

node_in_inference = cast_dtypes(sampled_train.join(nodes_to_inference, how='semi', on='node'))
node_missed_inference = cast_dtypes(sampled_train.join(nodes_to_inference, how='anti', on='node').with_columns(pl.lit(None).alias('node').cast(pl.UInt32))) # to not train on them

sampled_train_masked = pl.concat([node_in_inference, node_missed_inference])
sampled_train_masked.write_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'sampled_train_node_masked.pq'))

# Ranker training LGBM

In [1]:
import os
import multiprocessing as mp
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from optuna import Trial, create_study
from optuna.samplers import TPESampler
from numpy import array, nan, random as np_rnd, where
from sklearn.model_selection import train_test_split
from utils import recall_at, cast_dtypes

DATA_DIR = 'data'
TRAIN_DIR = 'train'
VAL_DIR = 'val'
OPTUNA_LOGS_DIR_LGBM = '.logs/ranker/train/LGBM/'
N_TRIALS_LGBM = 2 # to check that works
train = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'sampled_train_node_masked.pq')))
all_train = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'full_train.pq')))
df_eval_val = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'gt.pq')))

In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import average_precision_score, roc_auc_score
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import polars as pl

kfold = KFold(n_splits=5)
groups = train['cookie']

cat_features = ['location', 'category', 'node', 'last_contact_category', 'last_seen_category', 'category_same_as_last_contact', 'category_same_as_last_seen'] #'last_contact_category', 'last_seen_category', 'category_same_as_last_contact', 'category_same_as_last_seen']

def objective(trial, cat_features, train_x, train_y, train_groups, valid):

        params = {
            'num_iterations': trial.suggest_int('num_iterations', 20, 500),
            'application': trial.suggest_categorical('application', ['lambdarank', 'rank_xendcg']), 
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.5),
            'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 1.0),
            'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 1.0),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 0, 5),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'seed': 42,
            'device': 'CPU',
            'verbosity': -1,
            'n_jobs': 10,
        }
        cat_features_use = set(cat_features)
        train_pool = lgb.Dataset(data=train_x,
             label=train_y, group=train_groups, categorical_feature=list(cat_features_use))

        ranker = lgb.train(params, 
                           train_pool)
        preds = ranker.predict(valid[train_x.columns],
                               num_iteration=ranker.best_iteration)
        valid['pred'] = preds

        prediction = pl.DataFrame(valid[['score', 'node', 'cookie', 'pred']])
        boost_recall = recall_at(df_eval_val['cookie', 'node'].unique().join(prediction, on='cookie', how='semi'), prediction.drop('score').rename({'pred':'score'}), k=40)
    
        return boost_recall

for fold_num, (train_u, valid_u) in enumerate(kfold.split(df_eval_val.select('cookie').unique())):
    train_u = pl.DataFrame({'cookie':train_u}).with_columns(pl.col('cookie').cast(pl.Int64))
    valid_u = pl.DataFrame({'cookie':valid_u}).with_columns(pl.col('cookie').cast(pl.Int64))
    print(f'Training fold #{fold_num + 1}')
    
    train_part = train.join(train_u, on='cookie', how='semi')
    valid_part = all_train.filter(pl.col('rank_rd')<301).join(valid_u, on='cookie', how='semi')
    
    train_part, valid_part = train_part.sort(by=['cookie']), valid_part.sort(by=['cookie'])
    train_data, train_label, train_group = train_part.drop(['cookie', 'target']).to_pandas(), train_part['target'].to_numpy().reshape(-1), train_part.group_by('cookie').len().sort('cookie')['len'].to_numpy()
    valid_data = valid_part.to_pandas()
    
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
    study.optimize(lambda trial: objective(trial, cat_features, train_data, train_label, train_group, valid_data), n_trials=N_TRIALS_LGBM, show_progress_bar=True)
    print('---'*10)
    print(f'№{fold_num}:\n{study.best_value}') 
    print(f"Лучшие гиперпараметры: {study.best_params}") 
    print('---'*10)
    os.makedirs(OPTUNA_LOGS_DIR_LGBM, exist_ok=True)
    with open(OPTUNA_LOGS_DIR_LGBM + 'logs.txt', 'a') as the_file:
        the_file.write(f"{fold_num}\n{study.best_params}\n")

Training fold #1


[I 2025-06-07 19:04:30,105] A new study created in memory with name: no-name-a9ea66b3-f96f-491d-b3ce-ca6c0eca0f7a


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-06-07 19:04:32,343] Trial 0 finished with value: 0.2004371654123786 and parameters: {'num_iterations': 210, 'application': 'rank_xendcg', 'learning_rate': 0.11186156011495693, 'num_leaves': 52, 'min_split_gain': 0.29999568707117197, 'lambda_l1': 0.11018816235014328, 'lambda_l2': 0.06024827634852292, 'feature_fraction': 0.34711443076608917, 'bagging_fraction': 0.21597659208548614, 'bagging_freq': 5, 'min_child_samples': 90}. Best is trial 0 with value: 0.2004371654123786.
[I 2025-06-07 19:04:41,587] Trial 1 finished with value: 0.20858958071244202 and parameters: {'num_iterations': 413, 'application': 'rank_xendcg', 'learning_rate': 0.021538490945681114, 'num_leaves': 71, 'min_split_gain': 0.3518466511552321, 'lambda_l1': 0.4655623474987288, 'lambda_l2': 0.9036857606013757, 'feature_fraction': 0.6855027380913005, 'bagging_fraction': 0.3895883496313851, 'bagging_freq': 1, 'min_child_samples': 79}. Best is trial 1 with value: 0.20858958071244202.
------------------------------
№0:

[I 2025-06-07 19:04:41,997] A new study created in memory with name: no-name-3cab56d0-2a83-4b8a-be21-8c38b2c68f3b


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-06-07 19:04:44,051] Trial 0 finished with value: 0.20225114405704414 and parameters: {'num_iterations': 147, 'application': 'rank_xendcg', 'learning_rate': 0.0044055447260074035, 'num_leaves': 96, 'min_split_gain': 0.19166045702191514, 'lambda_l1': 0.2814166614878487, 'lambda_l2': 0.18383393819202742, 'feature_fraction': 0.10576555562503079, 'bagging_fraction': 0.3344178341022963, 'bagging_freq': 1, 'min_child_samples': 14}. Best is trial 0 with value: 0.20225114405704414.
[I 2025-06-07 19:04:49,724] Trial 1 finished with value: 0.14678961505731422 and parameters: {'num_iterations': 99, 'application': 'lambdarank', 'learning_rate': 0.2230683198746363, 'num_leaves': 93, 'min_split_gain': 0.21806007212326106, 'lambda_l1': 0.07197371138873832, 'lambda_l2': 0.21169013939409442, 'feature_fraction': 0.6652381110229788, 'bagging_fraction': 0.38169736916112806, 'bagging_freq': 3, 'min_child_samples': 72}. Best is trial 0 with value: 0.20225114405704414.
------------------------------
№

[I 2025-06-07 19:04:50,112] A new study created in memory with name: no-name-9e457c8d-fcd5-478c-8595-b5a4e3166592


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-06-07 19:05:06,477] Trial 0 finished with value: 0.1873818111726522 and parameters: {'num_iterations': 288, 'application': 'lambdarank', 'learning_rate': 0.0018824990109750559, 'num_leaves': 62, 'min_split_gain': 0.38723196529165654, 'lambda_l1': 0.11784375551822668, 'lambda_l2': 0.9185241226449185, 'feature_fraction': 0.7467860357828763, 'bagging_fraction': 0.22086406038186657, 'bagging_freq': 3, 'min_child_samples': 14}. Best is trial 0 with value: 0.1873818111726522.
[I 2025-06-07 19:05:15,974] Trial 1 finished with value: 0.1926743347860575 and parameters: {'num_iterations': 240, 'application': 'rank_xendcg', 'learning_rate': 0.025360911427982274, 'num_leaves': 73, 'min_split_gain': 0.3049980506032946, 'lambda_l1': 0.3407588424609349, 'lambda_l2': 0.6508531281979258, 'feature_fraction': 0.8969925338164916, 'bagging_fraction': 0.4932323252127493, 'bagging_freq': 2, 'min_child_samples': 50}. Best is trial 1 with value: 0.1926743347860575.
------------------------------
№2:
0.

[I 2025-06-07 19:05:16,359] A new study created in memory with name: no-name-c4fba820-8874-4a55-a4f5-3e004b0447dd


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-06-07 19:05:43,377] Trial 0 finished with value: 0.17278459465378476 and parameters: {'num_iterations': 306, 'application': 'lambdarank', 'learning_rate': 0.06615285206771772, 'num_leaves': 98, 'min_split_gain': 0.17637240531072557, 'lambda_l1': 0.32842891870287716, 'lambda_l2': 0.5803049688087442, 'feature_fraction': 0.862244336967427, 'bagging_fraction': 0.6505528426960532, 'bagging_freq': 5, 'min_child_samples': 64}. Best is trial 0 with value: 0.17278459465378476.
[I 2025-06-07 19:05:44,470] Trial 1 finished with value: 0.20425156971108274 and parameters: {'num_iterations': 21, 'application': 'rank_xendcg', 'learning_rate': 0.006553331792581535, 'num_leaves': 29, 'min_split_gain': 0.20508677111090412, 'lambda_l1': 0.08841801128528293, 'lambda_l2': 0.36505672318774574, 'feature_fraction': 0.6407012136387936, 'bagging_fraction': 0.6391607308461834, 'bagging_freq': 5, 'min_child_samples': 53}. Best is trial 1 with value: 0.20425156971108274.
------------------------------
№3:


[I 2025-06-07 19:05:44,873] A new study created in memory with name: no-name-62736798-7817-41ef-b4a4-b6f2927a0203


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-06-07 19:05:55,456] Trial 0 finished with value: 0.19594622190336755 and parameters: {'num_iterations': 189, 'application': 'lambdarank', 'learning_rate': 0.009252792365499463, 'num_leaves': 41, 'min_split_gain': 0.0942549862027911, 'lambda_l1': 0.9716892736918116, 'lambda_l2': 0.2948104880257191, 'feature_fraction': 0.9843975843457238, 'bagging_fraction': 0.9609009839341035, 'bagging_freq': 4, 'min_child_samples': 19}. Best is trial 0 with value: 0.19594622190336755.
[I 2025-06-07 19:06:00,860] Trial 1 finished with value: 0.19563747862502007 and parameters: {'num_iterations': 375, 'application': 'rank_xendcg', 'learning_rate': 0.1362847295749121, 'num_leaves': 40, 'min_split_gain': 0.403265439548652, 'lambda_l1': 0.6090509367204264, 'lambda_l2': 0.2800898935660521, 'feature_fraction': 0.6480540571026433, 'bagging_fraction': 0.6443148373688176, 'bagging_freq': 4, 'min_child_samples': 43}. Best is trial 0 with value: 0.19594622190336755.
------------------------------
№4:
0.195

# Ranker training catboost

In [1]:
import os
import multiprocessing as mp
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from optuna import Trial, create_study
from optuna.samplers import TPESampler
from numpy import array, nan, random as np_rnd, where
from sklearn.model_selection import train_test_split
from utils import recall_at, cast_dtypes

DATA_DIR = 'data'
TRAIN_DIR = 'train'
VAL_DIR = 'val'
OPTUNA_LOGS_DIR_CBR = '.logs/ranker/train/CBR/'
N_TRIALS_CBR = 2 # to check that works
train = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'sampled_train_node_masked.pq')))
all_train = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, TRAIN_DIR, 'full_train.pq')))
df_eval_val = cast_dtypes(pl.read_parquet(os.path.join(DATA_DIR, VAL_DIR, 'gt.pq')))

In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import average_precision_score, roc_auc_score
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool, CatBoostError
import optuna
import polars as pl

kfold = KFold(n_splits=5)
groups = train['cookie']

cat_features = ['location', 'category', 'node', 'last_contact_category', 'last_seen_category', 'category_same_as_last_contact', 'category_same_as_last_seen'] #'last_contact_category', 'last_seen_category', 'category_same_as_last_contact', 'category_same_as_last_seen']

def objective(trial, cat_features, train_x, train_y, train_groups, valid):

        params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3), 
            'depth': trial.suggest_int('depth', 4, 8),                 
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 5), 
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bernoulli', 'No']),
            'random_strength': trial.suggest_float('random_strength', 0.5, 2.0),
            'loss_function': 'YetiRank',  
            'task_type': 'GPU',                               
            'devices': '0',                                              
            'verbose': False                                      
        }
        valid_x = valid.copy()
        cat_features_use = list(set(cat_features))
        train_pool = Pool(data=train_x, label=train_y, group_id=train_groups, cat_features=cat_features_use)
        ranker = CatBoostRanker(**params)
        ranker.fit(train_pool)
        preds = ranker.predict(valid_x[train_x.columns])
        valid_x['pred'] = preds

        prediction = pl.DataFrame(valid_x[['score', 'node', 'cookie', 'pred']])
        boost_recall = recall_at(df_eval_val['cookie', 'node'].unique().join(prediction, on='cookie', how='semi'), prediction.drop('score').rename({'pred':'score'}), k=40)
       
        return boost_recall

for fold_num, (train_u, valid_u) in enumerate(kfold.split(df_eval_val.select('cookie').unique())):
    train_u = pl.DataFrame({'cookie':train_u}).with_columns(pl.col('cookie').cast(pl.Int64))
    valid_u = pl.DataFrame({'cookie':valid_u}).with_columns(pl.col('cookie').cast(pl.Int64))
    print(f'Training fold #{fold_num + 1}')
    
    train_part = train.join(train_u, on='cookie', how='semi').fill_null(0)
    valid_part = all_train.filter(pl.col('rank_rd')<301).join(valid_u, on='cookie', how='semi')
    
    train_part, valid_part = train_part.sort(by=['cookie']), valid_part.sort(by=['cookie'])
    train_data, train_label, train_group = train_part.drop(['cookie', 'target']).to_pandas(), train_part['target'].to_numpy().reshape(-1), train_part['cookie'].to_numpy()
    valid_data = valid_part.to_pandas()
    
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
    study.optimize(lambda trial: objective(trial, cat_features, train_data, train_label, train_group, valid_data), n_trials=N_TRIALS_CBR, show_progress_bar=True)
    print('---'*10)
    print(f'№{fold_num}:\n{study.best_value}') 
    print(f"Лучшие гиперпараметры: {study.best_params}") 
    print('---'*10)
    os.makedirs(OPTUNA_LOGS_DIR_CBR, exist_ok=True)
    with open(OPTUNA_LOGS_DIR_CBR + 'logs.txt', 'a') as the_file:
        the_file.write(f"{fold_num}\n{study.best_params}\n")

Training fold #1


[I 2025-06-07 19:12:16,515] A new study created in memory with name: no-name-aab86b70-5f90-43c6-9ab9-d846f99d7f9c


  0%|          | 0/2 [00:00<?, ?it/s]

Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:12:50,974] Trial 0 finished with value: 0.21906565577917425 and parameters: {'iterations': 757, 'learning_rate': 0.24786018076230062, 'depth': 8, 'l2_leaf_reg': 4.1976262580382, 'bootstrap_type': 'No', 'random_strength': 0.7283477684968298}. Best is trial 0 with value: 0.21906565577917425.


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:13:13,709] Trial 1 finished with value: 0.21868159140302393 and parameters: {'iterations': 677, 'learning_rate': 0.09234405126308844, 'depth': 6, 'l2_leaf_reg': 2.315685472913124, 'bootstrap_type': 'No', 'random_strength': 1.3335452305004367}. Best is trial 0 with value: 0.21906565577917425.
------------------------------
№0:
0.21906565577917425
Лучшие гиперпараметры: {'iterations': 757, 'learning_rate': 0.24786018076230062, 'depth': 8, 'l2_leaf_reg': 4.1976262580382, 'bootstrap_type': 'No', 'random_strength': 0.7283477684968298}
------------------------------
Training fold #2


[I 2025-06-07 19:13:14,178] A new study created in memory with name: no-name-07c455cc-93c2-410a-a970-6181c11be419


  0%|          | 0/2 [00:00<?, ?it/s]

Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:13:32,591] Trial 0 finished with value: 0.2111534140255227 and parameters: {'iterations': 351, 'learning_rate': 0.2232014333731204, 'depth': 8, 'l2_leaf_reg': 2.7417234889530593, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.0638215123177908}. Best is trial 0 with value: 0.2111534140255227.


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:13:37,321] Trial 1 finished with value: 0.20839897763009796 and parameters: {'iterations': 108, 'learning_rate': 0.21155779689598522, 'depth': 5, 'l2_leaf_reg': 4.6885112316828, 'bootstrap_type': 'No', 'random_strength': 0.9864033320056874}. Best is trial 0 with value: 0.2111534140255227.
------------------------------
№1:
0.2111534140255227
Лучшие гиперпараметры: {'iterations': 351, 'learning_rate': 0.2232014333731204, 'depth': 8, 'l2_leaf_reg': 2.7417234889530593, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.0638215123177908}
------------------------------
Training fold #3


[I 2025-06-07 19:13:37,755] A new study created in memory with name: no-name-97dca122-6f8e-4a7d-a2e7-0bdd8abe9337


  0%|          | 0/2 [00:00<?, ?it/s]

Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:14:17,417] Trial 0 finished with value: 0.20503230175029596 and parameters: {'iterations': 905, 'learning_rate': 0.21430922888861717, 'depth': 8, 'l2_leaf_reg': 3.5278359932756085, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.114136727526179}. Best is trial 0 with value: 0.20503230175029596.


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:14:42,141] Trial 1 finished with value: 0.20260510317018177 and parameters: {'iterations': 780, 'learning_rate': 0.021594506698682375, 'depth': 5, 'l2_leaf_reg': 4.2429177244862455, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.7728296434363517}. Best is trial 0 with value: 0.20503230175029596.
------------------------------
№2:
0.20503230175029596
Лучшие гиперпараметры: {'iterations': 905, 'learning_rate': 0.21430922888861717, 'depth': 8, 'l2_leaf_reg': 3.5278359932756085, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.114136727526179}
------------------------------
Training fold #4


[I 2025-06-07 19:14:42,912] A new study created in memory with name: no-name-bf953cc5-688a-49be-adba-3747b3281030


  0%|          | 0/2 [00:00<?, ?it/s]

Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:14:55,259] Trial 0 finished with value: 0.20688562475577665 and parameters: {'iterations': 249, 'learning_rate': 0.0951981301681507, 'depth': 6, 'l2_leaf_reg': 3.7965209010349943, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.368337209693884}. Best is trial 0 with value: 0.20688562475577665.


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:15:19,601] Trial 1 finished with value: 0.21481875112307902 and parameters: {'iterations': 500, 'learning_rate': 0.25724055582778155, 'depth': 8, 'l2_leaf_reg': 4.709921291950293, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.0393258235985976}. Best is trial 1 with value: 0.21481875112307902.
------------------------------
№3:
0.21481875112307902
Лучшие гиперпараметры: {'iterations': 500, 'learning_rate': 0.25724055582778155, 'depth': 8, 'l2_leaf_reg': 4.709921291950293, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.0393258235985976}
------------------------------
Training fold #5


[I 2025-06-07 19:15:20,009] A new study created in memory with name: no-name-6a023689-43e7-4413-a6e7-9f491be5788e


  0%|          | 0/2 [00:00<?, ?it/s]

Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:16:13,535] Trial 0 finished with value: 0.2170823923088576 and parameters: {'iterations': 655, 'learning_rate': 0.2555301866365448, 'depth': 8, 'l2_leaf_reg': 3.6698045378125252, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.526537149330717}. Best is trial 0 with value: 0.2170823923088576.


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


[I 2025-06-07 19:16:45,780] Trial 1 finished with value: 0.21545684500453935 and parameters: {'iterations': 737, 'learning_rate': 0.05123550912801964, 'depth': 8, 'l2_leaf_reg': 4.229441398169865, 'bootstrap_type': 'No', 'random_strength': 0.6209315256543415}. Best is trial 0 with value: 0.2170823923088576.
------------------------------
№4:
0.2170823923088576
Лучшие гиперпараметры: {'iterations': 655, 'learning_rate': 0.2555301866365448, 'depth': 8, 'l2_leaf_reg': 3.6698045378125252, 'bootstrap_type': 'Bernoulli', 'random_strength': 1.526537149330717}
------------------------------


# Для инференса все модели тренируются заново на полном кликстриме (SasRec, SasRec over category, RDLAE, EASE_DAN) (кроме бустингов, бустинги усредняются по фолдам, лучший сабмит был чисто на катбусте)