# Data Split

In [None]:
from datetime import timedelta
import polars as pl
import os
# Root folder holding the raw parquet inputs.
DATA_DIR = 'data'
# Folder that receives the train / ground-truth split produced by this notebook.
SAVE_PATH = os.path.join(DATA_DIR, 'val')
# Reuse SAVE_PATH so the created folder and the write target cannot drift apart.
os.makedirs(SAVE_PATH, exist_ok=True)
# Number of trailing clickstream days held out for evaluation.
EVAL_DAYS_TRESHOLD = 14

# Raw inputs for the split.
df_test_users = pl.read_parquet(os.path.join(DATA_DIR, 'test_users.pq'))
df_clickstream = pl.read_parquet(os.path.join(DATA_DIR, 'clickstream.pq'))
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

In [6]:
# Cut-off date: rows up to and including it are training data, later rows are
# candidates for the evaluation ground truth.
treshhold = df_clickstream['event_date'].max() - timedelta(days=EVAL_DAYS_TRESHOLD)

df_train = df_clickstream.filter(pl.col('event_date') <= treshhold)

# Lookup sets used to restrict the ground truth.
contact_events = df_event.filter(pl.col('is_contact') == 1)['event'].unique()
train_cookies = df_train['cookie'].unique()
train_nodes = df_train['node'].unique()

df_eval = (
    df_clickstream
    .filter(pl.col('event_date') > treshhold)
    .select('cookie', 'node', 'event')
    # drop (cookie, node) pairs that already occur in the training period
    .join(df_train, on=['cookie', 'node'], how='anti')
    # keep only events flagged as contacts in the events table
    .filter(pl.col('event').is_in(contact_events))
    # keep only cookies and nodes that are present in the training period
    .filter(pl.col('cookie').is_in(train_cookies))
    .filter(pl.col('node').is_in(train_nodes))
    # one row per (cookie, node) pair
    .unique(['cookie', 'node'])
)

In [13]:
# Persist the training clickstream and the evaluation ground truth.
train_path = os.path.join(SAVE_PATH, 'clickstream.pq')
gt_path = os.path.join(SAVE_PATH, 'gt.pq')

df_train.write_parquet(train_path)
df_eval.write_parquet(gt_path)

# Retrieval Train

## Autoencoders

### EASE_DAN

In [1]:
from autoencoders.model import EASE_DAN
from utils import Enc, convert_to_sparse, process_in_batches, recall_at
import polars as pl
import os
import numpy as np

# --- Configuration -----------------------------------------------------------
DATA_DIR = 'data'
VAL_PATH = os.path.join(DATA_DIR, 'val')
PREDICTION_PATH = 'predictions'
MODEL_NAME = 'EASE_DAN'
EVAL_DAYS_TRESHOLD = 14
N_ITEMS = 30_000

# --- Inputs ------------------------------------------------------------------
# Validation split: training clickstream and ground truth.
df_clickstream = pl.read_parquet(os.path.join(VAL_PATH, 'clickstream.pq'))
df_eval = pl.read_parquet(os.path.join(VAL_PATH, 'gt.pq'))
# Ground truth is limited to cookies that appear in the training clickstream.
df_eval = df_eval.join(df_clickstream, on='cookie', how='semi')

# Auxiliary tables from the raw data folder.
df_cat_features = pl.read_parquet(os.path.join(DATA_DIR, 'cat_features.pq'))
df_event = pl.read_parquet(os.path.join(DATA_DIR, 'events.pq'))

# The whole validation clickstream is used for training.
df_train = df_clickstream
eval_users = df_eval.get_column('cookie').unique().to_list()

In [2]:
enc = Enc(item_key='node', user_key='cookie')

# Restrict training to the N_ITEMS nodes that were seen by the most distinct
# cookies. Sorting on ('len', 'node') breaks ties deterministically: group_by
# does not guarantee an output order, so sorting on 'len' alone could select a
# different set of nodes at the cut-off from one run to the next.
top_nodes = (
    df_train
    .unique(subset=['node', 'cookie'])
    .group_by('node')
    .len()
    .sort(['len', 'node'])
    .tail(N_ITEMS)
    .drop('len')
)
df_train = df_train.join(top_nodes, on='node')

# Every remaining interaction gets the same constant weight.
df_train = df_train.with_columns(
        (pl.lit(1)).alias("event_weight")
    )
# Keep ground truth only for cookies that still have training rows after the
# item filter above.
df_eval = df_eval.join(df_train, on='cookie', how='semi')
df_eval = df_eval.with_columns(pl.col('node').cast(pl.Int64))
result = enc.fit(train_df=df_train, event_weight='event_weight')

# Encode the evaluation users only after the encoder has been fitted.
# `eval_users` was collected before the item filter, so a cookie whose
# interactions were all filtered out has no encoding; `.get` returns None for
# it and it is skipped instead of being passed on as an index.
enc_eval_users = [
    user_idx
    for user_idx in (enc.user_id_dict.get(cookie) for cookie in eval_users)
    if user_idx is not None
]

# Binarise the interaction matrix.
X = (convert_to_sparse(result, enc) > 0).astype(np.float32)
# NOTE(review): if df_train holds fewer than N_ITEMS distinct nodes, X may have
# fewer columns than `num_items` -- confirm EASE_DAN tolerates that.
ease = EASE_DAN(num_items=N_ITEMS)
ease.fit(X)
recommendations_df = process_in_batches(
    enc_eval_users=enc_eval_users,
    X=X,
    G=ease.W,
    k=300, # top_k
    batch_size=1000,
    fill_value=-1000
)
# Map the encoded recommendations back to the original cookie / node ids and
# rank them per cookie by descending score.
recs = enc.inverse_transform(recommendations_df)
recs = recs.with_columns(
    pl.col('score').rank(descending=True).over('cookie').alias('rank_rd'),
    pl.col('cookie').cast(pl.Int64),
    pl.col('node').cast(pl.Int64)
)
print('UNSEEN-RECALL@40', recall_at(df_eval, recs, k=40))
print('UNSEEN-RECALL@100', recall_at(df_eval, recs, k=100))

UNSEEN-RECALL@40 0.17348943701276365
UNSEEN-RECALL@100 0.2819826797115451


In [3]:
# Show the `reg_p` attribute of the fitted model (presumably a regularisation
# hyper-parameter of EASE_DAN -- confirm in autoencoders.model).
ease.reg_p

10

In [5]:
# Make sure the per-model prediction folder exists before anything is written.
prediction_dir = os.path.join(PREDICTION_PATH, MODEL_NAME)
os.makedirs(prediction_dir, exist_ok=True)

In [6]:
# Store the validation recommendations in the per-model prediction folder.
# The file name is derived from MODEL_NAME so it always matches the folder,
# including when this cell is reused for another model.
recs.write_parquet(os.path.join(PREDICTION_PATH, MODEL_NAME, f'{MODEL_NAME}_val.pq'))

### RDLAE

In [None]:
from autoencoders.model import RDLAE
from utils import Enc, convert_to_sparse, process_in_batches, recall_at
import polars as pl
import os
import numpy as np

DATA_DIR = 'data'
VAL_PATH = os.path.join(DATA_DIR, 'val')
PREDICTION_PATH = 'predictions'
MODEL_NAME = 'RDLAE'
EVAL_DAYS_TRESHOLD = 14
N_ITEMS = 30_000