In [1]:
ubc_data_path = '../ubc_data_relevant_splitted'
ubc_input_path = f'{ubc_data_path}/input'
target_path = f'{ubc_data_path}/target'

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import os
import sys

# Below paths: bad but managable

# for our_lib
module_path = os.path.abspath(os.path.join('./our_lib')) # or the path to your source code
print(module_path)
sys.path.append(module_path)

# for recsys2025
# module_path = os.path.abspath(os.path.dirname(os.path.abspath(__file__)).join('../../.'))
module_path = os.path.abspath(os.path.join('../.')) # or the path to your source code
sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../recsys2025')) # or the path to your source code
sys.path.append(module_path)


# from our_lib import *
from our_lib import RecSysData
# from our_lib import split_data, create_baseline_embeddings, contest_training
# from our_lib import validate_and_load_embeddings  
%reload_ext autoreload

/home/zmrocze/studia/uwr/sem2/adm/projekt/src/our_lib


# load data

In [3]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [4]:
# load everything
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torchmetrics as tm

In [5]:
from src.our_lib import RecSysData, unique_edges, load_target, drop_duplicates, all_categories_in_df_numpy, all_users_in_df_numpy, all_items_in_df_numpy, join_item_with_category

input_data = RecSysData.read_parquet(ubc_input_path)
product_properties = pd.read_parquet(f'{ubc_data_path}/product_properties.parquet')
all_input_users = input_data.all_users_set()
all_categories = all_categories_in_df_numpy(product_properties)
input_data

RecSysData(
  add_to_cart=1922253,
  page_visit=32941261,
  product_buy=939835,
  product_properties=1197634,
  remove_from_cart=800359,
  search_query=3204721,
  all users: 858489,
  all products: 674374
  all urls: 5185400,
)

In [6]:
delattr(input_data, 'page_visit')
delattr(input_data, 'search_query')

In [7]:
target_path = "../ubc_data_relevant_splitted/target"
target_data = load_target(target_path)
train_target = target_data['train_target']
valid_target = target_data['validation_target']

In [8]:
input_data.add_to_cart = join_item_with_category(input_data.add_to_cart, product_properties)
input_data.remove_from_cart = join_item_with_category(input_data.remove_from_cart, product_properties)
input_data.product_buy = join_item_with_category(input_data.product_buy, product_properties)

In [9]:
first_train_timestamp = train_target['timestamp'].min()
first_valid_timestamp = valid_target['timestamp'].min()

print(f"First timestamp in train_target: {first_train_timestamp}")
print(f"First timestamp in valid_target: {first_valid_timestamp}")

First timestamp in train_target: 2022-09-13 00:03:35
First timestamp in valid_target: 2022-09-27 00:00:15


In [10]:
print(input_data.add_to_cart['timestamp'].min())
print(input_data.remove_from_cart['timestamp'].min())
print(input_data.product_buy['timestamp'].min())
print(input_data.add_to_cart['timestamp'].max())
print(input_data.remove_from_cart['timestamp'].max())
print(input_data.product_buy['timestamp'].max())

2022-05-23 00:10:15
2022-05-23 00:12:20
2022-05-23 00:10:40
2022-09-12 23:59:45
2022-09-12 23:58:10
2022-09-12 23:58:35


In [11]:
# drop duplicates
aaxxrta = 'category' # 'sku'
drop_duplicates(input_data.add_to_cart, subset=['client_id', aaxxrta])
drop_duplicates(input_data.remove_from_cart, subset=['client_id', aaxxrta])
drop_duplicates(input_data.product_buy, subset=['client_id', aaxxrta])

drop_duplicates(train_target, subset=['client_id', aaxxrta])
drop_duplicates(valid_target, subset=['client_id', aaxxrta])

# print all the lengths
print(f"add_to_cart: {len(input_data.add_to_cart)}")
print(f"remove_from_cart: {len(input_data.remove_from_cart)}")
print(f"product_buy: {len(input_data.product_buy)}")
print(f"train_target: {len(train_target)}")
print(f"valid_target: {len(valid_target)}")

print(f"len(users) x len(categories): {len(all_input_users) * len(all_categories)}")

add_to_cart: 1922253
remove_from_cart: 800359
product_buy: 939835
train_target: 130109
valid_target: 148479
len(users) x len(categories): 5480593776


In [12]:
# input_data.print_datasets_stats_nicely("input data")

In [13]:
train_users_set = set(train_target['client_id'].unique())
valid_users_set = set(valid_target['client_id'].unique())

In [14]:
# Get users who have only one interaction across all input data types
all_interactions = pd.concat([
  input_data.add_to_cart[['client_id']],
  input_data.remove_from_cart[['client_id']],
  input_data.product_buy[['client_id']]
])

user_interaction_counts = all_interactions['client_id'].value_counts()
single_interaction_users = set(user_interaction_counts[user_interaction_counts == 1].index)
users_not_in_test = single_interaction_users.difference(valid_users_set).difference(train_users_set)

print(f"Users with single interaction in input_data: {len(single_interaction_users)}")
print(f"Users with single interaction and no interaction in test set: {len(users_not_in_test)}")

Users with single interaction in input_data: 190183
Users with single interaction and no interaction in test set: 181914


In [15]:
# removing these, no information gained
input_data.add_to_cart = input_data.add_to_cart[~input_data.add_to_cart['client_id'].isin(users_not_in_test)]
input_data.remove_from_cart = input_data.remove_from_cart[~input_data.remove_from_cart['client_id'].isin(users_not_in_test)]
input_data.product_buy = input_data.product_buy[~input_data.product_buy['client_id'].isin(users_not_in_test)]

In [16]:
# sanity check
_known_categories = set(input_data.add_to_cart['category'].unique()).union(
    set(input_data.remove_from_cart['category'].unique())).union(
    set(input_data.product_buy['category'].unique()))
known_users = set(input_data.add_to_cart['client_id'].unique()).union(
    set(input_data.remove_from_cart['client_id'].unique())).union(
    set(input_data.product_buy['client_id'].unique()))

len(_known_categories), len(known_users), len(all_categories)

(6130, 485297, 6384)

In [17]:
train_target_known = train_target[train_target['client_id'].isin(known_users)]
valid_target_known = valid_target[valid_target['client_id'].isin(known_users)]
print(f"train_target_known: {len(train_target_known)}")
print(f"valid_target_known: {len(valid_target_known)}")

train_target_known: 60108
valid_target_known: 63112


# training gat

In [18]:
# imports
# from src.our_lib import NodeIdMap, JustGAT, RecGAT, DotproductEdgePredictor, LinearEdgePredictor, BprLossLoader
# from src.our_lib import test_out, l2_reg, train_test_split_pos_edges, int_tensor, unique_edges
from src.our_lib import NodeIdMap, JustGAT, RecGAT, DotproductEdgePredictor, LinearEdgePredictor, BprLossLoader, test_out, l2_reg, train_test_split_pos_edges, int_tensor, unique_edges, loss_f, create_target_from_edge_index, BprTraining
import random

In [19]:
### NOTE!:
### There are 2 types of ids: product has sku    and the id of a product node in graph (can exchange with node_id_map.item_of_id/id_of_item).
#                             user has client_id and the id of a user    node in graph

node_id_map = NodeIdMap(known_users, all_categories)
train_edge_index = node_id_map.make_edges(train_target_known.client_id.values, train_target_known.category.values).to(device=device)
val_edge_index = node_id_map.make_edges(valid_target_known.client_id.values, valid_target_known.category.values).to(device=device)

N = node_id_map.N

In [20]:
# TODO: take negatives from batch items, not from all
neg_samples = 5
batch_size = 16
### these are edges used for loss calculation
### and they define the graph (maybe together with product_buy, remove_from_cart)
train_loader = BprLossLoader(
    edge_index=train_edge_index,
    trg_index_range=(node_id_map.n_users, node_id_map.N), # node index range for all items (used to sample negative items)
    batch_size=batch_size,
    neg_samples=neg_samples,
    random_state=42,
    device=device
)

test_loader = BprLossLoader(
  edge_index=val_edge_index,
  trg_index_range=(node_id_map.n_users, node_id_map.N),  # node index range for all items (used to sample negative items)
  batch_size=batch_size,
  neg_samples=neg_samples,
  random_state=42,
  device=device
)

In [21]:
embedding_dim = 16
edge_dim = 3
num_layers=3
dropout_gat=0.5
init_type='normal'
init_a=0.001
dropout_edge=0.5
lr=0.001
l2_reg=0.0001
accumulate_grad_batches=10
patience=5
lr_reduce_factor=0.5
lr_scheduler_monitor="train_loss"
auroc_batch_size=256
max_epochs=50

def make_model():
  ### define graph
  # new_recgat = lambda: RecGAT(users, items, embedding_dim=embedding_dim, edge_dim=edge_dim, num_layers=2, device=device, dropout=0.3, type='uniform', a=0.0000001)
  new_recgat = lambda: RecGAT(known_users, all_categories, embedding_dim=embedding_dim, edge_dim=edge_dim, num_layers=num_layers, device=device, dropout=dropout_gat, type=init_type, a=init_a)
  recgat = new_recgat()
  
  edge_attr_add_to_cart = torch.tensor([[1, 0, 0]], dtype=torch.float).repeat(len(input_data.add_to_cart), 1).to(device)
  edge_attr_remove_from_cart = torch.tensor([[0, 1, 0]], dtype=torch.float).repeat(len(input_data.remove_from_cart), 1).to(device)
  edge_attr_product_buy = torch.tensor([[0, 0, 1]], dtype=torch.float).repeat(len(input_data.product_buy), 1).to(device)

  recgat.add_edges_from_user_category_df(input_data.add_to_cart, edge_attr=edge_attr_add_to_cart)
  recgat.add_edges_from_user_category_df(input_data.remove_from_cart, edge_attr=edge_attr_remove_from_cart)
  recgat.add_edges_from_user_category_df(input_data.product_buy, edge_attr=edge_attr_product_buy)
  
  ### edge predictor (here: only for add_to_cart edges. todo: predict more stuff (like next purchased category in next 14days from propensity categories) with more varied predictors and combine losses)
  edge_predictor = LinearEdgePredictor(embedding_dim=embedding_dim, dropout=dropout_edge).to(device=device)

  model = BprTraining(
      recgat=recgat,
      edge_predictor=edge_predictor,
      lr=lr,
      l2_reg=l2_reg,
      # propensity_sku=sampled_items, # for auroc calc
      auroc_batch_size=auroc_batch_size,
      val_edge_index=val_edge_index,
      patience=patience,
      factor=lr_reduce_factor,
      lr_scheduler_monitor=lr_scheduler_monitor,
      # forward_gat_every_n = 20, ### tradeoff : speed vs training stability
      # this has to be fixed and work with n>1, this is slow, use big batch to make up for it:
      # forward_gat_every_n = accumulate_grad_batches,
    )

  ### sillynes
  # model = remake_model_if_loss_nan(model)
  return model


In [None]:
# md = make_model()
# md.auroc()

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fd062f8ac60>>
Traceback (most recent call last):
  File "/home/zmrocze/studia/uwr/sem2/adm/projekt/.mamba/envs/my-mamba-environment/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger

model_checkpoint_val = ModelCheckpoint(
    monitor='val_loss',
    auto_insert_metric_name=True,
    save_top_k=2,
    mode='min',
)
model_checkpoint_auroc = ModelCheckpoint(
  monitor='val_auroc',
  auto_insert_metric_name=True,
  mode = 'max',
  save_top_k=2,
)

# lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau()

# model = remake_model_if_loss_nan(make_model)
model = make_model()

log_model = True
trial_k = random.randint(0, 100000)
wandb_logger = WandbLogger(
  # project="recsys_categories",
  project="recsys2025",
  name=f"recgat_bpr_{trial_k}",
  log_model=True
  )
wandb_logger.experiment.config.update({
  "embedding_dim": embedding_dim,
  "edge_dim": edge_dim,
  "num_layers": num_layers,
  "dropout_gat": dropout_gat,
  "init_type": init_type,
  "init_a": init_a,
  "dropout_edge": dropout_edge,
  "lr": lr,
  "l2_reg": l2_reg,
  "batch_size": batch_size,
  "auroc_batch_size": auroc_batch_size,
  "accumulate_grad_batches": accumulate_grad_batches,
  "neg_samples": neg_samples,
  "log_model": log_model,
  "graph_items": model.recgat.node_id_map.n_items,
  "graph_users": model.recgat.node_id_map.n_users,
  "graph_edges_n": model.recgat.edge_index.size(1)
})

trainer = pl.Trainer(  max_epochs=max_epochs
                     , check_val_every_n_epoch=accumulate_grad_batches
                     , logger=wandb_logger
                     , callbacks=[model_checkpoint_auroc, model_checkpoint_val, ]
                     , accumulate_grad_batches=accumulate_grad_batches
                    #  , terminate_on_nan=True
                     )

# print("test_loader", test_loader)
# print("test_edges.shape", test_edges.shape)
wandb_logger.watch(model, log="all")

trainer.fit(model, train_loader, val_dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Currently logged in as: [33mzmrocze[0m ([33mzmrocze-uniwroc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type                | Params | Mode 
---------------------------------------------------------------
0 | recgat         | RecGAT              | 50.4 M | train
1 | edge_predictor | LinearEdgePredictor | 32.9 K | train
---------------------------------------------------------------
50.4 M    Trainable params
0         Non-trainable params
50.4 M    Total params
201.717   Total estimated model params size (MB)
32        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
# model.lr = 0.0001