## Update scipy.sparse API of the public datasets

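Re-pickle each interaction matrix as a `coo_matrix` with the installed SciPy version to clear the warnings raised when the original pickles are loaded. Each converted matrix is written next to its source file as `*_new.pkl`.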

In [6]:
import pickle
from scipy.sparse import coo_matrix

def redump(pkl_path: str):
    """Re-pickle the object stored at ``pkl_path`` as a ``coo_matrix``.

    The pickled object is loaded, passed to ``coo_matrix`` and dumped to a
    sibling file whose name carries a ``_new`` marker before the ``.pkl``
    extension (``trnMat.pkl`` -> ``trnMat_new.pkl``). The input file itself
    is never written to.

    Args:
        pkl_path: Path of the pickle file to convert.
    """
    suffix = ".pkl"
    # Only rewrite a trailing extension. A blanket ``str.replace`` would also
    # rewrite ".pkl" inside directory names, and would return the input path
    # unchanged (so the dump overwrote the source) when ".pkl" was absent.
    if pkl_path.endswith(suffix):
        new_path = pkl_path[: -len(suffix)] + "_new" + suffix
    else:
        new_path = pkl_path + "_new" + suffix

    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)
    coo = coo_matrix(data)
    with open(new_path, "wb") as f:
        pickle.dump(coo, f)

In [7]:
# Convert the three tiktok interaction pickles (train / validation / test).
for tiktok_pkl in (
    "datasets/tiktok/trnMat.pkl",
    "datasets/tiktok/valMat.pkl",
    "datasets/tiktok/tstMat.pkl",
):
    redump(tiktok_pkl)

In [8]:
# Convert the three sports interaction pickles (train / validation / test).
for sports_pkl in (
    "datasets/sports/trnMat.pkl",
    "datasets/sports/valMat.pkl",
    "datasets/sports/tstMat.pkl",
):
    redump(sports_pkl)

  data = pickle.load(f)


## Show data statistics

In [1]:
import pickle
from scipy.sparse import coo_matrix
import numpy as np

def show_data(data_path: str):
    """Print the type, shape and stored-entry count of a pickled matrix.

    Args:
        data_path: Path of a pickle file whose object exposes ``shape`` and
            ``nnz`` attributes.
    """
    with open(data_path, "rb") as handle:
        matrix = pickle.load(handle)
    print(f"type: {type(matrix)}, shape: {matrix.shape}, nnz: {matrix.nnz}")

def show_feats(feats_path: str):
    """Print the type and shape of an array loaded with ``np.load``.

    Args:
        feats_path: Path of the file handed to ``np.load``.
    """
    loaded = np.load(feats_path)
    print(f"type: {type(loaded)}, shape: {loaded.shape}")

In [2]:
# Report the train / validation / test matrices of every dataset in turn.
datasets = ["tiktok", "yelp", "sports"]
for ds in datasets:
    print(f"Dataset: {ds}")
    for split_pkl in (
        f"datasets/{ds}/trnMat.pkl",
        f"datasets/{ds}/valMat.pkl",
        f"datasets/{ds}/tstMat.pkl",
    ):
        show_data(split_pkl)
    print()

Dataset: tiktok
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (9308, 6710), nnz: 59541
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (9308, 6710), nnz: 3051
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (9308, 6710), nnz: 6130

Dataset: yelp
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (37397, 32491), nnz: 165008
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (37397, 32491), nnz: 47147
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (37397, 32491), nnz: 23580

Dataset: sports
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (35598, 18357), nnz: 218409
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (35598, 18357), nnz: 40029
type: <class 'scipy.sparse._coo.coo_matrix'>, shape: (35598, 18357), nnz: 37899



In [4]:
# Report the feature arrays of every dataset; only tiktok has an audio file.
datasets = ["tiktok", "yelp", "sports"]
for ds in datasets:
    print(f"Dataset: {ds}")
    show_feats(f"datasets/{ds}/image_feat.npy")
    show_feats(f"datasets/{ds}/text_feat.npy")
    if ds == "tiktok":
        show_feats(f"datasets/{ds}/audio_feat.npy")
    print()

Dataset: tiktok
type: <class 'numpy.ndarray'>, shape: (6710, 128)
type: <class 'numpy.ndarray'>, shape: (6710, 768)
type: <class 'numpy.ndarray'>, shape: (6710, 128)

Dataset: yelp
type: <class 'numpy.ndarray'>, shape: (32491, 512)
type: <class 'numpy.ndarray'>, shape: (32491, 1024)

Dataset: sports
type: <class 'numpy.ndarray'>, shape: (18357, 4096)
type: <class 'numpy.ndarray'>, shape: (18357, 1024)



## Reproduction

The test routine below is extracted from the main code. It keeps the same seed and loads the trained user/item embeddings to reproduce the evaluation metrics.

In [1]:
from safetensors.torch import load_file
import torch
from torch import Tensor
import torch.utils.data as dataloader
from utils import set_seed, cal_metrics
from utils.conf import load_config
from data import DataHandler

def predict(config_path, tensor_path) -> None:
    """Score saved user/item embeddings on the test split and print the metrics.

    Loads the config, seeds the run with ``config.base.seed``, builds the test
    loader from ``DataHandler``, reads the ``'user'`` and ``'item'`` tensors
    from the safetensors file, and accumulates the three values returned by
    ``cal_metrics`` over every test batch. Each total is divided by
    ``len(handler.test_data)`` and the resulting dict is printed.

    Runs on CUDA when available and on CPU otherwise.

    Args:
        config_path: Path passed to ``load_config``.
        tensor_path: Path of the safetensors file holding the embeddings.
    """
    config = load_config(config_path)
    set_seed(config.base.seed)
    handler = DataHandler(config)
    handler.load_data()
    test_loader = dataloader.DataLoader(
        handler.test_data,
        batch_size=config.train.batch,
        shuffle=False,
        num_workers=0,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load onto the device selected above. A hard-coded "cuda" here defeats
    # the CPU fallback and fails on machines without a GPU.
    embs = load_file(tensor_path, device=device.type)
    user_embs = embs['user']
    item_embs = embs['item']

    epoch_recall, epoch_ndcg, epoch_precision = 0, 0, 0
    data_length = len(handler.test_data)
    topk = config.base.topk

    for usr, mask in test_loader:
        # .to(device) works for both CPU and CUDA; .cuda() rejects a CPU device.
        usr = usr.long().to(device)
        mask = mask.to(device)
        # Entries where mask == 1 are pushed to -1e8 so they never reach the
        # top-k (presumably already-seen training items - TODO confirm).
        scores = torch.mm(user_embs[usr], torch.transpose(item_embs, 1, 0)) * (1 - mask) - mask * 1e8
        _, top_idxs = torch.topk(scores, topk)
        recall, ndcg, precision = cal_metrics(
            topk, top_idxs.cpu().numpy(), handler.test_data.test_user_its, usr
        )
        epoch_recall += recall
        epoch_ndcg += ndcg
        epoch_precision += precision

    result = {
        'Recall': epoch_recall / data_length,
        'NDCG': epoch_ndcg / data_length,
        'Precision': epoch_precision / data_length,
    }
    print(result)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reproduce the sports run from its config and the persisted embeddings.
config_path, tensor_path = (
    "conf/sports.toml",
    "persist/sports/2025-05-18_16-01-52_embs.safetensors",
)
predict(config_path=config_path, tensor_path=tensor_path)

{'Recall': 0.10834955709808229, 'NDCG': 0.049213763468739694, 'Precision': 0.005725040732625417}
