In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-1m-dataset/users.dat
/kaggle/input/movielens-1m-dataset/ratings.dat
/kaggle/input/movielens-1m-dataset/README
/kaggle/input/movielens-1m-dataset/movies.dat


Unnamed: 0,gender,user_id,occupation,zip-code
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [11]:
df_item = pd.read_csv('/kaggle/input/movielens-1m-dataset/ratings.dat', sep='::', engine='python',
                 names=['movie_id', 'title', 'genres'])
df_item.head()

Unnamed: 0,movie_id,title,genres
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291


In [6]:
users_df = pd.read_csv('/kaggle/input/movielens-1m-dataset/users.dat',
                       header=None, 
                       sep='::', 
                       names=['UserID','Gender','Age','Occupation','Zip-code'], 
                       engine='python',
                       encoding='latin-1')


movies_df = pd.read_csv('/kaggle/input/movielens-1m-dataset/movies.dat',
                        header=None,
                        sep='::',
                        names=['MovieID', 'Title', 'Genre'], 
                        engine='python',
                        encoding='latin-1')

ratings_df = pd.read_csv('/kaggle/input/movielens-1m-dataset/ratings.dat',
                         header=None,
                         sep='::',
                         names=['UserID','MovieID','Rating','Timestamp'], 
                         engine='python',
                         encoding='latin-1')
#使用engine是因为sep为：:,如果是单字符就无所谓

In [12]:
data = pd.merge(pd.merge(ratings_df, users_df), movies_df) #自动合并，不足的行默认删除
data.head(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genre
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [13]:
#对离散向量和连续向量进行分类
user_col, item_col = "UserID", "MovieID"
sparse_features = ['UserID', 'MovieID', 'Gender', 'Age', 'Occupation', 'Zip-code', "Genre"]
dense_features = []

In [14]:
save_dir = '/kaggle/working//ml-1m/saved/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    

In [15]:

# 对SparseFeature进行LabelEncoding
from sklearn.preprocessing import LabelEncoder
print(data[sparse_features].head())
feature_max_idx = {}
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature]) + 1  # 删除 0 值
    feature_max_idx[feature] = data[feature].max() + 1  # 多出来的 1 应该是为了 unseen 类别做保留，比如新商品、新用户
    if feature == user_col:  # lbe.classes_的值会随着 lbe.fit_transform 处理的数据而变化，有对应关系；leb.classes_是类属性
        user_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)}  #encode user id: raw user id
    if feature == item_col:
        item_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)}  #encode item id: raw item id

   UserID  MovieID Gender  Age  Occupation Zip-code  \
0       1     1193      F    1          10    48067   
1       1      661      F    1          10    48067   
2       1      914      F    1          10    48067   
3       1     3408      F    1          10    48067   
4       1     2355      F    1          10    48067   

                          Genre  
0                         Drama  
1  Animation|Children's|Musical  
2               Musical|Romance  
3                         Drama  
4   Animation|Children's|Comedy  


In [16]:
np.save(save_dir+"raw_id_maps.npy", (user_map, item_map))  # evaluation时会用到
print('LabelEncoding后：')
print(data[sparse_features].head())

LabelEncoding后：
   UserID  MovieID  Gender  Age  Occupation  Zip-code  Genre
0       1     1105       1    1          11      1589    240
1       1      640       1    1          11      1589    153
2       1      854       1    1          11      1589    283
3       1     3178       1    1          11      1589    240
4       1     2163       1    1          11      1589    146


In [17]:
# 定义两个塔对应哪些特征
user_cols = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
item_cols = ['MovieID', "Genre"]

# 从data中取出相应的数据
user_profile = data[user_cols].drop_duplicates('UserID')  # 去重
item_profile = data[item_cols].drop_duplicates('MovieID')
print(user_profile.head())
print(item_profile.head())

     UserID  Gender  Age  Occupation  Zip-code
0         1       1    1          11      1589
53        2       2    7          17      2249
182       3       2    3          16      1864
233       4       2    5           8       141
254       5       2    3          21      1939
   MovieID  Genre
0     1105    240
1      640    153
2      854    283
3     3178    240
4     2163    146


In [29]:
def generate_seq_feature_match(data,
                               user_col,
                               item_col,
                               time_col,
                               item_attribute_cols=[],
                               sample_method=0,
                               mode=0,
                               neg_ratio=0,
                               min_item=0):
    """generate sequence feature and negative sample for match.

    Args:
        data (pd.DataFrame): the raw data.
        user_col (str): the col name of user_id 
        item_col (str): the col name of item_id 
        time_col (str): the col name of timestamp
        item_attribute_cols (list[str], optional): the other attribute cols of item which you want to generate sequence feature. Defaults to `[]`.
        sample_method (int, optional): the negative sample method `{
            0: "random sampling", 
            1: "popularity sampling method used in word2vec", 
            2: "popularity sampling method by `log(count+1)+1e-6`",
            3: "tencent RALM sampling"}`. 
            Defaults to 0.
        mode (int, optional): the training mode, `{0:point-wise, 1:pair-wise, 2:list-wise}`. Defaults to 0.
        neg_ratio (int, optional): negative sample ratio, >= 1. Defaults to 0.
        min_item (int, optional): the min item each user must have. Defaults to 0.

    Returns:
        pd.DataFrame: split train and test data with sequence features.
    """
    if mode == 2:  # list wise learning
        assert neg_ratio > 0, 'neg_ratio must be greater than 0 when list-wise learning'
    elif mode == 1:  # pair wise learning
        neg_ratio = 1
    print("preprocess data")
    data.sort_values(time_col, inplace=True)  #sort by time from old to new
    train_set, test_set = [], []
    n_cold_user = 0

    items_cnt = Counter(data[item_col].tolist()) #计算物品出现的次数即热门程度
    items_cnt_order = OrderedDict(sorted((items_cnt.items()), key=lambda x: x[1], reverse=True))  #item_id:item count
    neg_list = negative_sample(items_cnt_order, ratio=data.shape[0] * neg_ratio, method_id=sample_method)
    neg_idx = 0
    for uid, hist in tqdm.tqdm(data.groupby(user_col), desc='generate sequence features'):
        pos_list = hist[item_col].tolist()
        if len(pos_list) < min_item:  #drop this user when his pos items < min_item
            n_cold_user += 1
            continue

        for i in range(1, len(pos_list)):
            hist_item = pos_list[:i] #时间序列窗口
            sample = [uid, pos_list[i], hist_item, len(hist_item)]
            if len(item_attribute_cols) > 0:
                for attr_col in item_attribute_cols:  #the history of item attribute features
                    sample.append(hist[attr_col].tolist()[:i])
            if i != len(pos_list) - 1:
                if mode == 0:  #point-wise, the last col is label_col, include label 0 and 1
                    last_col = "label"
                    train_set.append(sample + [1]) #正样本
                    for _ in range(neg_ratio):
                        sample[1] = neg_list[neg_idx]
                        neg_idx += 1
                        train_set.append(sample + [0])
                elif mode == 1:  #pair-wise, the last col is neg_col, include one negative item
                    last_col = "neg_items"
                    for _ in range(neg_ratio):
                        sample_copy = copy.deepcopy(sample)
                        sample_copy.append(neg_list[neg_idx])
                        neg_idx += 1
                        train_set.append(sample_copy)
                elif mode == 2:  #list-wise, the last col is neg_col, include neg_ratio negative items
                    last_col = "neg_items"
                    sample.append(neg_list[neg_idx: neg_idx + neg_ratio])
                    neg_idx += neg_ratio
                    train_set.append(sample)
                else:
                    raise ValueError("mode should in (0,1,2)")
            else:
                test_set.append(sample + [1])  #Note: if mode=1 or 2, the label col is useless.

    random.shuffle(train_set)
    random.shuffle(test_set)

    print("n_train: %d, n_test: %d" % (len(train_set), len(test_set)))
    print("%d cold start user droped " % (n_cold_user))

    attr_hist_col = ["hist_" + col for col in item_attribute_cols]
    df_train = pd.DataFrame(train_set,
                            columns=[user_col, item_col, "hist_" + item_col, "histlen_" + item_col] + attr_hist_col + [last_col])
    df_test = pd.DataFrame(test_set,
                           columns=[user_col, item_col, "hist_" + item_col, "histlen_" + item_col] + attr_hist_col + [last_col])

    return df_train, df_test

In [30]:
def gen_model_input(df, user_profile, user_col, item_profile, item_col, seq_max_len, padding='pre', truncating='pre'):
    # merge user_profile and item_profile, pad history seuence feature
    df = pd.merge(df, user_profile, on=user_col, how='left')  # how=left to keep samples order same as the input
    df = pd.merge(df, item_profile, on=item_col, how='left')
    for col in df.columns.to_list():
        if col.startswith("hist_"):
            df[col] = pad_sequences(df[col], maxlen=seq_max_len, value=0, padding=padding, truncating=truncating).tolist()
    input_dict = df_to_dict(df)
    return input_dict

In [10]:
! pip install torch_rechub

Collecting torch_rechub
  Downloading torch_rechub-0.0.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.7.0->torch_rechub)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.7.0->torch_rechub)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.7.0->torch_rechub)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.7.0->torch_rechub)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.7.0->torch_rechub)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (fr

In [12]:
! pip install pymilvus

Collecting pymilvus
  Downloading pymilvus-2.6.1-py3-none-any.whl.metadata (6.5 kB)
Collecting protobuf>=5.27.2 (from pymilvus)
  Downloading protobuf-6.32.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting python-dotenv<2.0.0,>=1.0.1 (from pymilvus)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading pymilvus-2.6.1-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.3/254.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl (55.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading protobuf-6.32.1-cp39-abi3-manylinux2014_x86_64.whl (322 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [31]:
from torch_rechub.utils.match import generate_seq_feature_match, gen_model_input
df_train, df_test = generate_seq_feature_match(data,user_col,item_col,time_col="Timestamp",item_attribute_cols=[],sample_method=1, mode=0,neg_ratio=3,min_item=0) # 该函数将在 1.5 中讲解
print(df_train.head())
print(df_test.head())

x_train = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)  # 该函数将在 1.5 中讲解
y_train = x_train["label"]
x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)
y_test = x_test["label"]
del x_train["label"]  # 删除 y 值
del x_test["label"]

print({k: v[:3] for k, v in x_train.items()})

preprocess data


generate sequence features: 100%|██████████| 6040/6040 [00:31<00:00, 190.42it/s] 


n_train: 3952516, n_test: 6040
0 cold start user dropped 
   UserID  MovieID                                       hist_MovieID  \
0    3621      297  [2001, 108, 2985, 860, 361, 1121, 1782, 2965, ...   
1    5592     1540  [1274, 1483, 577, 1513, 2087, 1487, 3342, 3221...   
2    5519     2006  [2427, 1019, 1132, 3460, 2462, 2372, 714, 528,...   
3     637     1421  [848, 1278, 2588, 1905, 864, 1114, 955, 905, 3...   
4    1899     1442  [2657, 1107, 3176, 2415, 2856, 1140, 1260, 855...   

   histlen_MovieID  label  
0              201      0  
1               55      1  
2              240      0  
3              314      0  
4              659      1  
   UserID  MovieID                                       hist_MovieID  \
0    2202     3074  [1635, 1004, 2363, 1114, 3578, 3652, 3702, 102...   
1       8     3034  [1121, 108, 848, 3249, 467, 1446, 576, 384, 31...   
2      40     2612  [860, 2330, 511, 1121, 3024, 909, 859, 1171, 1...   
3      50     3616  [2146, 23, 842, 1828, 5

In [13]:
print(data.columns)

Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Gender', 'Age',
       'Occupation', 'Zip-code', 'Title', 'Genre'],
      dtype='object')


In [19]:
print(df_train.columns)

Index(['UserID', 'UserID', 'hist_UserID', 'histlen_UserID', 'label'], dtype='object')


In [33]:
from torch_rechub.basic.features import SparseFeature, SequenceFeature

# embed_dim 是指定 LabelEncoder 的维度，会通过训练来自动学习到合适的 Lookup table
user_features = [
    SparseFeature(feature_name, vocab_size=feature_max_idx[feature_name], embed_dim=16) for feature_name in user_cols
]
user_features += [
    SequenceFeature("hist_MovieID", vocab_size=feature_max_idx["MovieID"], embed_dim=16, pooling="mean", shared_with="MovieID") # mean pooling，会对历史观影的 embedding 做平均运算
]

item_features = [
    SparseFeature(feature_name, vocab_size=feature_max_idx[feature_name], embed_dim=16) for feature_name in item_cols
]

print(user_features[1].name)
print(user_features[1].get_embedding_layer())
print(user_features[1].get_embedding_layer()._parameters)

Gender
Embedding(3, 16)
{'weight': Parameter containing:
tensor([[-2.4859e-04,  5.0269e-06, -7.7867e-05,  1.7075e-04, -2.7757e-05,
          1.0075e-04, -9.0587e-05,  1.0367e-04, -8.2237e-05,  1.8602e-05,
         -2.9225e-05, -4.2575e-05, -5.1328e-05,  8.4033e-06,  7.4663e-05,
          1.2297e-04],
        [-2.0307e-05,  1.0396e-04, -4.5262e-05,  7.2058e-05, -1.2348e-04,
         -7.7943e-05,  8.5386e-05, -1.1952e-04,  9.4706e-05, -3.4794e-05,
         -3.1032e-05,  1.0975e-04, -2.7145e-05, -1.4672e-04,  3.7197e-05,
         -2.6488e-05],
        [ 1.3902e-04, -1.2960e-04, -1.7968e-05,  5.4971e-05, -5.2731e-05,
         -5.4153e-05,  1.2404e-04, -6.7036e-05,  1.3234e-04,  4.0145e-05,
         -3.7024e-05,  8.2626e-05, -7.0833e-05, -3.6584e-05, -6.9400e-05,
          1.2227e-04]], requires_grad=True)}


In [34]:
# 将dataframe转为dict
from torch_rechub.utils.data import df_to_dict
all_item = df_to_dict(item_profile)
test_user = x_test
print({k: v[:3] for k, v in all_item.items()})
print({k: v[0] for k, v in test_user.items()})

{'MovieID': array([1105,  640,  854]), 'Genre': array([240, 153, 283])}
{'UserID': 2202, 'MovieID': 3074, 'hist_MovieID': array([1873,  252,  310, 2204, 1694, 1174, 2558, 2308,   17,    1, 2496,
         34, 2203,  347, 3030,   11,   39, 2129,  428, 1399,  978, 2405,
        575,  137,  102,  743, 2376,  225, 2555,  363, 1716, 2470,  526,
       1458, 2502,  211, 1594,  330, 3024, 2385,  573,  800,  642, 1730,
       1945, 3178, 3280, 3556, 3076, 3006]), 'histlen_MovieID': 70, 'Gender': 1, 'Age': 1, 'Occupation': 11, 'Zip-code': 504, 'Genre': 186}


In [36]:
import torch
class DSSM(torch.nn.Module):
    """Deep Structured Semantic Model

    Args:
        user_features (list[Feature Class]): training by the user tower module.
        item_features (list[Feature Class]): training by the item tower module.
        temperature (float): temperature factor for similarity score, default to 1.0.
        user_params (dict): the params of the User Tower module, keys include:`{"dims":list, "activation":str, "dropout":float, "output_layer":bool`}.
        item_params (dict): the params of the Item Tower module, keys include:`{"dims":list, "activation":str, "dropout":float, "output_layer":bool`}.
    """

    def __init__(self, user_features, item_features, user_params, item_params, temperature=1.0):
        super().__init__()
        self.user_features = user_features
        self.item_features = item_features
        self.temperature = temperature
        self.user_dims = sum([fea.embed_dim for fea in user_features])
        self.item_dims = sum([fea.embed_dim for fea in item_features])

        self.embedding = EmbeddingLayer(user_features + item_features)
        self.user_mlp = MLP(self.user_dims, output_layer=False, **user_params)
        self.item_mlp = MLP(self.item_dims, output_layer=False, **item_params)
        self.mode = None

    def forward(self, x):
        user_embedding = self.user_tower(x)
        item_embedding = self.item_tower(x)
        if self.mode == "user":
            return user_embedding
        if self.mode == "item":
            return item_embedding

        # calculate cosine score
        y = torch.mul(user_embedding, item_embedding).sum(dim=1)
        # y = y / self.temperature
        return torch.sigmoid(y)

    def user_tower(self, x):
        if self.mode == "item":
            return None
        input_user = self.embedding(x, self.user_features, squeeze_dim=True)  #[batch_size, num_features*deep_dims]
        user_embedding = self.user_mlp(input_user)  #[batch_size, user_params["dims"][-1]]
        user_embedding = F.normalize(user_embedding, p=2, dim=1)  # L2 normalize
        return user_embedding

    def item_tower(self, x):
        if self.mode == "user":
            return None
        input_item = self.embedding(x, self.item_features, squeeze_dim=True)  #[batch_size, num_features*embed_dim]
        item_embedding = self.item_mlp(input_item)  #[batch_size, item_params["dims"][-1]]
        item_embedding = F.normalize(item_embedding, p=2, dim=1)
        return item_embedding


In [37]:
from torch_rechub.models.matching import DSSM
from torch_rechub.trainers import MatchTrainer
from torch_rechub.utils.data import MatchDataGenerator

# 根据之前处理的数据拿到Dataloader
dg = MatchDataGenerator(x=x_train, y=y_train)
train_dl, test_dl, item_dl = dg.generate_dataloader(test_user, all_item, batch_size=256)

# 定义模型
model = DSSM(user_features, item_features, temperature=0.02,  # 在归一化之后的向量计算內积之后，乘一个固定的超参 r ，论文中命名为温度系数。归一化后如果不乘 temperature，模型无法收敛
             user_params={
                 "dims": [256, 128, 64],
                 "activation": 'prelu',  # important!!
             },
             item_params={
                 "dims": [256, 128, 64],
                 "activation": 'prelu',  # important!!
             })

# 模型训练器
trainer = MatchTrainer(model,
                       mode=0,  # 同上面的mode，需保持一致
                       optimizer_params={
                           "lr": 1e-4,
                           "weight_decay": 1e-6
                       },
                       n_epoch=10,
                       device='cpu',
                       model_path=save_dir)

# 开始训练
trainer.fit(train_dl)




epoch: 0


train: 100%|██████████| 15440/15440 [07:18<00:00, 35.19it/s, loss=0.551]


epoch: 1


train: 100%|██████████| 15440/15440 [07:23<00:00, 34.80it/s, loss=0.566]


epoch: 2


train: 100%|██████████| 15440/15440 [07:16<00:00, 35.36it/s, loss=0.562]


epoch: 3


train: 100%|██████████| 15440/15440 [07:23<00:00, 34.83it/s, loss=0.548]


epoch: 4


train: 100%|██████████| 15440/15440 [07:28<00:00, 34.46it/s, loss=0.549]


epoch: 5


train: 100%|██████████| 15440/15440 [08:13<00:00, 31.31it/s, loss=0.551]


epoch: 6


train: 100%|██████████| 15440/15440 [08:11<00:00, 31.39it/s, loss=0.551]


epoch: 7


train: 100%|██████████| 15440/15440 [07:55<00:00, 32.44it/s, loss=0.552]


epoch: 8


train: 100%|██████████| 15440/15440 [08:09<00:00, 31.56it/s, loss=0.55] 


epoch: 9


train: 100%|██████████| 15440/15440 [08:05<00:00, 31.83it/s, loss=0.546]


In [38]:
import collections
import numpy as np
import pandas as pd
from torch_rechub.utils.match import Annoy
from torch_rechub.basic.metric import topk_metrics

def match_evaluation(user_embedding, item_embedding, test_user, all_item, user_col='UserID', item_col='MovieID',
                     raw_id_maps="./raw_id_maps.npy", topk=10):
    print("evaluate embedding matching on test data")
    annoy = Annoy(n_trees=10)
    annoy.fit(item_embedding)

    #for each user of test dataset, get ann search topk result
    print("matching for topk")
    user_map, item_map = np.load(raw_id_maps, allow_pickle=True)
    match_res = collections.defaultdict(dict)  # user id -> predicted item ids
    for user_id, user_emb in zip(test_user[user_col], user_embedding):
        items_idx, items_scores = annoy.query(v=user_emb, n=topk)  #the index of topk match items
        match_res[user_map[user_id]] = np.vectorize(item_map.get)(all_item[item_col][items_idx])

    #get ground truth
    print("generate ground truth")

    data = pd.DataFrame({user_col: test_user[user_col], item_col: test_user[item_col]})
    data[user_col] = data[user_col].map(user_map)
    data[item_col] = data[item_col].map(item_map)
    user_pos_item = data.groupby(user_col).agg(list).reset_index()
    ground_truth = dict(zip(user_pos_item[user_col], user_pos_item[item_col]))  # user id -> ground truth

    print("compute topk metrics")
    out = topk_metrics(y_true=ground_truth, y_pred=match_res, topKs=[topk])
    return out

user_embedding = trainer.inference_embedding(model=model, mode="user", data_loader=test_dl, model_path=save_dir)
item_embedding = trainer.inference_embedding(model=model, mode="item", data_loader=item_dl, model_path=save_dir)

match_evaluation(user_embedding, item_embedding, test_user, all_item, topk=10, raw_id_maps=save_dir+"raw_id_maps.npy")

user inference: 100%|██████████| 24/24 [00:01<00:00, 19.45it/s]
item inference: 100%|██████████| 15/15 [00:01<00:00, 14.94it/s]


evaluate embedding matching on test data
matching for topk
generate ground truth
compute topk metrics


defaultdict(list,
            {'NDCG': ['NDCG@10: 0.0144'],
             'MRR': ['MRR@10: 0.0101'],
             'Recall': ['Recall@10: 0.029'],
             'Hit': ['Hit@10: 0.029'],
             'Precision': ['Precision@10: 0.0029']})