In [1]:
import sys
sys.path.append("..")

import os
import numpy as np
import pandas as pd
import torch

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from Tools.models.matching.dssm import DSSM
from Tools.trainers import MatchTrainer
from Tools.basic.features import DenseFeature, SparseFeature, SequenceFeature
from Tools.utils.match import generate_seq_feature_match, gen_model_input
from Tools.utils.data import df_to_dict, MatchDataGenerator
from movielens_utils import match_evaluation
from DSSM.data import get_movielens_data

# --- Run configuration -------------------------------------------------------
# Everything a reader may want to tune lives here; later cells only read these.

# Paths (relative to the notebook's working directory).
dataset_path = "../data/ml-1m.csv"
save_dir = '../data/saved/'  # passed to the trainer as its model_path

# Model identifier.
model_name = 'dssm'

# Optimisation hyper-parameters.
epoch = 10
batch_size = 2048
learning_rate = 1e-4
weight_decay = 1e-6

# Execution settings.
device = 'cpu'
seed = 2022  # fed to torch.manual_seed in the next cell

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the checkpoint directory up front. exist_ok=True makes this a single
# idempotent call instead of a check-then-create pair, so re-running the cell
# (or racing with another process) cannot fail on an already-existing folder.
os.makedirs(save_dir, exist_ok=True)

# Seed the random number generators before any data preparation or model
# initialisation happens. torch alone is not enough if the data helpers draw
# from NumPy's global RNG.
# NOTE(review): presumably negative sampling inside get_movielens_data uses
# NumPy randomness - confirm; seeding it is harmless either way.
torch.manual_seed(seed)
np.random.seed(seed)

# Load MovieLens and unpack: feature definitions for the two towers, the
# training inputs/labels, the full item table, and the held-out test users.
user_features, item_features, x_train, y_train, all_item, test_user = get_movielens_data(dataset_path)

# Wrap the training arrays; the generator builds the dataloaders later on.
dg = MatchDataGenerator(x=x_train, y=y_train)

preprocess data


generate sequence features: 100%|███████████████████████████████████████████████| 6040/6040 [00:08<00:00, 691.07it/s]


n_train: 3952516, n_test: 6040
0 cold start user droped 


In [3]:
# --- Model -------------------------------------------------------------------
# Both towers use the same MLP layout. Each tower gets its own dict so the two
# arguments never share a mutable object.
user_tower_params = {
    "dims": [256, 128, 64],
    "activation": 'prelu',  # important!!
}
item_tower_params = {
    "dims": [256, 128, 64],
    "activation": 'prelu',  # important!!
}

model = DSSM(
    user_features,
    item_features,
    temperature=0.02,
    user_params=user_tower_params,
    item_params=item_tower_params,
)

# --- Trainer -----------------------------------------------------------------
# Optimizer settings come straight from the configuration cell.
optimizer_settings = {
    "lr": learning_rate,
    "weight_decay": weight_decay,
}

trainer = MatchTrainer(
    model,
    mode=0,  # NOTE(review): training-mode selector of MatchTrainer - confirm meaning of 0
    optimizer_params=optimizer_settings,
    n_epoch=epoch,
    device=device,
    model_path=save_dir,
)

# --- Data loaders and training -----------------------------------------------
# One call yields the training loader plus the user/item loaders that the
# evaluation cell uses for embedding inference.
train_dl, test_dl, item_dl = dg.generate_dataloader(
    test_user,
    all_item,
    batch_size=batch_size,
)
trainer.fit(train_dl)

epoch: 0


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:39<00:00, 19.36it/s, loss=0.567]


epoch: 1


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:44<00:00, 18.52it/s, loss=0.552]


epoch: 2


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:40<00:00, 19.24it/s, loss=0.561]


epoch: 3


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:40<00:00, 19.12it/s, loss=0.557]


epoch: 4


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:39<00:00, 19.30it/s, loss=0.555]


epoch: 5


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:32<00:00, 20.82it/s, loss=0.551]


epoch: 6


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:37<00:00, 19.88it/s, loss=0.554]


epoch: 7


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:38<00:00, 19.69it/s, loss=0.548]


epoch: 8


train: 100%|█████████████████████████████████████████████████████████| 1930/1930 [01:41<00:00, 19.08it/s, loss=0.555]


epoch: 9


train: 100%|██████████████████████████████████████████████████████████| 1930/1930 [01:31<00:00, 20.99it/s, loss=0.55]


In [4]:
print("inference embedding")

# Run the user tower over the test users' loader.
user_embedding = trainer.inference_embedding(
    model=model,
    mode="user",
    data_loader=test_dl,
    model_path=save_dir,
)

# Run the item tower over the loader built from the full item table.
item_embedding = trainer.inference_embedding(
    model=model,
    mode="item",
    data_loader=item_dl,
    model_path=save_dir,
)

# Optional: persist the embeddings for offline use (disabled by default).
#torch.save(user_embedding.data.cpu(), save_dir + "user_embedding.pth")
#torch.save(item_embedding.data.cpu(), save_dir + "item_embedding.pth")

# Score the user embeddings against the item embeddings and report the
# top-100 retrieval metrics for the held-out users.
match_evaluation(
    user_embedding,
    item_embedding,
    test_user,
    all_item,
    topk=100,
)

inference embedding


user inference: 100%|██████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.73it/s]
item inference: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.80it/s]


evaluate embedding matching on test data
matching for topk
generate ground truth
compute topk metrics
defaultdict(<class 'list'>, {'NDCG': ['NDCG@100: 0.0496'], 'MRR': ['MRR@100: 0.0152'], 'Recall': ['Recall@100: 0.2149'], 'Hit': ['Hit@100: 0.2149'], 'Precision': ['Precision@100: 0.0021']})
