In [1]:
import os
import random
import sys

# Make the repository root importable before the project-local imports below.
sys.path.append("..")

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from Tools.models.matching.youtube_dnn import YoutubeDNN
from Tools.trainers import MatchTrainer
from Tools.basic.features import DenseFeature, SparseFeature, SequenceFeature
from Tools.utils.match import generate_seq_feature_match, gen_model_input
from Tools.utils.data import df_to_dict, MatchDataGenerator
from movielens_utils import match_evaluation
from YouTubeDNN.data import get_movielens_data

# ---- Run configuration: every value a reader might want to tune ----

# Data and artefact locations (relative to this notebook's directory).
dataset_path = "../data/ml-1m.csv"   # CSV handed to get_movielens_data()
save_dir = '../data/saved/'          # created if missing; passed to the trainer as model_path

# Experiment identity and reproducibility.
model_name = 'youtubednn'            # label only; not read by the cells visible here
seed = 2022                          # fed to torch.manual_seed() before data loading

# Optimisation settings consumed by MatchTrainer / the data loaders.
epoch = 10                           # passed as n_epoch
learning_rate = 1e-4                 # optimizer_params["lr"]
weight_decay = 1e-6                  # optimizer_params["weight_decay"]
batch_size = 2048                    # passed to dg.generate_dataloader()
device = 'cpu'                       # passed as the trainer's device

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ensure the output directory exists. exist_ok=True replaces the separate
# exists()/makedirs() pair, which could race if the directory appeared between
# the check and the create. If save_dir exists but is a regular file, this now
# fails immediately with FileExistsError instead of failing later at save time.
os.makedirs(save_dir, exist_ok=True)

# Seed every RNG in scope, not just torch, so a Restart & Run All reproduces the
# same run. NOTE(review): the data pipeline presumably samples via `random`
# and/or NumPy (e.g. negative sampling) -- confirm in YouTubeDNN.data. Metrics
# recorded before this change may shift slightly once these RNGs are pinned.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Load MovieLens and unpack, in order: the user-side, item-side and
# negative-item feature definitions (all three go to YoutubeDNN), the training
# inputs and labels, the item-side inputs, and the held-out user-side inputs
# (both go to generate_dataloader and match_evaluation).
(
    user_features,
    item_features,
    neg_item_feature,
    x_train,
    y_train,
    all_item,
    test_user,
) = get_movielens_data(dataset_path)

# Wrap the training inputs/labels; the generator later builds the
# train / test-user / item data loaders.
dg = MatchDataGenerator(x=x_train, y=y_train)

preprocess data


generate sequence features: 100%|███████████████████████████████████████████████| 6040/6040 [00:07<00:00, 845.92it/s]


n_train: 988129, n_test: 6040
0 cold start user droped 


In [3]:
# Build the YoutubeDNN matching model from the feature definitions returned by
# get_movielens_data().
model = YoutubeDNN(
    user_features,
    item_features,
    neg_item_feature,
    # NOTE(review): presumably the layer widths of the user tower; the final 16
    # matches the embedding width printed after inference -- confirm.
    user_params={"dims": [128, 64, 16]},
    # NOTE(review): presumably a softmax temperature applied to the similarity
    # scores -- confirm against Tools.models.matching.youtube_dnn.
    temperature=0.02,
)

In [4]:
# Training mode: this run passes mode=2.
# NOTE(review): the note previously here said "mode=1 means pair-wise learning",
# which does not describe the value actually used. In torch-rechub-style
# MatchTrainer, mode 2 is list-wise learning -- confirm against Tools.trainers.
trainer = MatchTrainer(
    model,
    mode=2,
    optimizer_params=dict(lr=learning_rate, weight_decay=weight_decay),
    n_epoch=epoch,
    device=device,
    model_path=save_dir,
)

# Three loaders from the same generator: training batches, the held-out users,
# and the item-side inputs.
train_dl, test_dl, item_dl = dg.generate_dataloader(
    test_user,
    all_item,
    batch_size=batch_size,
)
trainer.fit(train_dl)

print("inference embedding")
# Export one embedding per test user and one per item; both calls are given
# model_path=save_dir, the same path the trainer was configured with.
user_embedding = trainer.inference_embedding(
    model=model,
    mode="user",
    data_loader=test_dl,
    model_path=save_dir,
)
item_embedding = trainer.inference_embedding(
    model=model,
    mode="item",
    data_loader=item_dl,
    model_path=save_dir,
)
print(user_embedding.shape, item_embedding.shape)

# Optional: persist the embeddings for later reuse (disabled by default).
#torch.save(user_embedding.data.cpu(), save_dir + "user_embedding.pth")
#torch.save(item_embedding.data.cpu(), save_dir + "item_embedding.pth")

# Score the user embeddings against the item embeddings with a top-100 cutoff.
match_evaluation(user_embedding, item_embedding, test_user, all_item, topk=100)

epoch: 0


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 37.16it/s, loss=1.86]


epoch: 1


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 39.10it/s, loss=1.54]


epoch: 2


train: 100%|█████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 39.05it/s, loss=1.4]


epoch: 3


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 38.65it/s, loss=1.29]


epoch: 4


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 38.76it/s, loss=1.21]


epoch: 5


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 38.67it/s, loss=1.15]


epoch: 6


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 38.81it/s, loss=1.11]


epoch: 7


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 38.22it/s, loss=1.07]


epoch: 8


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 39.00it/s, loss=1.05]


epoch: 9


train: 100%|████████████████████████████████████████████████████████████| 483/483 [00:12<00:00, 38.95it/s, loss=1.01]


inference embedding


user inference: 100%|██████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.42it/s]
item inference: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.94it/s]


torch.Size([6040, 16]) torch.Size([3706, 16])
evaluate embedding matching on test data
matching for topk
generate ground truth
compute topk metrics
defaultdict(<class 'list'>, {'NDCG': ['NDCG@100: 0.0266'], 'MRR': ['MRR@100: 0.0054'], 'Recall': ['Recall@100: 0.1334'], 'Hit': ['Hit@100: 0.1334'], 'Precision': ['Precision@100: 0.0013']})
