In [1]:
import os
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import faiss
from sentence_transformers import SentenceTransformer
import sklearn.metrics as metrics
import argparse

import json

from datasets import Dataset
from torch.utils.data import DataLoader
from torch.utils import data
from tokenizers.processors import TemplateProcessing

import torch

from torch.utils import data
from transformers import AutoTokenizer
from transformers import AutoModel, AdamW, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sentences = ["This is an example sentence", "Each sentence is converted"]

In [3]:
# model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [4]:
# embeddings = model.encode(sentences)
# print(embeddings)

In [5]:
# embeddings.shape

In [6]:
# embeddings

In [7]:
# torch.from_numpy(embeddings).type(torch.FloatTensor)

In [8]:
import pickle

with open("item_triplet_dataset.pickle", "rb") as fr:
    kg_triplet_dataset = pickle.load(fr)

In [9]:
import pickle

with open("user_dataset.pickle", "rb") as fr:
    user_dataset = pickle.load(fr)

In [10]:
lm_mp = {'roberta': 'roberta-base',
         'distilbert': 'distilbert-base-uncased'}

def get_tokenizer(lm):
    if lm in lm_mp:
        return AutoTokenizer.from_pretrained(lm_mp[lm])
    else:
        return AutoTokenizer.from_pretrained(lm)

In [11]:
tokenizer = get_tokenizer('roberta-base')

In [12]:
mid_set = set(list(kg_triplet_dataset.keys()))

In [13]:
import random

random.sample(list(range(10)), 4)

[2, 7, 0, 9]

In [14]:
"TITLE " + " TITLE ".join(list(user_dataset[0][0].values())[1])

"TITLE U Turn TITLE Bob Roberts TITLE Silence of the Lambs, The TITLE City of Lost Children, The TITLE Postino, Il TITLE As Good As It Gets TITLE Philadelphia TITLE Batman TITLE Secret of Roan Inish, The TITLE Schindler's List TITLE Executive Decision TITLE Wolf TITLE Get Shorty TITLE Starship Troopers TITLE Fifth Element, The TITLE Alice in Wonderland TITLE Time to Kill, A TITLE Alien: Resurrection TITLE Men in Black TITLE Shawshank Redemption, The TITLE Gattaca TITLE Heavenly Creatures TITLE Wrong Trousers, The TITLE Lone Star TITLE Blues Brothers, The TITLE Terminator, The TITLE Abyss, The TITLE From Dusk Till Dawn TITLE Seven (Se7en) TITLE Harold and Maude TITLE Stripes TITLE Young Poisoner's Handbook, The TITLE Good Will Hunting TITLE Terminator 2: Judgment Day TITLE To Kill a Mockingbird TITLE Ed Wood TITLE Die xue shuang xiong (Killer, The) TITLE Jurassic Park TITLE Nikita (La Femme Nikita) TITLE Shine TITLE Usual Suspects, The TITLE Doors, The TITLE Princess Bride, The TITLE Ro

In [15]:
import random

train_dataset = []

for X1_mtitle, X2_mtitle, X1_mid, X2_mid in zip(user_dataset[0][0].values(), user_dataset[0][1].values(), user_dataset[0][2].values(), user_dataset[0][3].values()):
    neg_mid_list = random.sample(list(mid_set - set(X1_mid)), len(X1_mid))
    pos_mid_list = X1_mid
    user_interacted_movie = "[CLS] TITLE " + "TITLE " + " TITLE ".join(list(X1_mtitle)) + " [SEP]"
    for neg_mid, pos_mid in zip(neg_mid_list, pos_mid_list):
        pos_s1 = kg_triplet_dataset[pos_mid]
        neg_s1 = kg_triplet_dataset[neg_mid]
        s2 = user_interacted_movie
        print(pos_s1)
        print(neg_s1)
        print(s2)
        interaction = (pos_s1, neg_s1, s2)
        train_dataset.append(interaction)
        break

    neg_mid_list = random.sample(list(mid_set - set(X2_mid)), len(X2_mid))
    pos_mid_list = X2_mid
    user_interacted_movie = "[CLS] TITLE " + " TITLE ".join(list(X2_mtitle)) + " [SEP]"
    for neg_mid, pos_mid in zip(neg_mid_list, pos_mid_list):
        pos_s1 = kg_triplet_dataset[pos_mid]
        neg_s1 = kg_triplet_dataset[neg_mid]
        s2 = user_interacted_movie
        print(pos_s1)
        print(neg_s1)
        print(s2)
        interaction = (pos_s1, neg_s1, s2)
        train_dataset.append(interaction)
        break
    break

print(train_dataset)

[CLS] Saint, The [SEP] released year is 1990's, genres are consisted as Action Romance Thriller
[CLS] Scarlet Letter, The [SEP] released year is 1990's, genres are consisted as Drama
[CLS] TITLE TITLE Saint, The TITLE Ulee's Gold TITLE Manchurian Candidate, The TITLE Raiders of the Lost Ark TITLE Streetcar Named Desire, A TITLE Chasing Amy TITLE Star Wars TITLE L.A. Confidential TITLE City of Lost Children, The TITLE Usual Suspects, The [SEP]
[CLS] Leaving Las Vegas [SEP] released year is 1990's, genres are consisted as Drama Romance, origin country is FR, directed by Mike Figgis, acted by Nicolas Cage [SEP]
[CLS] Paris, Texas [SEP] released year is 1980's, genres are consisted as Drama, origin country is DE, written by L.M. Kit Carson, directed by Wim Wenders, acted by Nastassja Kinski [SEP]
[CLS] TITLE Leaving Las Vegas TITLE Big Night TITLE Three Colors: Red TITLE Full Monty, The TITLE Sting, The TITLE Princess Bride, The TITLE Three Colors: Blue TITLE English Patient, The TITLE Dou

In [16]:
len(user_dataset[0])

4

In [17]:
# user_dataset[0]

In [18]:
class RetrievalDataset(data.Dataset):
    """Retrieval dataset"""
    
    '''
    train dataset:
    train_dataset[idx]: 1개 pair
    pos_item_tokenized, neg_item_tokenized, user_interacted_tokenized = train_dataset[idx]

    valid/test dataset:
    valid_dataset[idx]: 1개 pair
    (pos_item_tokenized, user_interacted_tokenized), (eval_pos_item_tokenized, eval_user_interacted_tokenized) = validtrain_dataset[idx]
    '''

    def __init__(self,
                 mode,
                 user_dataset,
                 kg_triplet_dataset,
                 max_len=256,
                 size=None,
                 lm='roberta',
                 negative_sampling_raito=20):
        self.tokenizer = get_tokenizer(lm)
        self.max_len = max_len
        self.mode = mode
        self.size = size
        self.negative_sampling_raito = negative_sampling_raito
        self.user_dataset = user_dataset
        self.kg_triplet_dataset = kg_triplet_dataset
        self.num_item = len(kg_triplet_dataset)

        if mode == 'train':
            self.pairs = self.set_dataset()
        else:
            self.pairs, self.eval_pairs = self.set_dataset()


    def __len__(self):
        """Return the size of the dataset."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return a tokenized item of the dataset.

        Args:
            idx (int): the index of the item

        Returns:
            List of int: token ID's of the pos triplet
            List of int: token ID's of the neg triplet
            List of int: token ID's of the user interacted movies
        """
        if self.mode == 'train':
            user_interacted = self.pairs[idx][0]
            pos_item = self.pairs[idx][1]
            neg_item = self.pairs[idx][2]
        
            user_interacted_tokenized = self.tokenizer.encode(text=user_interacted,
                                    max_length=self.max_len,
                                    truncation=True)
            pos_item_tokenized = self.tokenizer.encode(text=pos_item,
                                    max_length=self.max_len,
                                    truncation=True)
            neg_item_tokenized = self.tokenizer.encode(text=neg_item,
                                    max_length=self.max_len,
                                    truncation=True)

            return (user_interacted_tokenized, pos_item_tokenized, neg_item_tokenized)
        
        else:
            user_interacted = self.pairs[idx][0]
            pos_item = self.pairs[idx][1]
            pos_item_id = self.pairs[idx][2]
            user_interacted_tokenized = self.tokenizer.encode(text=user_interacted,
                                    max_length=self.max_len,
                                    truncation=True)
            pos_item_tokenized = self.tokenizer.encode(text=pos_item,
                                    max_length=self.max_len,
                                    truncation=True)
            
            eval_user_interacted = self.eval_pairs[idx][0]
            eval_pos_item = self.eval_pairs[idx][1]
            eval_pos_item_id = self.eval_pairs[idx][2]
            eval_user_interacted_tokenized = self.tokenizer.encode(text=eval_user_interacted,
                                    max_length=self.max_len,
                                    truncation=True)
            eval_pos_item_tokenized = self.tokenizer.encode(text=eval_pos_item,
                                    max_length=self.max_len,
                                    truncation=True)

            return (user_interacted_tokenized, pos_item_tokenized, pos_item_id, eval_user_interacted_tokenized, eval_pos_item_tokenized, eval_pos_item_id)


    def set_dataset(self):
        import random

        idx_dict = {'train':0, 'valid':1, 'test':2}
        mode_idx = idx_dict[self.mode]
        dataset = []
        
        if self.mode != "train":
            eval_dataset = []

        for X1_mtitle, X2_mtitle, X1_mid, X2_mid in zip(self.user_dataset[mode_idx][0].values(), self.user_dataset[mode_idx][1].values(), self.user_dataset[mode_idx][2].values(), self.user_dataset[mode_idx][3].values()):
            if self.mode == 'train':
                num_neg_sample = min(len(X2_mid) * self.negative_sampling_raito, self.num_item - len(X2_mid) - 100)
            else:
                num_neg_sample = len(X1_mid)
            neg_mid_list = random.sample(list(mid_set - set(X1_mid)), num_neg_sample)
            pos_mid_list = []
            if self.mode == 'train':
                for _ in range(self.negative_sampling_raito):
                    pos_mid_list += X1_mid
            else:
                pos_mid_list = X1_mid
            user_interacted_movie = "[CLS] TITLE " + "TITLE " + " TITLE ".join(list(X1_mtitle)) + " [SEP]"
            print(user_interacted_movie)
            for neg_mid, pos_mid in zip(neg_mid_list, pos_mid_list):
                s1 = user_interacted_movie
                pos_s2 = self.kg_triplet_dataset[pos_mid]
                neg_s2 = self.kg_triplet_dataset[neg_mid]
                
                if self.mode == 'train':
                    interaction = (s1, pos_s2, neg_s2)
                elif (self.mode == "valid" or self.mode == "test"):
                    interaction = (s1, pos_s2, pos_mid)
                dataset.append(interaction)
            
            user_interacted_movie = "[CLS] TITLE " + "TITLE " + " TITLE ".join(list(X2_mtitle)) + " [SEP]"
            print(user_interacted_movie)
            print("*" * 100)
            if self.mode == 'train':
                # 나머지 절반도 train에 함께 사용
                num_neg_sample = min(len(X2_mid) * self.negative_sampling_raito, self.num_item - len(X2_mid) - 100)
                neg_mid_list = random.sample(list(mid_set - set(X2_mid)), num_neg_sample)
                pos_mid_list = []
                for _ in range(self.negative_sampling_raito):
                    pos_mid_list += X2_mid
                for neg_mid, pos_mid in zip(neg_mid_list, pos_mid_list):
                    s1 = user_interacted_movie
                    pos_s2 = self.kg_triplet_dataset[pos_mid]
                    neg_s2 = self.kg_triplet_dataset[neg_mid]
                    
                    interaction = (s1, pos_s2, neg_s2)
                    dataset.append(interaction)
                return dataset
            
            elif (self.mode == "valid" or self.mode == "test"):
                # 절반은 faiss input으로, 나머지 절반은 eval을 위해 사용
                pos_mid_list = X2_mid
                for neg_mid, pos_mid in zip(neg_mid_list, pos_mid_list):
                    s1 = user_interacted_movie
                    pos_s2 = self.kg_triplet_dataset[pos_mid]
                    
                    interaction = (s1, pos_s2, pos_mid)
                    eval_dataset.append(interaction)
                return dataset, eval_dataset
        
            
    @staticmethod
    def pad(batch):
        """Merge a list of dataset items into a train/test batch
        Args:
            batch (list of tuple): a list of dataset items

        Returns:
            LongTensor: x1 of shape (batch_size, seq_len)
            LongTensor: x2 of shape (batch_size, seq_len).
                        Elements of x1 and x2 are padded to the same length
            LongTensor: a batch of labels, (batch_size,)
        """
        # if len(batch) == 3:
        # TODO: try-except 말고 if문으로 처리하기
        try:
            u, x1, x2 = zip(*batch)

            maxlen = max([len(x) for x in u + x1 + x2])
            u = [xi + [0]*(maxlen - len(xi)) for xi in u]
            x1 = [xi + [0]*(maxlen - len(xi)) for xi in x1]
            x2 = [xi + [0]*(maxlen - len(xi)) for xi in x2]
            return torch.LongTensor(u), \
                    torch.LongTensor(x1), \
                    torch.LongTensor(x2)
        # else: # for valid/test
        except:
            input_u, input_x1, input_id, eval_u, eval_x1, eval_id = zip(*batch)

            input_maxlen = max([len(x) for x in input_u + input_x1])
            input_u = [xi + [0]*(input_maxlen - len(xi)) for xi in input_u]
            input_x1 = [xi + [0]*(input_maxlen - len(xi)) for xi in input_x1]
            
            eval_maxlen = max([len(x) for x in eval_u + eval_x1])
            eval_u = [xi + [0]*(eval_maxlen - len(xi)) for xi in eval_u]
            eval_x1 = [xi + [0]*(eval_maxlen - len(xi)) for xi in eval_x1]
            return torch.LongTensor(input_u), \
                   torch.LongTensor(input_x1), \
                   input_id, \
                   torch.LongTensor(eval_u), \
                   torch.LongTensor(eval_x1), \
                   eval_id

In [19]:
triplet_encoding = {mid:idx for idx, mid in zip(list(range(len(kg_triplet_dataset.keys()))), list(kg_triplet_dataset.keys()))}
triplet_decoding = {idx:mid for idx, mid in zip(list(range(len(kg_triplet_dataset.keys()))), list(kg_triplet_dataset.keys()))}

In [20]:
list(kg_triplet_dataset.keys())[-1]

1674

In [21]:
class TripletDataset(data.Dataset):
    """Triplet dataset"""

    def __init__(self,
                 kg_triplet_dataset,
                 max_len=256,
                 size=None,
                 lm='roberta',):
        self.tokenizer = get_tokenizer(lm)
        self.max_len = max_len
        self.size = size
        self.kg_triplet_dataset = kg_triplet_dataset
        self.num_item = len(kg_triplet_dataset)
        self.triplets = self.set_dataset()


    def __len__(self):
        """Return the size of the dataset."""
        return len(self.triplets)

    def __getitem__(self, idx):
        """Return a tokenized item of the dataset.

        Args:
            idx (int): the index of the item

        Returns:
            List of int: token ID's of the pos triplet
            List of int: token ID's of the neg triplet
            List of int: token ID's of the user interacted movies
        """
        triplet = self.triplets[idx][0]
        item_id = self.triplets[idx][2]
        triplet_tokenized = self.tokenizer.encode(text=triplet,
                                    max_length=self.max_len,
                                    truncation=True)
        return (None, triplet_tokenized, item_id)
        
    def set_dataset(self):
        triplets = []
        for mid, triplet in self.kg_triplet_dataset.items():
            triplets.append((triplet, None, mid))
        return triplets
            
    @staticmethod
    def pad_(batch):
        """Merge a list of dataset items into a train/test batch
        Args:
            batch (list of tuple): a list of dataset items

        Returns:
            LongTensor: x1 of shape (batch_size, seq_len)
            LongTensor: x2 of shape (batch_size, seq_len).
                        Elements of x1 and x2 are padded to the same length
            LongTensor: a batch of labels, (batch_size,)
        """
        _, t, item_id = zip(*batch)

        maxlen = max([len(x) for x in t])
        t = [xi + [0]*(maxlen - len(xi)) for xi in t]
        return torch.LongTensor(t), \
               item_id

In [22]:
# load train/dev/test sets
train_dataset = RetrievalDataset('train',
                                user_dataset=user_dataset,
                                kg_triplet_dataset=kg_triplet_dataset)
valid_dataset = RetrievalDataset('valid',
                                user_dataset=user_dataset,
                                kg_triplet_dataset=kg_triplet_dataset)
test_dataset = RetrievalDataset('test',
                                user_dataset=user_dataset,
                                kg_triplet_dataset=kg_triplet_dataset)
triplet_dataset = TripletDataset(kg_triplet_dataset=kg_triplet_dataset)

[CLS] TITLE TITLE Saint, The TITLE Ulee's Gold TITLE Manchurian Candidate, The TITLE Raiders of the Lost Ark TITLE Streetcar Named Desire, A TITLE Chasing Amy TITLE Star Wars TITLE L.A. Confidential TITLE City of Lost Children, The TITLE Usual Suspects, The [SEP]
[CLS] TITLE TITLE Leaving Las Vegas TITLE Big Night TITLE Three Colors: Red TITLE Full Monty, The TITLE Sting, The TITLE Princess Bride, The TITLE Three Colors: Blue TITLE English Patient, The TITLE Double vie de Véronique, La (Double Life of Veronique, The) TITLE Butch Cassidy and the Sundance Kid [SEP]
****************************************************************************************************
[CLS] TITLE TITLE Monty Python and the Holy Grail TITLE Maverick TITLE Sleepless in Seattle TITLE Powder TITLE 12 Angry Men TITLE Truth About Cats & Dogs, The TITLE Shawshank Redemption, The TITLE Lost World: Jurassic Park, The TITLE Silence of the Lambs, The TITLE Jerry Maguire TITLE Matilda TITLE Mr. Holland's Opus TITLE Walk

In [23]:
len(train_dataset[0][2])

28

In [24]:
padder = train_dataset.pad
padder_ = triplet_dataset.pad_
batch_size = 32
n_epochs = 200

In [25]:
train_dataset

<__main__.RetrievalDataset at 0x7f86b41e66d0>

In [26]:
train_dataloader = data.DataLoader(dataset=train_dataset,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=0,
                                collate_fn=padder)
valid_dataloader = data.DataLoader(dataset=valid_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=padder)
test_dataloader = data.DataLoader(dataset=test_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=padder)
triplet_dataloader = data.DataLoader(dataset=triplet_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=padder_)

In [27]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [28]:
torch.nn.Linear(43, 1).weight.dtype

torch.float32

In [29]:
lm_name = 'roberta'

if lm_name in lm_mp:
    lm = AutoModel.from_pretrained(lm_mp[lm_name])
else:
    lm = AutoModel.from_pretrained(lm_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
class RetrievalModel(nn.Module):
    """A baseline model for Retrieval."""
    
    def __init__(self, lm, device='cuda', method='bpr'):
        super().__init__()
        
        if method not in ['bpr', 'siamses']:
            raise NotImplementedError("Method not implemented.")

        self.device = device
        self.model = lm.to(self.device)
        self.method = method
        
        # linear layer
        hidden_size = self.model.config.hidden_size
        self.fc_u = torch.nn.Linear(hidden_size, 32)
        self.fc_pos = torch.nn.Linear(hidden_size, 32)
        self.fc_neg = torch.nn.Linear(hidden_size, 32)

    def forward(self, u, pos, neg=None):
        """Encode the left, right, and the concatenation of left + right.

        Args:
            x1 (LongTensor): a batch of ID's
            x2 (LongTensor, optional): a batch of ID's (augmented)

        Returns:
            Tensor: binary prediction
        """
        if self.method == 'bpr' or self.method == 'siamses':
            u = u.to(self.device)
            u_embedding = self.model(u)[0][:, 0, :]
            u_embedding = self.fc_u(u_embedding)
            
            pos = pos.to(self.device)
            pos_embedding = self.model(pos)[0][:, 0, :]
            pos_embedding = self.fc_pos(pos_embedding)
            
            if neg is not None:
                neg = neg.to(self.device)
                neg_embedding = self.model(neg)[0][:, 0, :]
                neg_embedding = self.fc_neg(neg_embedding)
                
        # elif self.method == 'siamses':
        #     u = u.to(self.device)
        #     u_embedding = self.model(u)[0][:, 0, :]
            
        #     pos = pos.to(self.device)
        #     pos_embedding = self.model(pos)[0][:, 0, :]
            
        #     if neg is not None:
        #         neg = neg.to(self.device)
        #         neg_embedding = self.model(neg)[0][:, 0, :]
                
        else:
            raise NotImplementedError("Method not implemented.")
        
        return (u_embedding, pos_embedding, neg_embedding) if neg is not None else (u_embedding, pos_embedding)

In [31]:
class ContrastiveLoss(nn.Module):
    """A baseline model for Retrieval."""
    
    def __init__(self, device='cuda', method='bpr', margin=2.0):
        super().__init__()
        self.method = method
        self.margin = margin

    def forward(self, u, pos, neg):
        """Encode the left, right, and the concatenation of left + right.

        Args:
            x1 (LongTensor): a batch of ID's
            x2 (LongTensor, optional): a batch of ID's (augmented)

        Returns:
            Tensor: binary prediction
        """
        if self.method == 'bpr':
            pos_scores = torch.sum(torch.mul(u, pos), dim=1)
            neg_scores = torch.sum(torch.mul(u, neg), dim=1)

            maxi = F.logsigmoid(pos_scores - neg_scores)
            loss = -torch.mean(maxi)

            return loss

        elif self.method == 'siamses':
            pos_euclidean_distance = F.pairwise_distance(u, pos, keepdim = True)
            neg_euclidean_distance = F.pairwise_distance(u, neg, keepdim = True)
            
            pos_loss = torch.mean(torch.pow(torch.clamp(self.margin - pos_euclidean_distance, min=0.0), 2))
            neg_loss = torch.mean(torch.pow(neg_euclidean_distance, 2))
            
            loss = torch.mean(pos_loss + neg_loss)

            return loss

In [32]:
# model = RetrievalModel(method='siamses')
# model = model.cuda()
# optimizer = AdamW(model.parameters(), lr=2e-5)

# num_steps = (len(train_dataset) // batch_size) * n_epochs
# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps=0,
#                                             num_training_steps=num_steps)

# loss_func = ContrastiveLoss(method='siamses')

In [50]:
method='siamses'
# method='bpr'

In [51]:
model = RetrievalModel(lm, method=method).cuda()
optimizer = AdamW(model.parameters(), lr=2e-7)

num_steps = (len(train_dataset) // batch_size) * n_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_steps)

criterion = ContrastiveLoss(method=method)



In [52]:
def train(model, loss_func, optimizer, train_dataloader):
    model.train()
    criterion = loss_func
    
    for epoch in range(n_epochs + 1):
        total_loss = []
        for i, batch in enumerate(train_dataloader):
            embeddings = model(*batch)
            u_embedding, pos_embedding, neg_embedding = embeddings
            loss = criterion(u_embedding, pos_embedding, neg_embedding)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss.append(loss.item())
        if epoch % 5 == 0:
            mean_loss = sum(total_loss) / len(total_loss)
            print(f"e_{epoch}_loss_{mean_loss}")
            torch.save(model.state_dict(), f"./models/e_{epoch}_loss_{mean_loss}_m_{method}_retrieval.pt")

In [53]:
train(model, criterion, optimizer, train_dataloader)

e_0_loss_3.627875804901123
e_5_loss_1.5681993044339693
e_10_loss_0.8833112991773165
e_15_loss_0.8025010594954858
e_20_loss_0.7817134857177734
e_25_loss_0.5641518487380102
e_30_loss_0.5016957819461823
e_35_loss_0.4145242205032936
e_40_loss_0.35355111727347743
e_45_loss_0.30279651054969203
e_50_loss_0.3316923196499164
e_55_loss_0.2821713605752358
e_60_loss_0.2730407634606728
e_65_loss_0.2522696726597272
e_70_loss_0.23732612568598527
e_75_loss_0.2114024471778136
e_80_loss_0.22262986577474153
e_85_loss_0.20590451130500206
e_90_loss_0.1758362576365471
e_95_loss_0.1726947965530249
e_100_loss_0.160911704485233
e_105_loss_0.15755002544476435
e_110_loss_0.14971904571239764
e_115_loss_0.16033542385468116
e_120_loss_0.13447520709954774
e_125_loss_0.1398966283752368
e_130_loss_0.13954423711850092
e_135_loss_0.13122025304115736
e_140_loss_0.1289894368786078
e_145_loss_0.12951180109610924
e_150_loss_0.13249714787189776
e_155_loss_0.12929004086897924
e_160_loss_0.11954429688361976
e_165_loss_0.123220

In [42]:
# for epoch in range(n_epochs + 1):
#     for i, batch in enumerate(train_dataloader):
#         u, pos, neg = batch
#         u_embedding, pos_embedding, neg_embedding = model(*batch)
#         loss = criterion(u_embedding, pos_embedding, neg_embedding)
#         loss.backward()
#         optimizer.step()
#         scheduler.step()

In [43]:
# u_embedding, pos_embedding, neg_embedding = model(*batch)

In [44]:
# model.load_state_dict(torch.load('./models/e_125_loss_2.0084793329238892_m_siamses_retrieval.pt', map_location='cuda'))

In [45]:
# for batch in triplet_dataloader:
#     triplet = batch
#     # TODO model 코드 변경해서 user emb 만들 때 None 입력될 수 있게 하자.
#     embeddings = model(triplet[0], triplet[0])
#     _, triplet_embedding = embeddings
#     index = faiss.IndexFlatL2(triplet_embedding.shape[1])
#     break

In [46]:
# for batch in triplet_dataloader:
#     triplet, item_id = batch
#     # TODO model 코드 변경해서 user emb 만들 때 None 입력될 수 있게 하자.
#     embeddings = model(triplet, triplet)
#     _, triplet_embedding = embeddings
#     index.add(triplet_embedding.cpu().detach().numpy())

In [54]:
import itertools

path = './models/'
file_list = sorted(os.listdir(path))

for file_name in file_list:
    if not file_name.endswith('.pt'):
        continue
    else:
        print(file_name + "\n\n")
        model.load_state_dict(torch.load(f'./models/{file_name}', map_location='cuda'))

        for batch in triplet_dataloader:
            triplet = batch
            # TODO model 코드 변경해서 user emb 만들 때 None 입력될 수 있게 하자.
            embeddings = model(triplet[0], triplet[0])
            _, triplet_embedding = embeddings
            index = faiss.IndexFlatL2(triplet_embedding.shape[1])
            break

        for batch in triplet_dataloader:
            triplet, item_id = batch
            # TODO model 코드 변경해서 user emb 만들 때 None 입력될 수 있게 하자.
            embeddings = model(triplet, triplet)
            _, triplet_embedding = embeddings
            index.add(triplet_embedding.cpu().detach().numpy())

        top_k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        # top_k = min((len(input_pos) + len(eval_pos)) * 10, index.ntotal)
        tmp = 0
        for top_k in top_k_list:
            # 각 배치마다 유저 1명의 batch_size명 존재
            print(f"{top_k=}" + "*" * 100)
            for i, batch in enumerate(itertools.chain(valid_dataloader, test_dataloader)):
                input_u, input_pos, input_id, eval_u, eval_pos, eval_id = batch
                # print(input_id)
                # print(eval_id)
                embeddings = model(input_u, input_pos)
                u_embedding, pos_embedding = embeddings
                distances, indices = index.search(u_embedding[0, :].unsqueeze(dim=0).cpu().detach().numpy(), top_k)
                # print((indices))
                hit = 0
                for idx in indices[0]:
                    # triplet_decoding[idx] -> faiss로 retrieval된 item_id(실제 movielens movie id)
                    if triplet_decoding[idx] in list(eval_id):
                        print(triplet_decoding[idx])
                        print(kg_triplet_dataset[triplet_decoding[idx]])
                        hit += 1
                print(f"hit@{top_k} = {hit / top_k}")
                tmp += hit / top_k
                break
        print(tmp / len(top_k_list))

e_0_loss_3.627875804901123_m_siamses_retrieval.pt


top_k=10****************************************************************************************************
hit@10 = 0.0
top_k=20****************************************************************************************************
hit@20 = 0.0
top_k=30****************************************************************************************************
hit@30 = 0.0
top_k=40****************************************************************************************************
hit@40 = 0.0
top_k=50****************************************************************************************************
hit@50 = 0.0
top_k=60****************************************************************************************************
hit@60 = 0.0
top_k=70****************************************************************************************************
hit@70 = 0.0
top_k=80**************************************************************************************