In [3]:
import os
import re
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

In [4]:
import sys
sys.path.append("../scripts/")

In [6]:
T.manual_seed(3007)
T.cuda.manual_seed(3007)

In [7]:
from bert4rec_model import RecommendationTransformer

In [8]:
# Hyperparameters handed to RecommendationTransformer below.
# NOTE(review): these presumably have to match the checkpoint loaded later in
# this notebook (load_state_dict is called on this instance) — confirm against
# the training configuration.
VOCAB_SIZE = 59049  # passed as vocab_size; the printed model shows Embedding(59049, 256)
heads = 4           # passed as heads
layers = 6          # passed as layers
emb_dim = 256       # passed as emb_dim; embedding width in the printed model
pad_id = 0          # passed as pad_id; the printed embedding reports padding_idx=0
# Passed as num_pos. NOTE(review): a later cell rebinds the name `history` to the
# list returned by seqPrediction, so re-running cells out of order changes its type.
history = 120

# Build the (untrained) network; its weights are filled in by a later load_state_dict call.
model = RecommendationTransformer(vocab_size=VOCAB_SIZE,
                                  heads=heads,
                                  layers=layers,
                                  emb_dim=emb_dim,
                                  pad_id=pad_id,
                                  num_pos=history)

In [10]:
# Read the checkpoint onto the CPU and keep only its "state_dict" entry.
# NOTE(review): T.load unpickles the file, which can execute arbitrary code —
# only load checkpoints that come from a trusted source.
model_dict = T.load("../models/rec-transformer-model-2/model_files/bert4rec-state-dict.pth", map_location="cpu")["state_dict"]

In [11]:
model.load_state_dict(model_dict)

<All keys matched successfully>

In [12]:
model.eval()

RecommendationTransformer(
  (encoder): Encoder(
    (word_embedding): Embedding(59049, 256, padding_idx=0)
    (position_encoding): PositionalEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layer_stack): ModuleList(
      (0): EncoderLayer(
        (self_attention): MultiHeadAttention(
          (toquery): Linear(in_features=256, out_features=1024, bias=False)
          (tokey): Linear(in_features=256, out_features=1024, bias=False)
          (tovalue): Linear(in_features=256, out_features=1024, bias=False)
          (union): Linear(in_features=1024, out_features=256, bias=False)
          (attention): ScaledDotProductAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
        )
        (pos_ff): PositionWiseFeedForward(
          (feedforward_1): Linear(in_features=256, out_features=1024, bias=True)
          (feedforwar

In [13]:
valid_df = pd.read_csv("../data/ml-25m/ml-25m/ratings_mapped.csv")

In [14]:
valid_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieId_mapped
0,2262,21,3.0,789652009,22
1,2262,1079,3.0,789652009,1054
2,2262,47,5.0,789652009,48
3,102689,1,4.0,822873600,2
4,102689,39,5.0,822873600,40


In [16]:
valid_df_grouped = valid_df.groupby(by = "userId")

In [17]:
valid_df_groups = list(valid_df_grouped.groups)

In [18]:
len(valid_df_groups)

162541

In [20]:
type(valid_df_groups)

list

In [24]:
items_df = valid_df_grouped.get_group(valid_df_groups[0])

In [25]:
end_ix = items_df.shape[0]

In [26]:
end_ix

70

In [27]:
# Start of the window holding (at most) the last 5 rows of this user's group.
# Derived from end_ix instead of repeating its value as a literal, so the cell
# stays correct when a different user group is selected.
start_ix = max(0, end_ix - 5)

In [28]:
start_ix

65

In [29]:
sequence = items_df[start_ix:end_ix]

In [30]:
sequence

Unnamed: 0,userId,movieId,rating,timestamp,movieId_mapped
10979350,1,27193,3.0,1147879774,9122
10979351,1,5684,2.0,1147879797,5575
10979352,1,7318,2.0,1147879850,7195
10979353,1,296,5.0,1147880044,294
10979354,1,7361,5.0,1147880055,7238


In [31]:
sequence.movieId_mapped.tolist()

[9122, 5575, 7195, 294, 7238]

In [32]:
inp_tnsr = T.LongTensor(sequence.movieId_mapped.tolist()).unsqueeze(0)

In [48]:
t = T.ones((10), dtype=T.long)
t[:inp_tnsr.size(1)] = inp_tnsr.squeeze(0)

In [49]:
t

tensor([9122, 5575, 7195,  294, 7238,    1,    1,    1,    1,    1])

In [50]:
inp_tnsr.size()

torch.Size([1, 5])

In [56]:
# Inference only: run the forward pass without autograd so no graph is recorded.
with T.no_grad():
    op = model(t.unsqueeze(0), None)

In [57]:
op.size()

torch.Size([1, 59049, 10])

In [58]:
op.max(1)[-1]

tensor([[9122, 5575, 7195,  294, 7238, 4889, 4889, 4889, 4889, 4889]])

In [59]:
inp_tnsr

tensor([[9122, 5575, 7195,  294, 7238]])

In [76]:
l = [1, 2, 3, 4, 5]

In [78]:
l[::-1][:3][::-1]

[3, 4, 5]

In [121]:
def seqPrediction(input_sequence, num_predicts = 10, max_len = 120, placeholder_id = 1, net = None):
    """Greedily extend an item-id sequence one prediction at a time.

    On every step the current sequence is followed by a single placeholder
    token, the network is run on that input, and the highest-scoring id at the
    last (placeholder) position is appended to the sequence.

    Args:
        input_sequence: iterable of integer item ids to start from. It is
            copied, so the caller's object is never modified.
        num_predicts: number of ids to generate.
        max_len: longest input (sequence + placeholder) fed to the network.
            Once the sequence holds more than ``max_len - 1`` ids, only the
            most recent ``max_len - 1`` are kept before predicting.
        placeholder_id: id written into the trailing slot whose prediction is
            read (presumably the mask token used in training — TODO confirm).
        net: callable ``net(tokens, mask)``; defaults to the notebook-level
            ``model``. Its output is assumed to be shaped
            (batch, vocab, seq_len), as the ``op.size()`` cell in this
            notebook shows.

    Returns:
        (sequence, history): ``sequence`` is the final working list (at most
        ``max_len`` ids). ``history`` receives a full copy of the working
        sequence every time it is trimmed.

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")
    if net is None:
        net = model  # fall back to the model defined at notebook level

    keep = max_len - 1
    working = list(input_sequence)  # private copy: never mutate the caller's list
    history = []
    produced = 0

    # Pure inference: skip autograd bookkeeping for every forward pass.
    with T.no_grad():
        while produced < num_predicts:
            if len(working) > keep:
                # NOTE(review): the whole window is stored on every trim, so
                # consecutive trims add heavily overlapping copies to history.
                # Kept as-is because later cells read `history`; confirm intent.
                history.extend(working)
                working = working[len(working) - keep:]

            # Sequence followed by one placeholder slot to be predicted.
            inp_tnsr = T.full((len(working) + 1,), placeholder_id, dtype=T.long)
            inp_tnsr[:len(working)] = T.tensor(working, dtype=T.long)

            op = net(inp_tnsr.unsqueeze(0), None)
            _, best_ids = op.max(1)  # arg-max over the vocab axis, per position
            working.append(best_ids.flatten()[-1].item())
            produced += 1

    return working, history



In [122]:
target_seq, history = seqPrediction(sequence.movieId_mapped.tolist(), 200)

In [123]:
# Disabled (commented-out) inline variant of the loop inside seqPrediction.
# predict_hist = 1
# valid_history = 10
# target_seq = sequence.movieId_mapped.tolist()
# while predict_hist < valid_history:
#     print("PHIST: ", predict_hist)
#     inp_seq = T.LongTensor(target_seq)
#     inp_tnsr = T.ones((len(target_seq) + 1), dtype=T.long)
#     inp_tnsr[:inp_seq.size(0)] = inp_seq
#     op = model(inp_tnsr.unsqueeze(0), None)
#     _, pred = op.max(1)
#     pred = pred.flatten().tolist()[-1]
#     target_seq.append(pred)
#     predict_hist += 1

In [124]:
len(set(target_seq))

14

In [125]:
len(set(history))

27

In [86]:
movies_df = pd.read_csv("../data/ml-25m/ml-25m/movies_mapped.csv")

In [126]:
movies_df.head()

Unnamed: 0,movieId,title,genres,movieId_mapped
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.0
2,3,Grumpier Old Men (1995),Comedy|Romance,4.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,5.0
4,5,Father of the Bride Part II (1995),Comedy,6.0


In [127]:
# target_seq

In [128]:
len(target_seq)

120

In [129]:
len(set(target_seq))

14

In [132]:
movies_df[movies_df.movieId_mapped.isin(history)]

Unnamed: 0,movieId,title,genres,movieId_mapped
108,110,Braveheart (1995),Action|Drama|War,110.0
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,259.0
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,294.0
314,318,"Shawshank Redemption, The (1994)",Crime|Drama,316.0
351,356,Forrest Gump (1994),Comedy|Drama|Romance|War,353.0
452,457,"Fugitive, The (1993)",Thriller,454.0
522,527,Schindler's List (1993),Drama|War,524.0
585,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,587.0
600,608,Fargo (1996),Comedy|Crime|Drama|Thriller,602.0
840,858,"Godfather, The (1972)",Crime|Drama,842.0


In [73]:
movies_df[movies_df.movieId == 209163]

Unnamed: 0,movieId,title,genres,movieId_mapped
62420,209163,Bad Poems (2018),Comedy|Drama,59046.0


In [75]:
movies_df[movies_df.movieId == 1]

Unnamed: 0,movieId,title,genres,movieId_mapped
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0


In [88]:
movies_df.tail()

Unnamed: 0,movieId,title,genres,movieId_mapped
62418,209157,We (2018),Drama,59044.0
62419,209159,Window of the Soul (2001),Documentary,59045.0
62420,209163,Bad Poems (2018),Comedy|Drama,59046.0
62421,209169,A Girl Thing (2001),(no genres listed),59047.0
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,59048.0


In [92]:
movies_df[movies_df.title.str.contains("2022")]

Unnamed: 0,movieId,title,genres,movieId_mapped
49449,178907,Blade Runner: Black Out 2022 (2017),Action|Animation|Sci-Fi,46524.0


In [135]:
pre_input = [1, 1, 1, 1]+sequence.movieId_mapped.tolist()

In [136]:
pre_input

[1, 1, 1, 1, 9122, 5575, 7195, 294, 7238]

In [143]:
op_pre = model(T.LongTensor(pre_input).unsqueeze(0), None)
_, pred_pre = op_pre.max(1)
pred_pre

tensor([[4889, 4889, 4889, 4889, 9122, 5575, 7195,  294, 7238]])

In [144]:
op_pre = model(T.LongTensor([1, 4889, 9122, 5575, 7195,  294, 7238]).unsqueeze(0), None)
_, pred_pre = op_pre.max(1)
pred_pre

tensor([[4869, 4889, 9122, 5575, 7195,  294, 7238]])

In [153]:
op_pre = model(T.LongTensor([4869, 4889, 9122, 5575, 7195,  294, 7238, 1]).unsqueeze(0), None)
op_pre = op_pre / 0.6
_, pred_pre = op_pre.max(1)
pred_pre

tensor([[4869, 4889, 9122, 5575, 7195,  294, 7238,  316]])

In [151]:
op_pre.topk(4, 1)

torch.return_types.topk(
values=tensor([[[31.7882, 36.6945, 33.3723, 33.5053, 30.2010, 38.8856, 32.2080,
           5.4894],
         [ 9.3825,  7.3324, 12.6242, 13.6178, 11.7001, 10.6517,  9.1645,
           5.1213],
         [ 9.3420,  7.1848, 12.4436, 13.5638,  8.9342,  9.6163,  9.0584,
           5.1149],
         [ 8.3225,  6.1698, 12.0261, 12.8433,  8.5681,  7.6839,  9.0119,
           4.8859]]], grad_fn=<TopkBackward>),
indices=tensor([[[ 4869,  4889,  9122,  5575,  7195,   294,  7238,   316],
         [ 1561,  1177, 10761, 14624,  7237,   524,  7030,  2768],
         [  587,  5271, 26421,   811,  1371,    33,   353,  7030],
         [   35,  1925,  3158,   114,  3162,   652,   316,  2869]]]))

In [156]:
tt = T.LongTensor([12, 2, 3, 4, 5])

In [157]:
tt

tensor([12,  2,  3,  4,  5])

In [160]:
tnsr = T.ones((tt.size(0) + 1), dtype=T.long)

In [161]:
tnsr

tensor([1, 1, 1, 1, 1, 1])

In [176]:
tnsr[1:] = tt

In [177]:
tnsr

tensor([ 1, 12,  2,  3,  4,  5])

In [178]:
import random

In [191]:
bool(random.choice([0, 1]))

True