In [None]:
! pip install -q tensorflow-recommenders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pprint

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from collections import defaultdict
from tqdm import tqdm
import random

In [None]:
# Directory holding the raw MovieLens CSVs (rating.csv, movie.csv); the
# preprocessed rating_filtered / train / val / test CSVs are written here too.
folder_path = '/content/drive/MyDrive/Movie Recommendations with Movielens/data/'

In [None]:
# --- Data / input pipeline ---
# Maximum number of past movies kept in each history; histories are padded to this width.
maxlen = 50
# Number of rows per training batch.
batch_size = 4096
# Size of the movie-id and positional embedding vectors.
embedding_dimension = 64

# --- Self-attention encoder ---
# Number of heads in each MultiHeadAttention layer.
num_heads = 1
# Width of the hidden Dense layer inside each feed-forward block.
ffn_hidden_unit = 64
# Dropout rate used in the attention and feed-forward blocks.
dropout = 0.5
# NOTE(review): not referenced by any code visible in this notebook.
use_causal_mask = False
# Number of (attention block, feed-forward block) pairs stacked in the query model.
blocks = 2
# Learning rate passed to the Adam optimizer.
learning_rate = 0.001
# Upper bound on the number of training epochs passed to model.fit.
epoch = 100

# Validation and test batches contain negative_example + 1 rows.
negative_example = 100


In this project, I implement the model proposed in the paper [Self-Attentive Sequential Recommendation](https://arxiv.org/pdf/1808.09781.pdf).

The dataset is collected from the movie-recommendation service MovieLens. Created by 138,493 users, the Movielens data set includes over 20 million ratings and 460,000+ tags for 27,278 movies.

Kaggle data set: [MovieLens 20M Dataset](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?resource=download)


# 1. Data Preprocess


### 1.1 Filter
I only include movies with at least 5 ratings (watches). I also only include records with rating >= 2; in other words, I only keep the watch history that the user enjoyed.

We end up with about 18M records.

In [None]:
# Load the raw MovieLens ratings (all columns). The path is built from
# `folder_path` so the data location is configured in a single place.
ratings = pd.read_csv(folder_path + 'rating.csv')

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Attach to every rating row the number of ratings its movie received.
ratings = ratings.assign(
    movie_count=ratings.groupby('movieId')['movieId'].transform('count')
)
# Keep movies with at least 5 ratings and only records with rating >= 2,
# in other words only the watch history that the user enjoyed.
keep_row = (ratings['movie_count'] >= 5) & (ratings['rating'] >= 2)
# Order each user's records chronologically.
ratings = ratings.loc[keep_row].sort_values(by=['userId', 'timestamp'])

In [None]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18786848 entries, 20 to 19999916
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   timestamp    object 
 4   movie_count  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 860.0+ MB


In [None]:
# Persist the filtered, sorted ratings so the split step can start from this file.
ratings.to_csv(f'{folder_path}rating_filtered.csv', index=False)

### 1.2 Train and Test Split
We split the historical sequence for each user into three parts:
1.   the most recent action for testing
2.   the second most recent action for validation
3.   all remaining actions for training



In [None]:
# Reload only the two id columns, both read as strings.
ratings = pd.read_csv(
    f'{folder_path}rating_filtered.csv',
    usecols=['userId', 'movieId'],
    dtype={'userId': str, 'movieId': str},
)

In [None]:
# Column-oriented containers for the three splits; each one collects a 'hist'
# list (past movie ids) and a 'pos_id' list (the movie watched next).
train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)

In [None]:
# (Re-)create the containers in this cell so that running it a second time does
# not append a duplicate copy of every example to the lists of a previous run.
train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)

for userId, movie_ids in tqdm(ratings.groupby('userId')['movieId']):
    pos_list = movie_ids.tolist()
    last = len(pos_list) - 1

    # Every position i >= 1 yields one example: the (at most `maxlen`) movies
    # watched before i form the history, the movie at i is the target.
    for i in range(1, len(pos_list)):
        if i == last:
            split = test_data       # most recent action
        elif i == last - 1:
            split = val_data        # second most recent action
        else:
            split = train_data      # everything earlier
        split['hist'].append(pos_list[max(0, i - maxlen):i])
        split['pos_id'].append(pos_list[i])




100%|██████████| 138469/138469 [01:23<00:00, 1659.09it/s]


In [None]:
print(train_data['hist'][0:5])
print(train_data['pos_id'][:5])

[['924'], ['924', '919'], ['924', '919', '2683'], ['924', '919', '2683', '1584'], ['924', '919', '2683', '1584', '1079']]
['919', '2683', '1584', '1079', '653']


In [None]:
# Store each history as one comma-separated string; '|' separates the columns
# so the commas inside `hist` do not clash with the field delimiter.
test_df = pd.DataFrame(test_data)
test_df['hist'] = test_df['hist'].map(','.join)
test_df.to_csv(f'{folder_path}test_SASRec.csv', sep='|', index=False)


In [None]:
# Store each history as one comma-separated string; '|' separates the columns
# so the commas inside `hist` do not clash with the field delimiter.
val_df = pd.DataFrame(val_data)
val_df['hist'] = val_df['hist'].map(','.join)
val_df.to_csv(f'{folder_path}val_SASRec.csv', sep='|', index=False)

In [None]:
# Store each history as one comma-separated string; '|' separates the columns
# so the commas inside `hist` do not clash with the field delimiter.
train_df = pd.DataFrame(train_data)
train_df['hist'] = train_df['hist'].map(','.join)
train_df.to_csv(f'{folder_path}train_SASRec.csv', sep='|', index=False)

# 2. Load Preprocessed Data

### 2.1 Create tf.data.Dataset for training and evaluation

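The histories are padded on the left, so that the most recent movie always sits in the last position of the sequence. If you want to put padding before each row (rather than after), then you can't currently do that with `RaggedTensor.to_tensor`. But you can write a fairly [simple function](https://github.com/tensorflow/tensorflow/issues/34793) to do it: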


In [None]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=batch_size, maxlen = maxlen):
    """Build a batched ``tf.data.Dataset`` from a '|'-delimited SASRec CSV file.

    The file is expected to have a header and two string columns: ``hist``
    (comma-separated movie ids) and ``pos_id`` (the target movie id).

    Args:
        csv_file_path: Path of the CSV file to read.
        shuffle: Whether ``make_csv_dataset`` shuffles the records.
        batch_size: Number of records per batch.
        maxlen: Width every history is padded (or truncated) to.

    Returns:
        A dataset that makes a single pass over the file and yields dicts with
        ``hist`` as a string tensor of shape ``[batch, maxlen]`` (empty strings
        as left padding) and ``pos_id`` as a string tensor of shape ``[batch]``.
    """

    def left_pad_2d_ragged(rt, width = maxlen):
        """Return `rt` as a dense tensor whose rows are left-padded to `width`."""
        # Keep only the most recent `width` items of every row. Rows that are
        # already short enough are unchanged; longer rows would otherwise make
        # the pad lengths below negative and fail.
        rt = rt[:, -width:]
        pad_row_lengths = width - rt.row_lengths()
        # One flat run of zero values (empty strings for a string tensor),
        # just long enough to fill every row up to `width`.
        pad_values = tf.zeros([(width * rt.nrows()) - tf.size(rt, tf.int64)], rt.dtype)
        padding = tf.RaggedTensor.from_row_lengths(pad_values, pad_row_lengths)
        return tf.concat([padding, rt], axis=1).to_tensor()

    def process(features):
        """Turn the comma-separated `hist` strings into a padded 2-D tensor."""
        features["hist"] = tf.strings.split(features["hist"], ",")
        features['hist'] = left_pad_2d_ragged(features['hist'], width = maxlen)

        return features


    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        num_epochs=1,
        header=True,
        field_delim="|",
        shuffle=shuffle,
        column_defaults = ['string', 'string'],
        shuffle_buffer_size=100*batch_size
    ).map(process)

    return dataset

In [None]:
# Training uses shuffled batches of `batch_size` rows.
train_dataset = get_dataset_from_csv(
    folder_path + 'train_SASRec.csv', shuffle=True, batch_size=batch_size)

# Validation and test use shuffled batches of negative_example + 1 rows
# (presumably one positive scored against negative_example in-batch
# negatives — confirm against the retrieval task).
# NOTE(review): no drop_remainder is applied, so the final batch of each
# dataset may contain fewer rows.
val_dataset, test_dataset = [
    get_dataset_from_csv(folder_path + csv_name,
                         shuffle=True, batch_size=negative_example + 1)
    for csv_name in ('val_SASRec.csv', 'test_SASRec.csv')
]

In [None]:
# Smoke test: pull one batch from the training and the validation pipeline
# and check the shapes of both features.
sample_data = next(iter(train_dataset.take(1)))
sample_val = next(iter(val_dataset.take(1)))

print(sample_data['hist'].shape, sample_data['pos_id'].shape, sep='\n')

print(sample_val['hist'].shape, sample_val['pos_id'].shape, sep='\n')

sample_data['hist']

(4096, 50)
(4096,)
(101, 50)
(101,)


<tf.Tensor: shape=(4096, 50), dtype=string, numpy=
array([[b'61132', b'102716', b'2193', ..., b'1377', b'3897', b'3793'],
       [b'8371', b'33162', b'2231', ..., b'6281', b'1385', b'5065'],
       [b'4432', b'1945', b'2612', ..., b'1080', b'1261', b'3424'],
       ...,
       [b'1717', b'2717', b'2420', ..., b'4974', b'3440', b'3821'],
       [b'', b'', b'', ..., b'2028', b'4022', b'5989'],
       [b'4975', b'1587', b'39', ..., b'3263', b'2269', b'8376']],
      dtype=object)>

In [None]:
# Index of the batch row inspected by the sanity checks in the following cells;
# the intent is to look at a row that is padded on the left.
# NOTE(review): the index is fixed at 0 while the batches are shuffled, so row 0
# is not guaranteed to be a padded row — confirm against the printed batch.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = 0

### 2.2. MovieId Vocabulary

In [None]:
# Read every movie id (as a string) listed in movie.csv.
movies = pd.read_csv(f'{folder_path}movie.csv', usecols=['movieId'], dtype={'movieId': str})

# Distinct movie ids, in order of first appearance; used as the lookup vocabulary.
movieId_vocab = movies['movieId'].unique().tolist()

# 3. Query Model

In [None]:
from tensorflow.keras import Sequential, layers, callbacks, utils

### 3.1 Embedding and positional embedding layer
We will use the same movie-id lookup and embedding layers in both the query model and the item (candidate) model.

In [None]:
# Maps movie-id strings to integer indices. As the lookup output further down
# shows, the empty-string padding produced by the input pipeline is mapped to
# index 0.
movieId_lookup = layers.StringLookup(
    vocabulary = movieId_vocab,
    )

# about masking https://www.tensorflow.org/guide/keras/masking_and_padding
# One embedding row per vocabulary entry plus one extra row for index 0;
# mask_zero=True makes the layer treat index 0 (the padding) as masked.
movieId_embedding = layers.Embedding(
    input_dim = len(movieId_vocab) + 1,
    output_dim = embedding_dimension,
    mask_zero = True,
)

In [None]:
# Smoke test: run the lookup on one training and one validation batch.
# Only the last expression (the validation batch) is displayed.
movieId_lookup(sample_data['hist'])
movieId_lookup(sample_val['hist'])

<tf.Tensor: shape=(101, 50), dtype=int64, numpy=
array([[22918, 18741, 13917, ..., 18035, 17507, 22582],
       [ 3670,  1778,   897, ...,   892,  3461,  2516],
       [ 1874,  1878,  2111, ...,  2614,   365,   374],
       ...,
       [    0,     0,     0, ...,   346,   587,   377],
       [    0,     0,     0, ...,   588,   538,  7250],
       [    0,     0,     0, ...,  2643,  3264,  4170]])>

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Embed a padded movie-id history and add a learned positional embedding.

    Args:
        movieId_lookup: Layer mapping movie-id strings to integer indices.
        movieId_embedding: Embedding layer applied to those indices; it must be
            created with ``mask_zero=True`` so that a padding mask is available.
        maxlen: Length of the (padded) input sequences.
        embedding_dimension: Size of the positional embedding vectors.
    """
    def __init__(self, movieId_lookup, movieId_embedding, maxlen, embedding_dimension):
        super().__init__()
        self.maxlen = maxlen
        self.movieId_lookup = movieId_lookup
        self.movieId_embedding = movieId_embedding
        self.pos_embedding = layers.Embedding(input_dim = maxlen, output_dim = embedding_dimension)

    def call(self, x):
        """Return ``(embeddings, mask, attention_mask)`` for a batch of histories.

        Args:
            x: String tensor of movie ids with shape ``[batch, maxlen]``.

        Returns:
            embeddings: float tensor ``[batch, maxlen, embedding_dimension]``,
                item embedding plus positional embedding (padded positions are
                NOT zeroed here).
            mask: float tensor ``[batch, maxlen, 1]``, 1.0 at real items and
                0.0 at padding.
            attention_mask: bool tensor ``[batch, maxlen, maxlen]``, True where
                both the query and the key position hold a real item.
        """
        ids = self.movieId_lookup(x)
        # Use the embedding layer's public `compute_mask` instead of reading
        # the private `_keras_mask` attribute off its output tensor; both give
        # the boolean "id != 0" padding mask.
        mask = self.movieId_embedding.compute_mask(ids)
        x = self.movieId_embedding(ids)
        mask1 = tf.expand_dims(mask, axis = 2) # [:, :, None]
        mask2 = tf.expand_dims(mask, axis = 1) # [:, None, :]
        attention_mask = mask1 & mask2 #[:,:,:]
        mask = tf.expand_dims(tf.cast(mask, tf.float32), axis = -1)
        # The same positional vectors are broadcast over the batch dimension.
        x = x + tf.expand_dims(self.pos_embedding(tf.range(self.maxlen)), axis = 0)
        return x, mask, attention_mask

In [None]:
# Smoke test for PositionalEmbedding on one training and one validation batch.
# Each line constructs its own PositionalEmbedding instance; both share the
# movie-id lookup and embedding layers passed in.
embed_hist, mask, attention_mask = PositionalEmbedding(movieId_lookup, movieId_embedding, maxlen, embedding_dimension)(sample_data['hist'])
embed_hist_val, mask_val, attention_mask_val = PositionalEmbedding(movieId_lookup, movieId_embedding, maxlen, embedding_dimension)(sample_val['hist'])
print(embed_hist.shape)

print(mask.shape)

print(attention_mask.shape)
# Displays the [maxlen, maxlen] attention mask of the row selected by `id`.
attention_mask[id]
#embed_hist * mask

(4096, 50, 64)
(4096, 50, 1)
(4096, 50, 50)


<tf.Tensor: shape=(50, 50), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])>

### 3.2 Attention block

In [None]:
class AttentionBlock(tf.keras.layers.Layer):
    """Self-attention sub-layer: attention, dropout, residual add, layer norm.

    Args:
        num_heads: Number of attention heads.
        embedding_dimension: Used as `key_dim` of the attention layer.
        dropout: Dropout rate applied to the attention output.
    """
    def __init__(self, num_heads, embedding_dimension, dropout):
        super().__init__()
        self.mha = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dimension,
        )
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()
        self.dropout = layers.Dropout(dropout)

    def call(self, x, attention_mask):
        """Attend `x` to itself under `attention_mask` and return the normalised result."""
        attended = self.dropout(
            self.mha(query=x, value=x, attention_mask=attention_mask)
        )
        # Residual connection followed by layer normalisation.
        return self.layernorm(self.add([x, attended]))

In [None]:
# Smoke test for AttentionBlock: padded positions of the embeddings are zeroed
# (multiplication by `mask`) before the block is applied.
att = AttentionBlock(num_heads, embedding_dimension, dropout)(embed_hist * mask, attention_mask )
att[id,:,0]
# A second, separately constructed block is applied to the validation batch;
# only its output shape is displayed.
AttentionBlock(num_heads, embedding_dimension, dropout)(embed_hist_val * mask_val, attention_mask_val ).shape

TensorShape([101, 50, 64])

### 3.3 Feed Forward block

In [None]:
class FeedForward(tf.keras.layers.Layer):
    """Position-wise feed-forward sub-layer with residual add and layer norm.

    Args:
        ffn_hidden_unit: Units of the hidden (ReLU) Dense layer.
        embedding_dimension: Units of the output Dense layer.
        dropout: Dropout rate applied after the output Dense layer.
    """
    def __init__(self, ffn_hidden_unit, embedding_dimension, dropout):
        super().__init__()
        ffn_layers = [
            layers.Dense(ffn_hidden_unit, activation = 'relu'),
            layers.Dense(embedding_dimension),
            layers.Dropout(dropout),
        ]
        self.seq = Sequential(ffn_layers)
        self.add = layers.Add()
        self.layernorm = layers.LayerNormalization()

    def call(self, x):
        """Return layer_norm(x + ffn(x))."""
        residual = self.add([x, self.seq(x)])
        return self.layernorm(residual)

In [None]:
# Smoke test for FeedForward on the attention output of the training batch.
ff = FeedForward(ffn_hidden_unit, embedding_dimension, dropout)(att)
print(ff.shape)
# First output channel across all positions of the row selected by `id`.
ff[id,:,0]

(4096, 50, 64)


<tf.Tensor: shape=(50,), dtype=float32, numpy=
array([-1.6552001 ,  1.4077022 , -1.4846429 ,  0.36595032,  0.37804344,
       -1.712226  ,  0.14214924,  0.3816294 ,  0.3750532 , -1.1381018 ,
        1.7527851 , -0.52342856, -0.55472463,  0.688848  , -2.1020875 ,
       -1.7041838 , -0.8114975 , -1.6758058 ,  1.5546358 , -0.78213733,
       -0.43383166,  1.2408466 , -0.41690263, -0.3560957 , -1.1664238 ,
        0.0383225 ,  0.55275244,  0.61813945,  0.5143788 , -1.4343904 ,
        0.51297015, -1.2772094 , -0.49824008, -0.7967505 , -0.09998196,
       -0.898908  , -0.08261719, -0.86967444,  0.6260975 , -2.2544744 ,
       -0.04179642, -0.7617358 , -0.91603804, -0.49991947, -0.6970702 ,
       -0.38423356, -1.0491843 ,  0.05154865, -2.5700066 ,  0.5543088 ],
      dtype=float32)>

### 3.4 Query Model

In [None]:
class QueryModel(tf.keras.Model):
    """Encode a watch history into one vector with stacked attention blocks.

    The input is a dict with a ``'hist'`` entry; the output is the encoder
    state at the last sequence position, one vector per batch row.
    """
    def __init__(self, blocks, movieId_lookup, movieId_embedding, maxlen, embedding_dimension,
                 num_heads, dropout, ffn_hidden_unit ):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(
            movieId_lookup, movieId_embedding, maxlen, embedding_dimension)
        # `blocks` attention sub-layers, each paired with a feed-forward sub-layer.
        self.attention_blocks = [
            AttentionBlock(num_heads, embedding_dimension, dropout)
            for _ in range(blocks)
        ]
        self.ff_blocks = [
            FeedForward(ffn_hidden_unit, embedding_dimension, dropout)
            for _ in range(blocks)
        ]

    def call(self, inputs):
        """Return the last-position representation of ``inputs['hist']``."""
        emb, mask, attention_mask = self.pos_embedding(inputs['hist'])
        # Zero the embeddings at padded positions before the first block.
        hidden = emb * mask
        for attend, feed_forward in zip(self.attention_blocks, self.ff_blocks):
            hidden = feed_forward(attend(hidden, attention_mask))
        return hidden[:, -1, :]


In [None]:
# Smoke test for QueryModel: one output vector per row of the batch.
# Each call below constructs a separate QueryModel instance.
output = QueryModel(blocks,movieId_lookup, movieId_embedding,
           maxlen, embedding_dimension,num_heads,dropout,ffn_hidden_unit)(sample_data)
print(output.shape)

output_val = QueryModel(blocks,movieId_lookup, movieId_embedding,
           maxlen, embedding_dimension,num_heads,dropout,ffn_hidden_unit)(sample_val)
output_val.shape

(4096, 64)


TensorShape([101, 64])

# 4. Candidate Model

In [None]:
class CandidateModel(tf.keras.Model):
    """Embed the target movie id with the lookup and embedding layers it is given."""
    def __init__(self, movieId_lookup, movieId_embedding):
        super().__init__()
        self.movieId_lookup = movieId_lookup
        self.movieId_embedding = movieId_embedding

    def call(self,inputs):
        """Return the embedding of ``inputs['pos_id']``."""
        return self.movieId_embedding(self.movieId_lookup(inputs['pos_id']))

In [None]:
# Smoke test for CandidateModel: one embedding vector per target movie.
print(CandidateModel(movieId_lookup, movieId_embedding)(sample_data).shape)
print(CandidateModel(movieId_lookup, movieId_embedding)(sample_val).shape)


(4096, 64)
(101, 64)


# 4. Full Model

In [None]:
# Dataset of all movie ids, keyed as 'pos_id' so CandidateModel can consume it.
movies_ds = tf.data.Dataset.from_tensor_slices({'pos_id': movies.movieId})

candidate_model = CandidateModel(movieId_lookup, movieId_embedding)
# The mapped dataset is only displayed here, not stored; the only other use of
# `movies_ds` in this notebook is a commented-out FactorizedTopK metric.
movies_ds.batch(128).map(candidate_model)

<_MapDataset element_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None)>

In [None]:
class AttentionModel(tfrs.models.Model):
    """Two-tower retrieval model: self-attentive query tower plus item-embedding tower.

    The loss is a categorical cross-entropy computed by `tfrs.tasks.Retrieval`
    over the (query, candidate) pairs of a batch. The only metric configured is
    a batch-level top-10 categorical accuracy; no corpus-wide FactorizedTopK
    metric is set up.
    """

    def __init__(self, blocks, movieId_lookup, movieId_embedding, maxlen, embedding_dimension,
                 num_heads, dropout, ffn_hidden_unit):
        super().__init__()
        self.query_model = QueryModel(blocks,movieId_lookup, movieId_embedding,
                                      maxlen, embedding_dimension,num_heads,dropout,ffn_hidden_unit)
        self.candidate_model = CandidateModel(movieId_lookup, movieId_embedding)
        self.task = tfrs.tasks.Retrieval(
            loss = tf.keras.losses.CategoricalCrossentropy(
                from_logits = True,
                reduction = tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
            batch_metrics=[tf.keras.metrics.TopKCategoricalAccuracy(k=10, name = 'batch_top_10_categorical_accuracy')],
        )

    def compute_loss(self, inputs, training: bool = False):
        """Return the retrieval loss for one batch.

        Args:
            inputs: Dict with ``'hist'`` (padded history) and ``'pos_id'``
                (target movie id).
            training: Whether the call happens during training.

        Returns:
            The scalar loss produced by the retrieval task.
        """
        # Forward `training` explicitly: compute_loss is invoked outside of any
        # Keras layer call, so the sub-models cannot infer the mode themselves
        # and their Dropout layers would otherwise always run in inference mode.
        query_emb = self.query_model(inputs, training=training)
        movie_emb = self.candidate_model(inputs, training=training)
        return self.task(query_emb, movie_emb,
                         compute_metrics = not training,
                         compute_batch_metrics = True,
                         )


In [None]:
# Instantiate the full model from the hyperparameters and the shared
# lookup/embedding layers, then attach the optimizer.
model = AttentionModel(
    blocks=blocks,
    movieId_lookup=movieId_lookup,
    movieId_embedding=movieId_embedding,
    maxlen=maxlen,
    embedding_dimension=embedding_dimension,
    num_heads=num_heads,
    dropout=dropout,
    ffn_hidden_unit=ffn_hidden_unit,
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

In [None]:
# Smoke test of the full model: loss on one training and one validation batch.
# compute_loss is called with its default training=False here.
print(model.compute_loss(sample_data))
print(model.compute_loss(sample_val))

tf.Tensor(8.343054, shape=(), dtype=float32)
tf.Tensor(4.6482296, shape=(), dtype=float32)


# 5. Fitting and evaluating

In [None]:
# Let tf.data choose the prefetch buffer size for the training and validation pipelines.
prefetch_train = train_dataset.prefetch(tf.data.AUTOTUNE)
prefetch_val = val_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
model_filepath = '/content/drive/MyDrive/Movie Recommendations with Movielens/models/SASRec'

# Keep only the weights of the epoch with the best *validation* top-10
# accuracy. This is the same quantity EarlyStopping watches, so the checkpoint
# reloaded after training is selected on held-out data rather than on the
# training metric.
checkpoint = callbacks.ModelCheckpoint(model_filepath + '/checkpoint',
                                       monitor = 'val_batch_top_10_categorical_accuracy',
                                       save_best_only = True,
                                       save_weights_only = True,
                                       mode = 'max'
                                       )
# Stop after 5 epochs without improvement and roll back to the best weights.
earlyStopping = callbacks.EarlyStopping(patience = 5,
                                        restore_best_weights = True,
                                        monitor = 'val_batch_top_10_categorical_accuracy',
                                        mode = 'max')
# Append the per-epoch metrics to a CSV log next to the checkpoint.
csv_logger = callbacks.CSVLogger(model_filepath +'/training.log')

In [None]:
# Train for at most `epoch` epochs, validating after every epoch; the
# callbacks handle checkpointing, early stopping and CSV logging.
history = model.fit(
    x = prefetch_train,
    epochs = epoch,
    validation_data = prefetch_val,
    validation_freq = 1,
    callbacks = [checkpoint, earlyStopping, csv_logger],
    verbose = 1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

In [None]:
# Save the weights the model holds after training, in TensorFlow checkpoint format.
model.save_weights(filepath = model_filepath + '/model_weight', save_format = 'tf')

In [None]:
# Reload the weights written by the ModelCheckpoint callback, i.e. the epoch
# that scored best on the metric that callback monitors.
model.load_weights(model_filepath + '/checkpoint')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ff5d9b2bb20>

Finally, we can evaluate our model on the test set:

In [None]:
# Evaluate on the held-out test batches; returns a dict of metric name -> value.
model.evaluate(test_dataset, return_dict=True)



{'batch_top_10_categorical_accuracy': 0.719079315662384,
 'loss': 2.660205602645874,
 'regularization_loss': 0,
 'total_loss': 2.660205602645874}