In [None]:
! pip install -q tensorflow-recommenders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pprint

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from collections import defaultdict
from tqdm import tqdm
import random

In [None]:
# Directory prefix for the CSV files this notebook reads and writes via
# `folder_path + <name>` (rating_filtered.csv, the *_SASRec.csv splits, movie.csv).
# NOTE(review): the path looks like a Google Drive mount, but no mount cell is
# visible in this notebook — confirm the drive is available before running.
folder_path = "/content/drive/MyDrive/Movie Recommendations with Movielens/data/"

In [None]:
# Maximum number of past movies kept in a history; also the padded sequence width.
maxlen = 50
# Batch size used when reading and re-batching the train/val/test CSV files.
batch_size = 4096
# Width of the movie and positional embeddings (also passed as `key_dim` to the attention layer).
embedding_dimension = 32

# Number of heads in each multi-head attention layer.
num_heads = 1
# Hidden width of the feed-forward sub-block.
ffn_hidden_unit = 64
# Dropout rate used in both the attention and the feed-forward sub-blocks.
dropout = 0.5
# NOTE(review): not referenced by any visible cell — confirm whether a causal mask was intended.
use_causal_mask = False
# Number of (attention, feed-forward) block pairs stacked in the query model.
blocks = 2
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001
# NOTE(review): not referenced by any visible cell; the `model.fit` call hard-codes `epochs = 3`.
epoch = 100


In this project, I implement the model proposed in the paper [Self-Attentive Sequential Recommendation](https://arxiv.org/pdf/1808.09781.pdf).

The dataset is collected from the movie-recommendation service MovieLens. Created by 138,493 users, the MovieLens dataset includes over 20 million ratings and 460,000+ tags for 27,278 movies.

Kaggle data set: [MovieLens 20M Dataset](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?resource=download)


# 1. Data Preprocess


### 1.1 Filter
I only include movies with at least 5 ratings (watches). I also only include records with rating >= 2; in other words, only the watch history that the user enjoyed.

We end up with about 18M records.

In [None]:
# Load the raw ratings table with pandas' default column selection and dtypes.
# NOTE(review): this path does not go through `folder_path` (it has no "data/"
# segment) — confirm that this is where rating.csv actually lives.
ratings = pd.read_csv('/content/drive/MyDrive/Movie Recommendations with Movielens/rating.csv',
                      # Optional arguments kept for quick experiments on a subset:
                      #usecols = ['userId', 'movieId', 'timestamp'],
                      #dtype = {'movieId': str, 'userId': str},
                      #nrows = 1000000
                    )

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Attach to every record the number of ratings its movie received.
ratings['movie_count'] = ratings['movieId'].map(ratings['movieId'].value_counts())

# Keep a record only when
#   * its movie has at least 5 ratings, and
#   * its rating is >= 2 (treated here as "the user enjoyed this movie"),
# then order each user's records chronologically.
ratings = ratings[
    (ratings['movie_count'] >= 5) & (ratings['rating'] >= 2)
].sort_values(by=['userId', 'timestamp'])

In [None]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18786848 entries, 20 to 19999916
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   timestamp    object 
 4   movie_count  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 860.0+ MB


In [None]:
# Persist the filtered, sorted ratings (without the DataFrame index) so that the
# following section can start from this file.
ratings.to_csv(folder_path + 'rating_filtered.csv', index = False)

### 1.2 Train and Test Split
We split the historical sequence for each user into three parts:
1.   the most recent action for testing
2.   the second most recent action for validation
3.   all remaining actions for training



In [None]:
# Reload only the two id columns, as strings (the ids are later joined into
# comma-separated text and looked up as strings).
# The row order of the file is relied upon below: it was written after sorting
# by userId and timestamp, so each user's rows are in chronological order.
ratings = pd.read_csv(folder_path + 'rating_filtered.csv',
                      usecols = ['userId', 'movieId'],
                      dtype = {'movieId': str, 'userId': str},
                      )

In [None]:
# One column-oriented container per split; the keys 'hist' and 'pos_id' are filled by the loop below.
train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)

In [None]:
# (Re-)create the containers inside this cell: the loop only appends, so running
# the cell a second time would otherwise duplicate every example.
train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)

for userId, df in tqdm(ratings[['userId', 'movieId']].groupby('userId')):
    # The user's movies in the order they appear in `ratings`.
    pos_list = df['movieId'].tolist()
    last = len(pos_list) - 1

    # Every position i >= 1 yields one example:
    #   history = up to `maxlen` movies immediately before i, target = movie i.
    for i in range(1, len(pos_list)):
        if i == last:
            split = test_data      # most recent action
        elif i == last - 1:
            split = val_data       # second most recent action
        else:
            split = train_data     # everything earlier
        split['hist'].append(pos_list[max(0, i - maxlen):i])
        split['pos_id'].append(pos_list[i])
            
       


100%|██████████| 138469/138469 [01:23<00:00, 1659.09it/s]


In [None]:
print(train_data['hist'][0:5])
print(train_data['pos_id'][:5])

[['924'], ['924', '919'], ['924', '919', '2683'], ['924', '919', '2683', '1584'], ['924', '919', '2683', '1584', '1079']]
['919', '2683', '1584', '1079', '653']


In [None]:
# Flatten each history list into one comma-separated string, then write the
# split as a '|'-delimited file (the comma is already used inside `hist`).
test_df = pd.DataFrame(test_data)
test_df['hist'] = test_df['hist'].apply(','.join)
test_df.to_csv(folder_path + 'test_SASRec.csv', sep='|', index=False)


In [None]:
# Flatten each history list into one comma-separated string, then write the
# split as a '|'-delimited file (the comma is already used inside `hist`).
val_df = pd.DataFrame(val_data)
val_df['hist'] = val_df['hist'].apply(','.join)
val_df.to_csv(folder_path + 'val_SASRec.csv', sep='|', index=False)

In [None]:
# Flatten each history list into one comma-separated string, then write the
# split as a '|'-delimited file (the comma is already used inside `hist`).
train_df = pd.DataFrame(train_data)
train_df['hist'] = train_df['hist'].apply(','.join)
train_df.to_csv(folder_path + 'train_SASRec.csv', sep='|', index=False)

### 1.3 Create tf.data.Dataset for training and evaluation

`RaggedTensor.to_tensor` pads each row on the right. If you want to put the padding before each row (rather than after), you currently can't do that with `RaggedTensor.to_tensor`, but you can write a fairly [simple function](https://github.com/tensorflow/tensorflow/issues/34793) to do it:


In [None]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=batch_size, maxlen = maxlen):
    """Build a batched ``tf.data`` pipeline from one '|'-delimited split file.

    The file is expected to have a header row and two string columns; the
    ``hist`` column holds comma-separated movie ids.

    Args:
        csv_file_path: path of the CSV file to read.
        shuffle: whether ``make_csv_dataset`` shuffles the records.
        batch_size: number of records per batch.
        maxlen: width every history is padded (or truncated) to.

    Returns:
        A dataset of feature dicts in which ``hist`` is a dense string tensor
        of width ``maxlen``, left-padded with empty strings. The other column
        is passed through unchanged.
    """

    def left_pad_2d_ragged(rt, width = maxlen):
        """Left-pad every row of a 2-D ragged tensor to exactly `width` items."""
        # Keep only the last `width` items of each row. Without this a longer
        # row would produce a negative padding length below.
        rt = rt[:, -width:]
        pad_row_lengths = width - rt.row_lengths()
        # tf.zeros with a string dtype yields empty strings, used as padding.
        pad_values = tf.zeros([tf.reduce_sum(pad_row_lengths)], rt.dtype)
        padding = tf.RaggedTensor.from_row_lengths(pad_values, pad_row_lengths)
        # Every row now has `width` items; state that width explicitly.
        return tf.concat([padding, rt], axis=1).to_tensor(shape=[None, width])

    def process(features):
        """Split the comma-separated history and left-pad it to `maxlen`."""
        features["hist"] = tf.strings.split(features["hist"], ",")
        features['hist'] = left_pad_2d_ragged(features['hist'], width = maxlen)

        return features


    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        num_epochs=1,
        header=True,
        field_delim="|",
        shuffle=shuffle,
        column_defaults = ['string', 'string'],
    ).map(process)

    return dataset

In [None]:
# Build the three pipelines the same way; only the training split is shuffled.
# Re-batching with drop_remainder=True drops the last, smaller batch so that
# every batch has exactly `batch_size` rows.
train_dataset, val_dataset, test_dataset = [
    get_dataset_from_csv(folder_path + csv_name, shuffle=shuffle_rows, batch_size=batch_size)
    .rebatch(batch_size, drop_remainder=True)
    for csv_name, shuffle_rows in [
        ('train_SASRec.csv', True),
        ('val_SASRec.csv', False),
        ('test_SASRec.csv', False),
    ]
]

In [None]:
# Pull one batch to use as a fixed example in the sanity checks below.
sample_data = next(iter(train_dataset.take(1)))

print(sample_data['hist'].shape)
print(sample_data['pos_id'].shape)
    

(4096, 50)
(4096,)


In [None]:
sample_data['hist']

<tf.Tensor: shape=(4096, 50), dtype=string, numpy=
array([[b'2320', b'3638', b'2710', ..., b'1306', b'1573', b'1676'],
       [b'', b'', b'', ..., b'', b'', b'2355'],
       [b'', b'', b'', ..., b'349', b'356', b'595'],
       ...,
       [b'', b'', b'', ..., b'2959', b'60684', b'44191'],
       [b'922', b'6620', b'539', ..., b'1172', b'27721', b'39435'],
       [b'3994', b'4025', b'3952', ..., b'6', b'25', b'36']], dtype=object)>

In [None]:
# Row of the sample batch inspected in the checks below; chosen because, in the
# output shown above, that row is padded on the left.
# NOTE(review): the training split is shuffled, so a different row may sit at
# this index on another run. The name also shadows the builtin `id`.
id = -3

### 1.4. MovieId Vocabulary

In [None]:
# Only the movieId column is needed; ids are read as strings to match the
# string ids stored in the split files.
movies = pd.read_csv(folder_path + 'movie.csv', usecols=['movieId'], dtype={'movieId': str})

# Distinct movie ids, in order of first appearance in the file.
movieId_vocab = movies['movieId'].unique().tolist()

# 2. Query Model

In [None]:
from tensorflow.keras import Sequential, layers, callbacks, utils

### 2.1 Embedding and positional embedding layer
We will use the same movie ID lookup and embedding layers in both the query model and the item (candidate) model.

In [None]:
# Maps movie-id strings to integer indices.
# The empty-string padding is not part of the vocabulary and comes out as index 0
# (see the lookup output in the next cell) — presumably the layer's default
# out-of-vocabulary slot; confirm against the StringLookup documentation.
movieId_lookup = layers.StringLookup(
    vocabulary = movieId_vocab,
    )

# about masking https://www.tensorflow.org/guide/keras/masking_and_padding
# One row per vocabulary entry plus one extra row for index 0.
# mask_zero=True makes index 0 (the padding) count as a masked position.
movieId_embedding = layers.Embedding(
    input_dim = len(movieId_vocab) + 1,
    output_dim = embedding_dimension,
    mask_zero = True,
)

In [None]:
movieId_lookup(sample_data['hist'])

<tf.Tensor: shape=(4096, 50), dtype=int64, numpy=
array([[ 2236,  3548,  2625, ...,  1278,  1523,  1620],
       [    0,     0,     0, ...,     0,     0,  2271],
       [    0,     0,     0, ...,   346,   353,   590],
       ...,
       [    0,     0,     0, ...,  2874, 12857, 10887],
       [  906,  6511,   536, ...,  1150,  9446, 10523],
       [ 3901,  3932,  3859, ...,     6,    25,    36]])>

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Embed a padded movie-id history and add a learned positional embedding.

    Args:
        movieId_lookup: layer mapping movie-id strings to integer indices.
        movieId_embedding: embedding layer applied to those indices.
        maxlen: sequence length; one positional vector is learned per position.
        embedding_dimension: width of the positional embedding (it is added to
            the movie embedding, so both must have the same width).
    """
    def __init__(self, movieId_lookup, movieId_embedding, maxlen, embedding_dimension):
        super().__init__()
        self.maxlen = maxlen
        self.movieId_lookup = movieId_lookup
        self.movieId_embedding = movieId_embedding
        self.pos_embedding = layers.Embedding(input_dim = maxlen, output_dim = embedding_dimension)

    def call(self, x):
        """Return ``(embeddings, mask, attention_mask)`` for a batch of histories.

        Args:
            x: string tensor of movie ids; its second axis must have length
                ``maxlen`` because the positional term is built from
                ``tf.range(self.maxlen)`` and added directly.

        Returns:
            embeddings: movie embedding + positional embedding.
            mask: float32 validity mask with a trailing axis of size 1
                (1.0 for real items, 0.0 for padding).
            attention_mask: boolean tensor that is True where both the query
                position and the key position are real items.
        """
        ids = self.movieId_lookup(x)
        item_emb = self.movieId_embedding(ids)

        # Ask the embedding layer for its mask through the public API rather
        # than reading the private `_keras_mask` attribute of its output.
        valid = self.movieId_embedding.compute_mask(ids)
        if valid is None:
            # The embedding was built without mask_zero; index 0 is still padding.
            valid = tf.not_equal(ids, 0)

        # Pairwise mask: [:, :, None] & [:, None, :] -> [:, :, :]
        attention_mask = tf.expand_dims(valid, axis = 2) & tf.expand_dims(valid, axis = 1)
        mask = tf.expand_dims(tf.cast(valid, tf.float32), axis = -1)

        positions = self.pos_embedding(tf.range(self.maxlen))
        x = item_emb + tf.expand_dims(positions, axis = 0)
        return x, mask, attention_mask

In [None]:
# Run the embedding layer once on the sample batch.
# This defines the notebook-level names `mask` and `attention_mask`.
# `AttentionBlock.call` reads the notebook-level `attention_mask` when the block
# is invoked as `block(x)`, so later cells depend on this one having run.
embed_hist, mask, attention_mask = PositionalEmbedding(movieId_lookup, movieId_embedding, maxlen, embedding_dimension)(sample_data['hist'])

print(embed_hist.shape)

print(mask.shape)

print(attention_mask.shape)
# Pairwise mask of the inspected row.
attention_mask[id]
#embed_hist * mask

(4096, 50, 32)
(4096, 50, 1)
(4096, 50, 50)


<tf.Tensor: shape=(50, 50), dtype=bool, numpy=
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True]])>

### 2.2 Attention block

In [None]:
class AttentionBlock(tf.keras.layers.Layer):
    """Self-attention sub-block: multi-head attention, dropout, residual add, layer norm.

    Args:
        num_heads: number of attention heads.
        embedding_dimension: passed as ``key_dim`` to the attention layer.
        dropout: rate of the dropout applied to the attention output.
    """
    def __init__(self, num_heads, embedding_dimension, dropout):
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dimension)
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()
        self.dropout = layers.Dropout(dropout)

    def call(self, x, attention_mask=None):
        """Apply self-attention to `x` (used as both query and value).

        Args:
            x: input sequence tensor.
            attention_mask: optional boolean mask for the attention layer,
                matching the batch in `x`. Callers should pass the mask that
                belongs to the current batch.

        Returns:
            Layer-normalised sum of `x` and the (dropped-out) attention output.
        """
        if attention_mask is None:
            # Backward compatibility: the original block silently used the
            # notebook-level `attention_mask`. That tensor belongs to whichever
            # batch produced it, so it is only correct for that same batch.
            attention_mask = globals().get("attention_mask")
        att = self.mha(query = x, value = x, attention_mask = attention_mask)
        att = self.dropout(att)
        att = self.add([x, att])
        att = self.layernorm(att)
        return att

In [None]:
# Smoke-test one attention block on the sample batch; multiplying by `mask`
# zeroes the embeddings at padded positions first.
att = AttentionBlock(num_heads, embedding_dimension, dropout)(embed_hist * mask )
# First feature of every position in the inspected row.
att[id,:,0]

<tf.Tensor: shape=(50,), dtype=float32, numpy=
array([ 0.02971154,  0.02971154,  0.02971154,  0.02971154,  0.02971154,
        0.02971154,  0.02971154,  0.02971154,  0.02971154,  0.02971154,
        0.02971154,  0.02971154,  0.02971154,  0.02971154,  0.02971154,
        0.02971154,  0.02971154,  0.02971154,  0.02971154,  0.02971154,
        0.02971154,  0.02971154,  0.02971154,  0.02971154,  0.02971154,
        0.02971154,  0.02971154,  0.02971154,  0.02971154,  0.02971154,
        0.8400101 ,  1.3401276 , -0.1672948 , -0.30863595,  0.89511096,
       -0.6563084 ,  0.7871158 ,  0.44849175, -0.6753941 , -0.22885332,
        0.22419433,  0.93494   ,  0.41355702, -0.06021744,  0.8014932 ,
        0.4370722 , -1.3358665 ,  1.0686882 , -0.5148345 ,  0.7846851 ],
      dtype=float32)>

### 2.3 Feed Forward block

In [None]:
class FeedForward(tf.keras.layers.Layer):
    """Feed-forward sub-block: Dense(relu) -> Dense -> Dropout, residual add, layer norm.

    Args:
        ffn_hidden_unit: width of the hidden (relu) dense layer.
        embedding_dimension: width of the output dense layer; it must match the
            input width because the output is added back to the input.
        dropout: dropout rate applied after the second dense layer.
    """
    def __init__(self, ffn_hidden_unit, embedding_dimension, dropout):
        super().__init__()
        hidden = layers.Dense(ffn_hidden_unit, activation = 'relu')
        projection = layers.Dense(embedding_dimension)
        regulariser = layers.Dropout(dropout)
        self.seq = Sequential([hidden, projection, regulariser])
        self.add = layers.Add()
        self.layernorm = layers.LayerNormalization()

    def call(self, x):
        """Return layer-norm(x + feed-forward(x))."""
        transformed = self.seq(x)
        residual = self.add([x, transformed])
        return self.layernorm(residual)

In [None]:
# Smoke-test one feed-forward block on the attention output of the sample batch.
ff = FeedForward(ffn_hidden_unit, embedding_dimension, dropout)(att)
print(ff.shape)
# First feature of every position in the inspected row.
ff[id,:,0]

(4096, 50, 32)


<tf.Tensor: shape=(50,), dtype=float32, numpy=
array([ 0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,
        0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,
        0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,
        0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,
        0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,
        0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,  0.2980761 ,
        0.7508224 ,  0.7438079 ,  0.1404477 , -0.6993661 ,  0.64823467,
       -0.702047  ,  0.46841174,  0.6191246 , -0.8973172 , -0.10774227,
        0.09063606,  1.1348494 ,  1.0650682 , -0.805975  ,  0.46284685,
        0.9035339 , -1.8424516 ,  1.0945375 , -1.296737  , -0.3008513 ],
      dtype=float32)>

### 2.4 Query Model

In [None]:
class QueryModel(tf.keras.Model):
    """Encode a user's padded movie history into a single query vector.

    The history is embedded (movie + position), passed through `blocks` pairs
    of (attention block, feed-forward block), and the representation at the
    last sequence position is returned.

    Args:
        blocks: number of (attention, feed-forward) pairs to stack.
        movieId_lookup, movieId_embedding, maxlen, embedding_dimension:
            forwarded to `PositionalEmbedding`.
        num_heads, dropout: forwarded to every `AttentionBlock`.
        ffn_hidden_unit: forwarded (with embedding_dimension and dropout) to
            every `FeedForward` block.
    """
    def __init__(self, blocks, movieId_lookup, movieId_embedding, maxlen, embedding_dimension,
                 num_heads, dropout, ffn_hidden_unit ):
        super().__init__()
        self.pos_embedding = PositionalEmbedding(movieId_lookup, movieId_embedding,
                                                 maxlen, embedding_dimension)
        self.attention_blocks = [AttentionBlock(num_heads, embedding_dimension,
                                                dropout) for i in range(blocks)]
        self.ff_blocks = [FeedForward(ffn_hidden_unit, embedding_dimension, dropout) for i in range(blocks)]

    @staticmethod
    def _attend(block, h, attention_mask):
        """Run `block`'s attention -> dropout -> residual add -> layer norm with an explicit mask."""
        att = block.mha(query = h, value = h, attention_mask = attention_mask)
        att = block.dropout(att)
        att = block.add([h, att])
        return block.layernorm(att)

    def call(self, inputs):
        """Return the last-position representation of ``inputs['hist']``."""
        emb, mask, attention_mask = self.pos_embedding(inputs['hist'])
        # Zero the embeddings at padded positions.
        h = emb * mask
        for attention_block, ff_block in zip(self.attention_blocks, self.ff_blocks):
            # Use the mask computed for THIS batch. Calling `attention_block(h)`
            # would not receive it: the block would fall back to a notebook-level
            # `attention_mask` that belongs to a different batch. The block's own
            # sub-layers are therefore applied here in the same order as its `call`.
            h = self._attend(attention_block, h, attention_mask)
            h = ff_block(h)
        # Histories are left-padded, so the last position is the most recent item.
        return h[:,-1, :]


In [None]:
# Smoke-test the full query tower on the sample batch and check the output shape.
output = QueryModel(blocks,movieId_lookup, movieId_embedding, 
           maxlen, embedding_dimension,num_heads,dropout,ffn_hidden_unit)(sample_data)
output.shape

TensorShape([4096, 32])

# 3. Candidate Model

In [None]:
class CandidateModel(tf.keras.Model):
    """Embed target movies with the same lookup and embedding layers as the query model.

    Args:
        movieId_lookup: layer mapping movie-id strings to integer indices.
        movieId_embedding: embedding layer applied to those indices.
    """
    def __init__(self, movieId_lookup, movieId_embedding):
        super().__init__()
        self.movieId_lookup = movieId_lookup
        self.movieId_embedding = movieId_embedding

    def call(self, inputs):
        """Return the embedding of ``inputs['pos_id']``."""
        movie_index = self.movieId_lookup(inputs['pos_id'])
        return self.movieId_embedding(movie_index)

In [None]:
CandidateModel(movieId_lookup, movieId_embedding)(sample_data).shape

TensorShape([4096, 32])

# Full Model

In [None]:
# Dataset of all movie ids from movie.csv, keyed as 'pos_id' so that it can be
# fed straight into CandidateModel. AttentionModel reads this notebook-level
# `movies_ds` to build the candidates for its top-K metrics.
movies_ds = tf.data.Dataset.from_tensor_slices({'pos_id': movies.movieId})

# Check that mapping the candidate model over the batched ids works
# (the resulting dataset is only displayed, not stored).
candidate_model = CandidateModel(movieId_lookup, movieId_embedding)
movies_ds.batch(128).map(candidate_model)

<_MapDataset element_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None)>

In [None]:
class AttentionModel(tfrs.models.Model):
    """Two-tower retrieval model: attention-based query tower and embedding candidate tower.

    The constructor arguments are forwarded to `QueryModel`; the candidate tower
    reuses the same `movieId_lookup` and `movieId_embedding` layers. The top-K
    metric candidates are built from the notebook-level `movies_ds`, which must
    exist before this class is instantiated.
    """

    def __init__(self, blocks, movieId_lookup, movieId_embedding, maxlen, embedding_dimension,
                 num_heads, dropout, ffn_hidden_unit):
        super().__init__()
        self.query_model = QueryModel(
            blocks, movieId_lookup, movieId_embedding, maxlen,
            embedding_dimension, num_heads, dropout, ffn_hidden_unit,
        )
        self.candidate_model = CandidateModel(movieId_lookup, movieId_embedding)

        # Candidate embeddings for the metrics: every movie id, embedded in batches of 128.
        candidate_embeddings = movies_ds.batch(128).map(self.candidate_model)
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates = candidate_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics = top_k_metrics)

    def compute_loss(self, inputs, training: bool = False):
        """Return the retrieval loss for one batch.

        Metrics are only computed when `training` is False.
        NOTE(review): `training` is not forwarded to the query/candidate models —
        confirm whether the dropout layers are meant to be active during fitting.
        """
        query_emb = self.query_model(inputs)
        movie_emb = self.candidate_model(inputs)
        wants_metrics = not training
        return self.task(query_emb, movie_emb, compute_metrics = wants_metrics)
    

# 4. Fitting and evaluating

In [None]:
# Instantiate the full model from the hyperparameters defined at the top.
model = AttentionModel(
    blocks=blocks,
    movieId_lookup=movieId_lookup,
    movieId_embedding=movieId_embedding,
    maxlen=maxlen,
    embedding_dimension=embedding_dimension,
    num_heads=num_heads,
    dropout=dropout,
    ffn_hidden_unit=ffn_hidden_unit,
)

adam = tf.keras.optimizers.Adam(learning_rate)
model.compile(optimizer=adam)

In [None]:
model.compute_loss(sample_data)

<tf.Tensor: shape=(), dtype=float32, numpy=34118.805>

In [None]:
# NOTE(review): `cached_train` is not referenced by any visible cell; the
# `model.fit` call below trains on `train_dataset` directly. Confirm whether
# caching was meant to be used.
cached_train = train_dataset.cache()

In [None]:
# Train for 3 epochs on the training split.
# NOTE(review): the `epoch` hyperparameter defined at the top is not used here,
# and no validation data is passed even though `val_dataset` was built — confirm intent.
history = model.fit(train_dataset, epochs = 3, verbose = 1,
          #callbacks = [checkpoint],
          )

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Save the trained weights using the 'tf' save format.
# NOTE(review): `model_filepath` is not defined in any visible cell — it must be
# defined before this cell can run on a fresh kernel.
model.save_weights(filepath = model_filepath + '/model_weight_3e', save_format = 'tf')

Finally, we can evaluate our model on the test set:

In [None]:
# Evaluate on the held-out test split and return the metrics as a dict.
# `test_dataset` was re-batched with drop_remainder=True, so a final partial
# batch (if any) is not part of this evaluation.
model.evaluate(test_dataset, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.012865470722317696,
 'factorized_top_k/top_5_categorical_accuracy': 0.06466027349233627,
 'factorized_top_k/top_10_categorical_accuracy': 0.10723692178726196,
 'factorized_top_k/top_50_categorical_accuracy': 0.27001214027404785,
 'factorized_top_k/top_100_categorical_accuracy': 0.3667806088924408,
 'loss': 27449.552734375,
 'regularization_loss': 0,
 'total_loss': 27449.552734375}