In [None]:
# Install TensorFlow Recommenders (imported below as `tfrs`); -q keeps pip's output short.
! pip install -q tensorflow-recommenders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pprint

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In this project, I build a recommendation system using data collected from the movie-recommendation service MovieLens. The MovieLens 20M data set contains over 20 million ratings and more than 460,000 tags applied to 27,278 movies by 138,493 users.

Kaggle data set: [MovieLens 20M Dataset](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?resource=download)

# Transform the movie ratings data into sequences
First, let's sort the ratings data by timestamp, and then group the `movieId` values, the `rating` values and the timestamps by `userId`.

The output DataFrame will have one record for each `userId`, with three ordered lists (sorted by rating datetime): the movies they have rated, their ratings of these movies, and the timestamps of those ratings.

In [None]:
# Load the raw ratings table (columns userId, movieId, rating, timestamp).
# NOTE(review): the `/content/drive/MyDrive` prefix looks like a Colab-mounted Google Drive — confirm the drive is mounted before running.
ratings = pd.read_csv('/content/drive/MyDrive/Movie Recommendations with Movielens/rating.csv')

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Sort all ratings by time first, then group per user, so that each user's rows are
# visited in rating order by the aggregations that follow.
# NOTE(review): `timestamp` is not parsed as a datetime; the sample rows look like
# 'YYYY-MM-DD HH:MM:SS' strings, for which string order matches time order — confirm.
ratings_group = ratings.sort_values(by=["timestamp"]).groupby("userId")

In [None]:
# One row per user: the user's id plus three parallel lists (movies, ratings and
# timestamps) in the order the rows appear inside each group.
# The group keys and every per-column aggregation are taken from the same groupby
# object, so they are expected to line up row by row.
ratings_data = pd.DataFrame(
    {
        "userId": list(ratings_group.groups),
        **{
            list_column: ratings_group[source_column].apply(list).tolist()
            for list_column, source_column in (
                ("movieIds", "movieId"),
                ("ratings", "rating"),
                ("timestamps", "timestamp"),
            )
        },
    }
)

In [None]:
ratings_data.head()

Unnamed: 0,userId,movieIds,ratings,timestamps
0,1,"[924, 919, 2683, 1584, 1079, 653, 2959, 337, 1...","[3.5, 3.5, 3.5, 3.5, 4.0, 3.0, 4.0, 3.5, 3.0, ...","[2004-09-10 03:06:38, 2004-09-10 03:07:01, 200..."
1,2,"[469, 62, 1974, 1121, 2951, 3159, 1210, 1356, ...","[3.0, 5.0, 5.0, 3.0, 4.0, 3.0, 5.0, 5.0, 4.0, ...","[2000-11-21 15:29:58, 2000-11-21 15:29:58, 200..."
2,3,"[2118, 1721, 1188, 2011, 589, 2710, 2857, 2676...","[5.0, 4.0, 2.0, 3.0, 4.0, 5.0, 3.0, 1.0, 1.0, ...","[1999-12-11 07:25:08, 1999-12-11 07:25:08, 199..."
3,4,"[380, 165, 329, 10, 356, 480, 454, 589, 367, 1...","[3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 5.0, 4.0, 3.0, ...","[1996-08-24 09:27:05, 1996-08-24 09:27:32, 199..."
4,5,"[780, 736, 62, 141, 648, 17, 608, 1073, 708, 1...","[5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 2.0, 3.0, ...","[1996-12-25 15:15:35, 1996-12-25 15:15:35, 199..."


Now, let's split each `movieIds` list into a set of sequences of a fixed length. We do the same for the ratings and the timestamps. Set the `sequence_length` variable to change the length of the input sequence to the model. You can also change `step_size` to control the number of sequences generated for each user.

In [None]:
# Number of consecutive ratings in each window (window_size passed to create_sequences).
sequence_length = 4
# Offset between the starts of consecutive windows; a step smaller than the window
# length makes neighbouring windows overlap.
step_size = 2

def create_sequences(values, window_size, step_size):
    """Slice ``values`` into fixed-length windows taken every ``step_size`` items.

    Windows start at indices 0, ``step_size``, 2 * ``step_size``, ... for as long as a
    full window fits. If that stride leaves trailing items uncovered, one extra window
    holding the final ``window_size`` items is appended so the end of the sequence is
    always represented. Each window is returned exactly once: when the stride already
    lands on the final window, it is not appended a second time.

    Args:
        values: A sliceable sequence (e.g. a list) supporting ``len``.
        window_size: Length of every returned window; must be positive.
        step_size: Distance between consecutive window starts; must be positive.

    Returns:
        A list of slices of ``values``, each of length ``window_size``. Empty when
        ``values`` holds fewer than ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive (either would
            otherwise make the windowing loop never terminate).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive integers")

    # Start index of the window that ends exactly at the last item.
    last_start = len(values) - window_size
    if last_start < 0:
        return []

    window_starts = list(range(0, last_start + 1, step_size))
    # Cover the tail only if the regular stride did not already end on it.
    if window_starts[-1] != last_start:
        window_starts.append(last_start)

    return [values[start:start + window_size] for start in window_starts]


# Replace each user's flat list with its list of fixed-length windows, using the same
# window length and step for all three columns so the windows stay parallel.
# NOTE: the columns are overwritten in place, so running this cell a second time
# would window the already-windowed lists.
for list_column in ("movieIds", "ratings", "timestamps"):
    ratings_data[list_column] = ratings_data[list_column].apply(
        lambda values: create_sequences(values, sequence_length, step_size)
    )



In [None]:
ratings_data.head()

Unnamed: 0,userId,movieIds,ratings,timestamps
0,1,"[[924, 919, 2683, 1584], [2683, 1584, 1079, 65...","[[3.5, 3.5, 3.5, 3.5], [3.5, 3.5, 4.0, 3.0], [...","[[2004-09-10 03:06:38, 2004-09-10 03:07:01, 20..."
1,2,"[[469, 62, 1974, 1121], [1974, 1121, 2951, 315...","[[3.0, 5.0, 5.0, 3.0], [5.0, 3.0, 4.0, 3.0], [...","[[2000-11-21 15:29:58, 2000-11-21 15:29:58, 20..."
2,3,"[[2118, 1721, 1188, 2011], [1188, 2011, 589, 2...","[[5.0, 4.0, 2.0, 3.0], [2.0, 3.0, 4.0, 5.0], [...","[[1999-12-11 07:25:08, 1999-12-11 07:25:08, 19..."
3,4,"[[380, 165, 329, 10], [329, 10, 356, 480], [35...","[[3.0, 3.0, 3.0, 4.0], [3.0, 4.0, 4.0, 4.0], [...","[[1996-08-24 09:27:05, 1996-08-24 09:27:32, 19..."
4,5,"[[780, 736, 62, 141], [62, 141, 648, 17], [648...","[[5.0, 5.0, 5.0, 5.0], [5.0, 5.0, 3.0, 3.0], [...","[[1996-12-25 15:15:35, 1996-12-25 15:15:35, 19..."


After that, we process the output so that each sequence is a separate record in the DataFrame. In addition, we keep the timestamp of the last (target) movie of each sequence.

In [None]:
# Users with fewer than `sequence_length` ratings have an empty window list. `explode`
# turns an empty list into a NaN row, on which the `x[-1]` lookup and the joins below
# would raise TypeError, so those users are filtered out before exploding.
ratings_data_with_windows = ratings_data[ratings_data["movieIds"].map(len) > 0]

# One row per window. The three frames are exploded from the same rows with
# ignore_index=True, so they share the same 0..n-1 index and line up in the concat.
ratings_data_movies = ratings_data_with_windows[["userId", "movieIds"]].explode("movieIds", ignore_index=True)

ratings_data_rating = ratings_data_with_windows[["ratings"]].explode("ratings", ignore_index=True)

# Retrieve the timestamp of the target movie (the last item of each window).
ratings_data_timestamps = ratings_data_with_windows[["timestamps"]].explode("timestamps", ignore_index=True)
ratings_data_timestamp = ratings_data_timestamps["timestamps"].apply(lambda x: x[-1])

ratings_data_transformed = pd.concat(
    [ratings_data_movies, ratings_data_rating, ratings_data_timestamp], axis=1
).rename(columns={"movieIds": "sequence_movieIds", "ratings": "sequence_ratings"})

# Serialise each window as a comma-separated string so it fits in a single CSV field.
ratings_data_transformed["sequence_movieIds"] = ratings_data_transformed["sequence_movieIds"].apply(
    lambda x: ",".join(str(v) for v in x)
)
ratings_data_transformed["sequence_ratings"] = ratings_data_transformed["sequence_ratings"].apply(
    lambda x: ",".join(str(v) for v in x)
)

In [None]:
ratings_data_transformed.head()

Unnamed: 0,userId,sequence_movieIds,sequence_ratings,timestamps
0,1,92491926831584,"3.5,3.5,3.5,3.5",2004-09-10 03:07:36
1,1,268315841079653,"3.5,3.5,4.0,3.0",2004-09-10 03:08:11
2,1,10796532959337,"4.0,3.0,4.0,3.5",2004-09-10 03:08:29
3,1,295933713043996,"4.0,3.5,3.0,4.0",2004-09-10 03:08:47
4,1,13043996151112,"3.0,4.0,4.0,3.5",2004-09-10 03:09:00


With sequence_length of 4 and step_size of 2, we end up with 9,966,408 sequences.

Finally, we split the data into training and testing sets. To take the time factor into account, the last 7 sequences of each user (ordered by the timestamp of the target movie) form the testing set, so we can check whether the recommendation list hits the movies the user rated most recently. After the split, around 9.7% of the data is in the testing set.

In [None]:
#folder_path = "/content/drive/MyDrive/Movie Recommendations with Movielens/"

#ratings_data_transformed.to_csv(folder_path + 'ratings_data_transformed.csv', index=False, sep="|")

#ratings_data_transformed = pd.read_csv(folder_path + 'ratings_data_transformed.csv', sep = '|')

In [None]:
# Hold out, for every user, the 7 sequences whose target-movie timestamp sorts last;
# everything that was not held out becomes training data.
sequences_by_time = ratings_data_transformed.sort_values('timestamps')
test_data = sequences_by_time.groupby('userId').tail(7)
train_data = ratings_data_transformed.drop(index=test_data.index)

# Persist both splits. '|' is the field separator because the sequence columns
# themselves contain commas.
for split_frame, split_path in (
    (train_data, "/content/drive/MyDrive/Movie Recommendations with Movielens/ratings_seq_train_data.csv"),
    (test_data, "/content/drive/MyDrive/Movie Recommendations with Movielens/ratings_seq_test_data.csv"),
):
    split_frame.to_csv(split_path, sep="|", index=False)

# Define metadata

In [None]:
# Load only the movie ids from the movie catalogue.
# The ids are read as strings because they are later used as a StringLookup
# vocabulary and compared with ids split out of comma-joined text.
movies = pd.read_csv(
    '/content/drive/MyDrive/Movie Recommendations with Movielens/movie.csv',
    dtype = {'movieId': str},
    usecols = ['movieId'],
)


In [None]:
# Dataset with one movie-id string per element; it is batched and embedded later to
# form the candidate set of the top-K metric.
movies_ds = tf.data.Dataset.from_tensor_slices(movies.movieId)

In [None]:
# Vocabulary of every categorical feature the model looks up; only the movie id is
# used, with one entry per distinct id in the movie catalogue.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "movieId": movies["movieId"].unique().tolist(),
}

# Create tf.data.Dataset for training and evaluation

In [None]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a batched ``tf.data.Dataset`` of sequence features from a '|'-delimited CSV.

    The file is read once (a single epoch) with ``make_csv_dataset``. Every batch is then
    post-processed: the comma-joined ``sequence_movieIds`` and ``sequence_ratings``
    strings are split per position, the last position of each sequence is moved into
    ``target_movieId`` / ``target_rating``, and the remaining positions stay under the
    original keys. Movie ids are kept as strings; ratings are parsed as float32. All
    other CSV columns are passed through unchanged.

    Args:
        csv_file_path: Path forwarded to ``make_csv_dataset``.
        shuffle: Whether ``make_csv_dataset`` shuffles the records.
        batch_size: Number of records per batch.

    Returns:
        A dataset whose elements are dictionaries of batched features.
    """

    def split_sequences_and_targets(features):
        # Split "a,b,c,d" into one column per position; to_tensor() converts the ragged
        # split result into a dense 2-D tensor.
        movie_ids = tf.strings.split(features["sequence_movieIds"], ",").to_tensor()

        # The last movie id of every sequence is the retrieval target.
        features["target_movieId"] = movie_ids[:, -1]
        features["sequence_movieIds"] = movie_ids[:, :-1]

        rating_tokens = tf.strings.split(features["sequence_ratings"], ",")
        rating_values = tf.strings.to_number(rating_tokens, tf.dtypes.float32).to_tensor()

        # The last rating of every sequence belongs to the target movie.
        features["target_rating"] = rating_values[:, -1]
        features["sequence_ratings"] = rating_values[:, :-1]

        return features

    csv_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        field_delim="|",
        header=True,
        num_epochs=1,
        shuffle=shuffle,
    )

    return csv_batches.map(split_sequences_and_targets)

In [None]:
# Training split: records are shuffled as they are read.
train_dataset = get_dataset_from_csv(
    csv_file_path="/content/drive/MyDrive/Movie Recommendations with Movielens/ratings_seq_train_data.csv",
    shuffle=True,
    batch_size=265,
)

# Test split: read without shuffling (the function's default).
test_dataset = get_dataset_from_csv(
    csv_file_path="/content/drive/MyDrive/Movie Recommendations with Movielens/ratings_seq_test_data.csv",
    batch_size=265,
)

In [None]:
def _select_retrieval_features(batch):
    """Keep only the movie-history sequence and the target movie of a feature batch."""
    return {
        'sequence_movieIds': batch['sequence_movieIds'],
        'target_movieId': batch['target_movieId'],
    }


# Apply the same projection to both splits so they expose identical features.
train_dataset = train_dataset.map(_select_retrieval_features)
test_dataset = test_dataset.map(_select_retrieval_features)

# Implementing a sequential model
Here we are still going to use the two-tower architecture. Specifically, we use a query tower with a Gated Recurrent Unit (GRU) layer to encode the sequence of historical movies, and keep the same candidate tower for the candidate movie.

In [None]:
from tensorflow.keras import Sequential, layers, callbacks

In [None]:
embedding_dimension = 32

movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY['movieId']
# One embedding row per known movie plus one extra row, matching the `+ 1` the lookup
# needs for ids that are not in the vocabulary (no mask token is reserved).
embedding_rows = len(movie_vocabulary) + 1

# Query tower: movie-id strings -> integer indices -> embeddings -> GRU summary of
# the whole history sequence.
query_model = Sequential()
query_model.add(layers.StringLookup(vocabulary=movie_vocabulary, mask_token=None))
query_model.add(layers.Embedding(embedding_rows, embedding_dimension))
query_model.add(layers.GRU(embedding_dimension))

# Candidate tower: a single movie-id string -> integer index -> embedding, reshaped to
# a flat vector of `embedding_dimension` values per example.
candidate_model = Sequential()
candidate_model.add(layers.StringLookup(vocabulary=movie_vocabulary, mask_token=None))
candidate_model.add(layers.Embedding(embedding_rows, embedding_dimension))
candidate_model.add(layers.Reshape([embedding_dimension]))

In [None]:
# Sanity check: run one training batch through the query tower and show the shape of
# the resulting embeddings.
for query_embeddings in train_dataset.take(1).map(
    lambda batch: query_model(batch['sequence_movieIds'])
):
    print(query_embeddings.shape)

(265, 32)


In [None]:
# Sanity check: run one training batch of target movies through the candidate tower
# and show the shape of the resulting embeddings.
for candidate_embeddings_batch in train_dataset.take(1).map(
    lambda batch: candidate_model(batch['target_movieId'])
):
    print(candidate_embeddings_batch.shape)

(265, 32)


## Metrics and loss
We can use the `tfrs.metrics.FactorizedTopK` metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

The next component is the loss used to train our model. TFRS has several loss layers and tasks to make this easy.

In this instance, we'll make use of the Retrieval task object: a convenience wrapper that bundles together the loss function and metric computation:

In [None]:
# Embed every movie id with the candidate tower, 128 ids at a time; these embeddings
# are the candidate pool the top-K metric ranks against.
all_movie_embeddings = movies_ds.batch(128).map(candidate_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=all_movie_embeddings)

# The retrieval task bundles the loss with the metric defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

## The full model
We can now put it all together into a model. TFRS exposes a base model class (`tfrs.models.Model`) which streamlines building models: all we need to do is to set up the components in the `__init__` method, and implement the `compute_loss` method, taking in the raw features and returning a loss value.

In [None]:
class SequentialModel(tfrs.Model):
    """Two-tower retrieval model that pairs a history encoder with a movie encoder.

    The query tower embeds the ``sequence_movieIds`` feature and the candidate tower
    embeds the ``target_movieId`` feature; the loss comes from the retrieval task.
    """

    def __init__(self, query_model, candidate_model):
        """Store the two towers.

        Args:
            query_model: Model applied to ``features['sequence_movieIds']``.
            candidate_model: Model applied to ``features['target_movieId']``.
        """
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model

        # NOTE: `task` is not a constructor argument; it is the notebook-level task
        # object, which must already be defined when the model is instantiated.
        self._task = task

    def compute_loss(self, features, training = False):
        """Return the retrieval loss for one batch of features.

        Args:
            features: Mapping with ``sequence_movieIds`` and ``target_movieId`` entries.
            training: True while fitting; metrics are then skipped.

        Returns:
            The value returned by the retrieval task for this batch.
        """
        history_embedding = self._query_model(features['sequence_movieIds'])
        target_embedding = self._candidate_model(features['target_movieId'])

        # Metrics are only computed outside of training.
        evaluate_metrics = not training
        return self._task(
            history_embedding,
            target_embedding,
            compute_metrics=evaluate_metrics,
        )


# Fitting and evaluating

We can now compile, train and evaluate our sequential retrieval model.

In [None]:
# Wrap the two towers in the retrieval model and train it with Adagrad (learning rate 0.1).
model = SequentialModel(query_model, candidate_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
# create callbacks
# NOTE(review): these two callbacks are constructed but never passed to `model.fit`,
# so no checkpoint is written and no early stopping happens.
# NOTE(review): neither sets `monitor`, so presumably both would watch the validation
# loss and also need `validation_data` in `fit` — confirm against the Keras docs.
model_filepath = '/content/drive/MyDrive/Movie Recommendations with Movielens/models/sequential'
checkpoint = callbacks.ModelCheckpoint(model_filepath, save_best_only = True)
earlyStopping = callbacks.EarlyStopping(patience = 5, restore_best_weights = True)

In [None]:
# Regroup the elements into larger batches (2048 for training, 512 for evaluation) and
# cache the batched datasets so later passes can reuse them instead of re-reading the CSVs.
# NOTE(review): the cache sits after the CSV reader's shuffle, so presumably every epoch
# replays the same batch order — confirm whether a per-epoch shuffle is wanted.
cached_train = train_dataset.rebatch(2048).cache()
cached_test = test_dataset.rebatch(512).cache()

In [None]:
# Train for 5 epochs on the cached training batches with per-epoch progress output.
# No validation data and no callbacks are passed to this call.
model.fit(cached_train, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9240a90df0>

In [None]:
# Evaluate on the held-out sequences; return_dict=True returns the top-K accuracies
# and the loss values keyed by name.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.013475668616592884,
 'factorized_top_k/top_5_categorical_accuracy': 0.06916285306215286,
 'factorized_top_k/top_10_categorical_accuracy': 0.11793375760316849,
 'factorized_top_k/top_50_categorical_accuracy': 0.28825902938842773,
 'factorized_top_k/top_100_categorical_accuracy': 0.37468421459198,
 'loss': 1318.538818359375,
 'regularization_loss': 0,
 'total_loss': 1318.538818359375}

# Reference
[Recommending movies: retrieval using a sequential model](https://www.tensorflow.org/recommenders/examples/sequential_retrieval)

[A Transformer-based recommendation system](https://keras.io/examples/structured_data/movielens_recommendations_transformers/)