# MovieLens example
This example is based on the TFRS movie retrieval example you can find here: https://www.tensorflow.org/recommenders/examples/basic_retrieval.


In [1]:
# Reload imported modules automatically before each cell execution, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import pprint

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from tensorflow.keras.layers import Dense
from tensorflow.keras import Model

from tf_tabular.builder import InputBuilder
from tf_tabular.utils import get_vocab
from .movielens_model import MovielensModel

In [6]:
# Load the "train" split of the ratings data and of the features of all the
# available movies (in that order).
ratings, movies = (
    tfds.load(dataset_name, split="train")
    for dataset_name in ("movielens/100k-ratings", "movielens/100k-movies")
)

## View dataset examples

In [None]:
# Pretty-print a single raw ratings example to inspect the available features.
for rating_example in ratings.take(1).as_numpy_iterator():
    pprint.pprint(rating_example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


2024-04-22 11:13:21.455280: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [None]:
# Pretty-print a single raw movies example to inspect the available features.
for movie_example in movies.take(1).as_numpy_iterator():
    pprint.pprint(movie_example)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


2024-04-22 11:13:21.561915: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Preprocessing

In [7]:
def compute_sampling_probability(all_titles):
    """Compute, for every entry of ``all_titles``, the relative frequency of its title.

    A title's probability is the number of times it occurs in ``all_titles``
    divided by the total number of entries.

    Args:
        all_titles: 1-D array-like of movie titles, one entry per rating.

    Returns:
        A tuple ``(probs, unique_movie_titles)``:
          * ``probs`` is a float32 array aligned with ``all_titles``; position
            ``i`` holds the relative frequency of ``all_titles[i]``.
          * ``unique_movie_titles`` is the sorted array of distinct titles.
    """
    # `inverse[i]` is the index of `all_titles[i]` inside `unique_movie_titles`,
    # so the per-entry probabilities are one vectorised gather instead of a
    # Python-level dict lookup per rating.
    unique_movie_titles, inverse, movie_counts = np.unique(
        all_titles, return_inverse=True, return_counts=True
    )
    normalized_counts = movie_counts / np.sum(movie_counts)
    probs = normalized_counts[inverse].astype(np.float32)
    return probs, unique_movie_titles

def preprocess_ratings(ratings):
    """Trim the ratings dataset to the model features and attach sampling probabilities.

    Args:
        ratings: ``tf.data.Dataset`` of rating examples containing at least the
            ``"movie_title"``, ``"movie_genres"`` and ``"user_id"`` keys.

    Returns:
        A tuple ``(ratings, unique_user_ids, unique_movie_titles)``:
          * ``ratings`` keeps only the three features above plus a
            ``"sampling_prob"`` entry (relative frequency of the example's title).
          * ``unique_user_ids`` is the sorted array of distinct user ids.
          * ``unique_movie_titles`` is the sorted array of distinct movie titles.
    """
    def _select_features(example):
        # Keep only the features used downstream.
        return {
            "movie_title": example["movie_title"],
            "movie_genres": example["movie_genres"],
            "user_id": example["user_id"],
        }

    ratings = ratings.map(_select_features)

    # Materialise the ids / titles in large batches to build the vocabularies.
    user_id_batches = ratings.map(lambda example: example["user_id"]).batch(10_000)
    unique_user_ids = np.unique(np.concatenate(list(user_id_batches)))

    title_batches = ratings.map(lambda example: example["movie_title"]).batch(10_000)
    all_titles = np.concatenate(list(title_batches))

    probs, unique_movie_titles = compute_sampling_probability(all_titles)

    # `probs` is aligned with the iteration order of `ratings`, so zipping the
    # two datasets pairs every example with its own probability.
    probs_ds = tf.data.Dataset.from_tensor_slices(probs)
    ratings_with_probs = tf.data.Dataset.zip(ratings, probs_ds).map(
        lambda features, prob: {**features, "sampling_prob": prob}
    )
    return ratings_with_probs, unique_user_ids, unique_movie_titles


def preprocess_movies(movies):
    """Trim the movies dataset to the model features and collect the genre vocabulary.

    Args:
        movies: ``tf.data.Dataset`` of movie examples containing at least the
            ``"movie_title"`` and ``"movie_genres"`` keys.

    Returns:
        A tuple ``(movies, unique_movie_genres)`` where ``movies`` keeps only the
        two features above and ``unique_movie_genres`` is the sorted array of
        distinct genre ids found across all movies.
    """
    def _select_features(example):
        return {
            "movie_title": example["movie_title"],
            "movie_genres": example["movie_genres"],
        }

    movies = movies.map(_select_features)

    # Each element is one movie's genre array; flatten them all before deduplicating.
    genre_arrays = list(movies.map(lambda example: example["movie_genres"]))
    unique_movie_genres = np.unique(np.concatenate(genre_arrays))
    return movies, unique_movie_genres


In [9]:
# Trim both datasets to the features the model uses and collect the vocabularies.
# Note that `ratings` and `movies` are rebound to their preprocessed versions here;
# the preprocessed ratings also carry a per-example "sampling_prob".
ratings, unique_user_ids, unique_movie_titles = preprocess_ratings(ratings)
movies, unique_movie_genres = preprocess_movies(movies)

### Shuffle and split dataset

In [11]:
SEED = 42
SHUFFLE_BUFFER = 100_000
NUM_TRAIN = 80_000
NUM_TEST = 20_000

tf.random.set_seed(SEED)
# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the order
# stable across iterations so the take/skip split below stays disjoint.
shuffled = ratings.shuffle(SHUFFLE_BUFFER, seed=SEED, reshuffle_each_iteration=False)

# First NUM_TRAIN examples for training, the following NUM_TEST for testing.
train = shuffled.take(NUM_TRAIN)
test = shuffled.skip(NUM_TRAIN).take(NUM_TEST)


## Build model using tf_tabular

In [13]:
# Vocabulary for each categorical movie feature, as computed during preprocessing.
vocabs = dict(
    movie_title=unique_movie_titles,
    movie_genres=unique_movie_genres,
)

# Embedding size for each categorical movie feature.
embedding_dims = dict(
    movie_title=32,
    movie_genres=32,
)

In [None]:
def build_model():
    """Build the movie tower with tf_tabular.

    Registers "movie_title" and "movie_genres" as categorical inputs (the latter
    also listed under ``multi_hots`` — presumably because a movie can have
    several genres; confirm against tf_tabular), using the module-level
    ``vocabs`` and ``embedding_dims``. The combined output of the input layers is
    passed through a linear ``Dense(32)`` layer.

    Returns:
        A Keras ``Model`` mapping the movie feature inputs to a 32-dimensional
        output.
    """
    categorical_features = ["movie_title", "movie_genres"]

    builder = InputBuilder()
    builder.add_inputs_list(
        categoricals=categorical_features,
        vocabs=vocabs,
        multi_hots=["movie_genres"],
        embedding_dims=embedding_dims,
    )
    inputs, combined = builder.build_input_layers()

    # No activation: a plain linear projection to 32 dimensions.
    projected = Dense(32, activation=None)(combined)
    return Model(inputs=inputs, outputs=projected)


movie_model = build_model()

We build a simple user model: a string lookup over the user ids followed by an embedding layer.

In [14]:
# User tower: map raw user ids to integer indices, then to 32-dimensional embeddings.
user_model = tf.keras.Sequential()
user_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
)
# One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
user_model.add(tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32))


Now combine both towers into the two-tower `MovielensModel`:

In [15]:
# Wrap the user tower and the movie tower in MovielensModel (imported from the
# movielens_model module).
model = MovielensModel(user_model, movie_model)
# prepare_task receives the preprocessed movies dataset — presumably the candidate
# set for the retrieval task/metrics; confirm in movielens_model.
model.prepare_task(movies)
# Optimise with Adam at a learning rate of 0.003.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.003))




Batch and cache the training and test data:

In [16]:
# Shuffle, batch and cache the training data. ragged_batch is used instead of a
# plain batch — presumably because "movie_genres" has a variable length per example.
# Note: cache() comes after shuffle(), so the batches produced on the first pass
# are the ones replayed on later passes.
cached_train = (
    train
    .shuffle(100_000)
    .ragged_batch(8192)
    .cache()
)

# The test data is batched and cached without shuffling.
cached_test = (
    test
    .ragged_batch(4096)
    .cache()
)


In [18]:
# Train for 10 epochs over the cached training batches.
model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x295cc0070>

### Evaluation

In [19]:
# Evaluate on the cached test batches; return_dict=True returns the metrics as a
# dict keyed by metric name.
model.evaluate(cached_test, return_dict=True)



{'auc': 0.5012891292572021,
 'recall': 0.005200000014156103,
 'factk/top_1_categorical_accuracy': 0.007350000087171793,
 'factk/top_5_categorical_accuracy': 0.030649999156594276,
 'factk/top_100_categorical_accuracy': 0.34375,
 'loss': 8.058255195617676,
 'regularization_loss': 0,
 'total_loss': 8.058255195617676}