In [2]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import pandas as pd
import seaborn as sns
import surprise
import tensorflow as tf
import tensorflow_recommenders as tfrs

import tensorflow_datasets as tfds

# Load Data

In [3]:
# The ratings data only mentions users who have watched and rated a movie,
# whereas the movies data lists every movie id.
ratings_raw = tfds.load(name="movie_lens/100k-ratings", split="train")
# Features of every available movie.
movies_raw = tfds.load(name="movie_lens/100k-movies", split="train")



In [6]:
# Count the rows per (movie_id, user_id) pair, largest count first: the top
# count is 1, so every pair appears exactly once.
ratings_df = tfds.as_dataframe(ratings_raw)
(
    ratings_df
    .groupby(["movie_id", "user_id"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,movie_id,user_id,count
0,b'1',b'1',1
66650,b'513',b'201',1
66672,b'513',b'406',1
66671,b'513',b'405',1
66670,b'513',b'397',1
...,...,...,...
33331,b'24',b'717',1
33330,b'24',b'715',1
33329,b'24',b'706',1
33328,b'24',b'70',1


# Retrieval

In [37]:
# Show the name, shape and dtype of every feature in a raw ratings example.
ratings_raw.element_spec

{'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None),
 'movie_genres': TensorSpec(shape=(None,), dtype=tf.int64, name=None),
 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None),
 'raw_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None),
 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None),
 'user_gender': TensorSpec(shape=(), dtype=tf.bool, name=None),
 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'user_occupation_label': TensorSpec(shape=(), dtype=tf.int64, name=None),
 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None),
 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None),
 'user_zip_code': TensorSpec(shape=(), dtype=tf.string, name=None)}

In [2]:

# Keep only the two features the retrieval model consumes.
ratings = ratings_raw.map(
    lambda row: {
        "movie_title": row["movie_title"],
        "user_id": row["user_id"],
    }
)
movies = movies_raw.map(lambda row: row["movie_title"])

# Seeded shuffle that is not re-drawn per iteration, so the 80k/20k
# train/test split below stays fixed.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000, seed=42, reshuffle_each_iteration=False
)

train = shuffled.take(count=80_000)
test = shuffled.skip(count=80_000).take(count=20_000)

# Batched views used only to collect the lookup vocabularies.
movie_titles = movies.batch(batch_size=1_000)
user_ids = ratings.batch(batch_size=1_000_000).map(lambda batch: batch["user_id"])

unique_movie_titles = np.unique(
    np.concatenate([title_batch for title_batch in movie_titles])
)
unique_user_ids = np.unique(
    np.concatenate([id_batch for id_batch in user_ids])
)

print("number of unique titles: ", len(unique_movie_titles))
print("number of unique users: ", len(unique_user_ids))

# Peek at the first 10 ids (np.unique returns them in sorted order).
unique_user_ids[:10]



number of unique titles:  1664
number of unique users:  943


array([b'1', b'10', b'100', b'101', b'102', b'103', b'104', b'105',
       b'106', b'107'], dtype=object)

In [5]:
# User vocabulary: ids of users that appear in at least one rating.
# Movie vocabulary: titles taken from the full movie catalogue.

embedding_dimension = 32

# Query tower: user id string -> integer index -> embedding vector.
user_model = tf.keras.Sequential(
    layers=[
        tf.keras.layers.StringLookup(mask_token=None, vocabulary=unique_user_ids),
        # One extra embedding row accounts for unknown tokens.
        tf.keras.layers.Embedding(
            input_dim=len(unique_user_ids) + 1,
            output_dim=embedding_dimension,
        ),
    ]
)

# Candidate tower: movie title string -> integer index -> embedding vector.
movie_model = tf.keras.Sequential(
    layers=[
        tf.keras.layers.StringLookup(
            mask_token=None, vocabulary=unique_movie_titles
        ),
        tf.keras.layers.Embedding(
            input_dim=len(unique_movie_titles) + 1,
            output_dim=embedding_dimension,
        ),
    ]
)

# Top-K metrics are computed against the embeddings of all candidate movies.
metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.batch(batch_size=128).map(movie_model)
)

task = tfrs.tasks.Retrieval(metrics=metrics)

In [12]:
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a movie tower.

    Args:
        user_model: Keras model applied to ``features["user_id"]`` to obtain
            user embeddings.
        movie_model: Keras model applied to ``features["movie_title"]`` to
            obtain movie embeddings.
        retrieval_task: Optional task layer that turns the two embeddings
            into a loss (and metrics). When omitted, the notebook-level
            ``task`` object is used, which keeps existing two-argument
            construction working unchanged.
    """

    def __init__(self, user_model, movie_model, retrieval_task=None):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # Prefer an explicitly injected task so the class does not have to
        # depend on hidden notebook state; fall back to the global otherwise.
        self.task: tf.keras.layers.Layer = (
            retrieval_task if retrieval_task is not None else task
        )

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        """Embed the users and movies of a batch and return the task loss.

        Args:
            features: Batch dict with ``"user_id"`` and ``"movie_title"``.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value produced by ``self.task`` for the two embeddings.
        """
        # Debugging aid: shows the batch structure handed to compute_loss.
        print('features: ', features)
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_movie_embeddings = self.movie_model(features["movie_title"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_movie_embeddings)

In [13]:
# Assemble the retrieval model from the two towers and attach the optimizer.
model = MovielensModel(user_model=user_model, movie_model=movie_model)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [14]:
# Shuffle, batch and cache the training split; the test split is only
# batched and cached.
# NOTE(review): cache() follows shuffle() here — confirm that reusing the
# same shuffled order on every epoch is intended.
cached_train = (
    train
    .shuffle(buffer_size=100_000)
    .batch(batch_size=8192)
    .cache()
)
cached_test = test.batch(batch_size=4096).cache()


In [15]:
# fit() is what ends up calling the model's compute_loss on the batches.
model.fit(x=cached_train, epochs=3)


Epoch 1/3
features:  {'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>}
features:  {'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>}
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa1b43b0fa0>

In [11]:
# Report loss and metrics on the held-out split as a name -> value dict.
model.evaluate(x=cached_test, return_dict=True)




{'factorized_top_k/top_1_categorical_accuracy': 0.0010499999625608325,
 'factorized_top_k/top_5_categorical_accuracy': 0.009549999609589577,
 'factorized_top_k/top_10_categorical_accuracy': 0.02239999920129776,
 'factorized_top_k/top_50_categorical_accuracy': 0.1246500015258789,
 'factorized_top_k/top_100_categorical_accuracy': 0.23250000178813934,
 'loss': 28244.7734375,
 'regularization_loss': 0,
 'total_loss': 28244.7734375}

# Ranking

In [18]:
# Ranking additionally needs the explicit rating as the label, so keep
# user_rating next to the two id features.
ratings = ratings_raw.map(
    lambda row: {
        "movie_title": row["movie_title"],
        "user_id": row["user_id"],
        "user_rating": row["user_rating"],
    }
)

In [19]:
# Same seeded, non-reshuffling shuffle as before, now over the rating
# triples, followed by an 80k/20k train/test split.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000, seed=42, reshuffle_each_iteration=False
)
print(shuffled)
train = shuffled.take(count=80_000)
test = shuffled.skip(count=80_000).take(count=20_000)

<ShuffleDataset element_spec={'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>


In [20]:
# Show the name, shape and dtype of each feature kept for ranking.
shuffled.element_spec

{'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None),
 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}

In [21]:
# Collect both vocabularies from the ratings themselves.
movie_titles = ratings.batch(batch_size=1_000_000).map(
    lambda batch: batch["movie_title"]
)
user_ids = ratings.batch(batch_size=1_000_000).map(
    lambda batch: batch["user_id"]
)

unique_movie_titles = np.unique(
    np.concatenate([title_batch for title_batch in movie_titles])
)
unique_user_ids = np.unique(
    np.concatenate([id_batch for id_batch in user_ids])
)

In [25]:
class RankingModel(tf.keras.Model):
    """Scores (user id, movie title) pairs with a single output value.

    Each id is looked up in its vocabulary and embedded; the two embeddings
    are concatenated and fed through a dense stack ending in one unit.
    The vocabularies are read from the notebook-level ``unique_user_ids``
    and ``unique_movie_titles``.
    """

    def __init__(self):
        super().__init__()
        embedding_size = 32

        # Embedding tower for users.
        self.user_embeddings = self._embedding_tower(
            unique_user_ids, embedding_size
        )

        # Embedding tower for movies.
        self.movie_embeddings = self._embedding_tower(
            unique_movie_titles, embedding_size
        )

        # Prediction head applied to the concatenated embeddings.
        self.ratings = tf.keras.Sequential(
            layers=[
                # Two hidden dense layers.
                tf.keras.layers.Dense(units=256, activation="relu"),
                tf.keras.layers.Dense(units=64, activation="relu"),
                # The final layer emits the rating prediction.
                tf.keras.layers.Dense(units=1),
            ]
        )

    @staticmethod
    def _embedding_tower(vocabulary, output_dim):
        """Build a string-lookup + embedding stack for ``vocabulary``."""
        return tf.keras.Sequential(
            layers=[
                tf.keras.layers.StringLookup(
                    mask_token=None, vocabulary=vocabulary
                ),
                # len(vocabulary) + 1 rows, matching the lookup vocabulary
                # plus one extra index.
                tf.keras.layers.Embedding(
                    input_dim=len(vocabulary) + 1, output_dim=output_dim
                ),
            ]
        )

    def call(self, inputs):
        """Return predictions for ``inputs``, a ``(user_id, movie_title)`` pair."""
        # Debugging aid: shows what the model was called with.
        print('inputs: ', inputs)

        user_id, movie_title = inputs

        paired_embeddings = tf.concat(
            [self.user_embeddings(user_id), self.movie_embeddings(movie_title)],
            axis=1,
        )
        return self.ratings(paired_embeddings)

In [26]:
# Smoke test: call a freshly built (untrained) model on one user/movie pair.
RankingModel()((["42"], ["One Flew Over the Cuckoo's Nest (1975)"]))

inputs:  (['42'], ["One Flew Over the Cuckoo's Nest (1975)"])








<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.0221471]], dtype=float32)>

In [27]:
# Ranking objective: mean squared error as the loss, with root mean squared
# error tracked as a metric.
TASK = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [31]:
class MovielensModel(tfrs.models.Model):
    """Ranking model: wraps ``RankingModel`` and the notebook-level ``TASK``.

    Note: this class reuses the name of the retrieval model defined earlier
    in the notebook and therefore replaces it once this cell has run.
    """

    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        self.task: tf.keras.layers.Layer = TASK

    def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
        """Return rating predictions for a batch.

        Args:
            features: Batch dict with ``"user_id"`` and ``"movie_title"``.
        """
        # Debugging aid: shows the features that reach call().
        print('inside call features: ', features)
        return self.ranking_model((features["user_id"], features["movie_title"]))

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        """Compute the task loss for a batch without modifying ``features``.

        Args:
            features: Batch dict with ``"user_id"``, ``"movie_title"`` and
                the label under ``"user_rating"``.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value produced by ``self.task`` for labels and predictions.
        """
        # Debugging aid: shows the full batch, label included.
        print('inside loss: ',features)
        labels = features["user_rating"]

        # Hand the model a copy without the label instead of pop()-ing it:
        # the caller's dict stays intact, so the same batch can be passed
        # to compute_loss again without a KeyError.
        model_inputs = {
            key: value for key, value in features.items() if key != "user_rating"
        }
        rating_predictions = self(model_inputs)

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)

In [32]:
# Instantiate the ranking model and attach the optimizer.
model = MovielensModel()
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [36]:
# Shuffle, batch and cache the ranking training split; batch and cache the
# test split; then show the batched feature spec.
cached_train = (
    train
    .shuffle(buffer_size=100_000)
    .batch(batch_size=8192)
    .cache()
)
cached_test = test.batch(batch_size=4096).cache()
print('cached:',cached_train.element_spec)

cached: {'movie_title': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}


In [34]:
# Train the ranking model for five epochs on the cached batches.
model.fit(x=cached_train, epochs=5)

Epoch 1/5
inside loss:  {'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'user_rating': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=float32>}
inside call features:  {'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>}
inputs:  (<tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>)
inside loss:  {'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'user_rating': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=float32>}
inside call features:  {'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>}
inputs:  (<tf.Tensor 'IteratorGetNext:1' shape

<keras.callbacks.History at 0x7fa15a2de580>