# Imports

In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd

In [2]:
# Ratings data.
ratings = tfds.load("movielens/100k-ratings", split="train")
# Features of all the available movies.
movies = tfds.load("movielens/100k-movies", split="train")



In [3]:
ratings

<PrefetchDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'raw_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_gender': TensorSpec(shape=(), dtype=tf.bool, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None), 'user_zip_code': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [4]:
len(ratings)

100000

In [5]:
len(movies)

1682

# Data as a pandas dataframe

In [6]:
ratings_df = tfds.as_dataframe(ratings)
ratings_df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,[7],b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4,b'doctor',4.0,b'53211'
1,25.0,"[4, 14]",b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5,b'entertainment',2.0,b'80525'
2,18.0,[4],b'412',"b'Very Brady Sequel, A (1996)'",24.0,882075110,True,b'301',17,b'student',4.0,b'55439'
3,50.0,"[5, 7]",b'56',b'Pulp Fiction (1994)',50.0,883326919,True,b'60',4,b'healthcare',4.0,b'06472'
4,50.0,"[10, 16]",b'895',b'Scream 2 (1997)',55.0,891409199,True,b'197',18,b'technician',3.0,b'75094'


In [7]:
ratings_df.shape

(100000, 12)

In [8]:
# Did someone watch the movie more than twice?
len(ratings_df[['movie_id', 'user_id']].drop_duplicates())

100000

In [9]:
ratings_df.user_id.nunique()

943

In [10]:
ratings_df.movie_title.nunique()

1664

In [11]:
movie_df = tfds.as_dataframe(movies)
movie_df.head()

Unnamed: 0,movie_genres,movie_id,movie_title
0,[4],b'1681',b'You So Crazy (1994)'
1,"[4, 7]",b'1457',b'Love Is All There Is (1996)'
2,"[1, 3]",b'500',b'Fly Away Home (1996)'
3,[0],b'838',b'In the Line of Duty 2 (1987)'
4,[7],b'1648',"b'Niagara, Niagara (1997)'"


In [12]:
movie_df.shape

(1682, 3)

In [13]:
movie_df.movie_title.nunique()

1664

In [14]:
# Checking the intersection of the movies in the rating data and movie data
len(set(ratings_df.movie_title.unique()).intersection(movie_df.movie_title.unique()))
# This means all the movies in the rating data are also in the movies data

1664

# Data as tensor dataset

In [15]:
for i in ratings.take(1).as_numpy_iterator():
    pprint.pprint(i)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7], dtype=int64),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [16]:
for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'movie_genres': array([4], dtype=int64),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [17]:
# There are so many feature, we are taking just the ids and title for this model
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
})

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [18]:
print(tfds.as_dataframe(ratings.take(5)))

                                 movie_title user_id
0  b"One Flew Over the Cuckoo's Nest (1975)"  b'138'
1                b'Strictly Ballroom (1992)'   b'92'
2             b'Very Brady Sequel, A (1996)'  b'301'
3                     b'Pulp Fiction (1994)'   b'60'
4                         b'Scream 2 (1997)'  b'197'


In [19]:
for i in ratings.take(5).as_numpy_iterator():
    print(i)

{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'user_id': b'138'}
{'movie_title': b'Strictly Ballroom (1992)', 'user_id': b'92'}
{'movie_title': b'Very Brady Sequel, A (1996)', 'user_id': b'301'}
{'movie_title': b'Pulp Fiction (1994)', 'user_id': b'60'}
{'movie_title': b'Scream 2 (1997)', 'user_id': b'197'}


In [20]:
# There are so many features in the movie data, we are taking just the movie title
movies = movies.map(lambda x: x["movie_title"])

In [21]:
print(tfds.as_dataframe(movies.take(5)))

                                  
0           b'You So Crazy (1994)'
1   b'Love Is All There Is (1996)'
2          b'Fly Away Home (1996)'
3  b'In the Line of Duty 2 (1987)'
4       b'Niagara, Niagara (1997)'


In [22]:
for i in movies.take(5).as_numpy_iterator():
    print(i)

b'You So Crazy (1994)'
b'Love Is All There Is (1996)'
b'Fly Away Home (1996)'
b'In the Line of Duty 2 (1987)'
b'Niagara, Niagara (1997)'


# Train test split

In [23]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [24]:
for i in train.take(10).as_numpy_iterator():
    print(i)

{'movie_title': b'Postman, The (1997)', 'user_id': b'681'}
{'movie_title': b'Clueless (1995)', 'user_id': b'442'}
{'movie_title': b'Maltese Falcon, The (1941)', 'user_id': b'932'}
{'movie_title': b'His Girl Friday (1940)', 'user_id': b'506'}
{'movie_title': b'Quiz Show (1994)', 'user_id': b'18'}
{'movie_title': b"Carlito's Way (1993)", 'user_id': b'551'}
{'movie_title': b'Primal Fear (1996)', 'user_id': b'12'}
{'movie_title': b'Aladdin (1992)', 'user_id': b'901'}
{'movie_title': b'Glengarry Glen Ross (1992)', 'user_id': b'835'}
{'movie_title': b'Titanic (1997)', 'user_id': b'284'}


In [25]:
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

In [26]:
for i in movie_titles.as_numpy_iterator():
    print(i.shape)

(1000,)
(682,)


In [27]:
for i in user_ids.as_numpy_iterator():
    print(i.shape)

(100000,)


In [28]:
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [29]:
unique_movie_titles[:10]

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

In [30]:
unique_user_ids[:10]

array([b'1', b'10', b'100', b'101', b'102', b'103', b'104', b'105',
       b'106', b'107'], dtype=object)

# Model

In [31]:
embedding_dimension = 32
user_model = tf.keras.Sequential([
    # The string lookup will create an integer mapping and the embedding layer
    # creates the embedding for the mapped values
  tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])
movie_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])
metrics = tfrs.metrics.FactorizedTopK(candidates=movies.batch(128).map(movie_model))
task = tfrs.tasks.Retrieval(metrics=metrics)

In [32]:
unique_user_ids[:2]


array([b'1', b'10'], dtype=object)

In [39]:
user_model

<keras.engine.sequential.Sequential at 0x16ca8a5faf0>

In [33]:
user_model([b'1', b'10'])





<tf.Tensor: shape=(2, 32), dtype=float32, numpy=
array([[ 0.00379433,  0.00056654,  0.00664406, -0.03980237,  0.0390699 ,
        -0.02822758,  0.03486596,  0.00069901,  0.00497658, -0.03264948,
         0.02877188,  0.02793241,  0.01245091,  0.00208951, -0.01581891,
         0.01734592,  0.03905628,  0.01192083, -0.02066573, -0.00754   ,
         0.01801969,  0.04346007,  0.0169647 ,  0.01606007,  0.01982814,
        -0.04007641,  0.045028  , -0.049063  ,  0.03786261,  0.04120934,
         0.01567051, -0.02888723],
       [ 0.03562683, -0.029091  ,  0.03816745,  0.02418743, -0.00315785,
         0.00137063,  0.03066394, -0.00517576, -0.0085967 , -0.007099  ,
        -0.04975113,  0.03433314, -0.04863119, -0.03806108, -0.01709783,
         0.00340132,  0.02570032,  0.0201246 ,  0.02533622,  0.00705235,
         0.01892534,  0.00901524, -0.0156159 ,  0.02489221, -0.00033083,
        -0.01286156, -0.0441285 , -0.00646639, -0.01215779,  0.02597013,
         0.01914329, -0.00487567]], dtyp

In [42]:
unique_movie_titles[8:10]

array([b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

In [45]:
movie_model(np.array([b'3 Ninjas: High Noon At Mega Mountain (1998)', b'39 Steps, The (1935)']))

<tf.Tensor: shape=(2, 32), dtype=float32, numpy=
array([[ 0.01385022, -0.03299248, -0.04028846,  0.02150265, -0.01857678,
         0.00690241,  0.00418769,  0.01069348,  0.01680839,  0.03166931,
         0.02299112,  0.04204333,  0.02162323,  0.01234107,  0.01231024,
         0.04155768, -0.00274254, -0.00430496,  0.00604724,  0.02175163,
        -0.02467217, -0.01149321, -0.02650201,  0.00302569, -0.03158353,
         0.04523635, -0.01143268,  0.03836718, -0.03051233,  0.00969934,
         0.03019334,  0.02059979],
       [-0.02631067,  0.02493635,  0.00367935, -0.00511577,  0.03692731,
        -0.03604013, -0.00451303,  0.03340856,  0.01659877, -0.01881517,
        -0.0214715 ,  0.04189299,  0.03730367,  0.02300391, -0.02409807,
        -0.01853317, -0.03263661, -0.03852485, -0.02925391, -0.00732665,
         0.03955239,  0.01303552, -0.03172016,  0.04908109, -0.0495234 ,
        -0.03134354, -0.04181329,  0.00167622,  0.02357925,  0.04646683,
         0.03159193, -0.02466836]], dtyp

In [46]:
for i in movies.batch(128).map(movie_model).take(1):
    print(i)

tf.Tensor(
[[ 0.00441785  0.01647634 -0.04737489 ... -0.01886891  0.00674813
  -0.04528711]
 [ 0.00918128  0.02095627 -0.02590123 ... -0.04480494  0.03908117
   0.00560932]
 [ 0.0310288  -0.00725682 -0.01423266 ... -0.01554262  0.03286183
   0.01596502]
 ...
 [ 0.01025158  0.04506296 -0.03001736 ... -0.00074236  0.01737906
  -0.00986023]
 [-0.01743602  0.02750853 -0.02143759 ...  0.03770575  0.04725311
  -0.02091433]
 [ 0.03956307 -0.00539433 -0.03634753 ... -0.04950354  0.01510545
  -0.02846858]], shape=(128, 32), dtype=float32)


In [47]:
for i in movies.batch(128).map(movie_model):
    print(i.shape)

(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(128, 32)
(18, 32)


In [48]:
class model(tf.keras.Model):
  def __init__(self, user_model, movie_model):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    # Set up a gradient tape to record gradients.
    with tf.GradientTape() as tape:

      # Loss computation.
      user_embeddings = self.user_model(features["user_id"])
      positive_movie_embeddings = self.movie_model(features["movie_title"])
      loss = self.task(
        user_embeddings, 
        positive_movie_embeddings, 
        # compute_metrics=False  You can set it to False to speed up the training
      )

      # Handle regularization losses as well.
      regularization_loss = sum(self.losses)

      total_loss = loss + regularization_loss

    gradients = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss

    return metrics
  
  def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

    # Loss computation.
    user_embeddings = self.user_model(features["user_id"])
    positive_movie_embeddings = self.movie_model(features["movie_title"])
    loss = self.task(user_embeddings, positive_movie_embeddings)

    # Handle regularization losses as well.
    regularization_loss = sum(self.losses)

    total_loss = loss + regularization_loss

    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss

    return metrics

In [49]:
model = model(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [50]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [51]:
for i in cached_train:
    print(i["user_id"].shape, i["movie_title"].shape)

(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(8192,) (8192,)
(6272,) (6272,)


In [52]:
for i in cached_test:
    print(i["user_id"].shape, i["movie_title"].shape)

(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(3616,) (3616,)


In [None]:
model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2be151244c0>

In [None]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0003499999875202775,
 'factorized_top_k/top_5_categorical_accuracy': 0.0035000001080334187,
 'factorized_top_k/top_10_categorical_accuracy': 0.008949999697506428,
 'factorized_top_k/top_50_categorical_accuracy': 0.08389999717473984,
 'factorized_top_k/top_100_categorical_accuracy': 0.18404999375343323,
 'loss': 28767.9921875,
 'regularization_loss': 0,
 'total_loss': 28767.9921875}

# Indexing

In [None]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

In [None]:
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x25cdbd5c730>

In [None]:
index(tf.constant(["42"]))

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[3.271916 , 3.0733037, 2.8836622, 2.813118 , 2.8008885, 2.7494378,
         2.7240582, 2.6685703, 2.6594362, 2.6328905]], dtype=float32)>,
 <tf.Tensor: shape=(1, 10), dtype=string, numpy=
 array([[b"Kid in King Arthur's Court, A (1995)", b'Rent-a-Kid (1995)',
         b'Bridges of Madison County, The (1995)',
         b'Little Big League (1994)', b'Affair to Remember, An (1957)',
         b'Old Yeller (1957)', b"Preacher's Wife, The (1996)",
         b'Unforgettable (1996)', b'Forget Paris (1995)',
         b'Miracle on 34th Street (1994)']], dtype=object)>)

In [None]:
for i in train.take(1).as_numpy_iterator():
    print(i)

{'movie_title': b'Postman, The (1997)', 'user_id': b'681'}


In [None]:
for i in train.take(1).as_numpy_iterator():
    print((i['user_id'].decode('UTF-8')))

<class 'str'>
