<a href="https://colab.research.google.com/github/vis7/recommendation_system/blob/main/tensorflow_recommender_tutorial/movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# installation
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 85 kB 2.4 MB/s 
[K     |████████████████████████████████| 4.0 MB 4.2 MB/s 
[?25h

In [2]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

Below we load two different parts of the same dataset (ratings and movies), each together with its own info object, so that we can inspect the metadata of either part.

In [3]:
# loading data: the ratings part and the movies part of MovieLens 100k,
# each returned together with its DatasetInfo (with_info=True)
ratings, ratings_info = tfds.load(
    'movielens/100k-ratings',
    split='train',
    with_info=True,
)
movies, movies_info = tfds.load(
    'movielens/100k-movies',
    split='train',
    with_info=True,
)

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/100000 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m
[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/1682 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.0. Subsequent calls will reuse this data.[0m


In [4]:
# printing basic details about the dataset: a heading line followed by the
# DatasetInfo of the movies part
print('movies info', movies_info, sep='\n')

movies info
tfds.core.DatasetInfo(
    name='movielens',
    full_name='movielens/100k-movies/0.1.0',
    description="""
    This dataset contains a set of movie ratings from the MovieLens website, a movie
    recommendation service. This dataset was collected and maintained by [GroupLens]
    (https://grouplens.org/), a research group at the University of Minnesota. There
    are 5 versions included: "25m", "latest-small", "100k", "1m", "20m". In all
    datasets, the movies data and ratings data are joined on "movieId". The 25m
    dataset, latest-small dataset, and 20m dataset contain only movie data and
    rating data. The 1m dataset and 100k dataset contain demographic data in
    addition to movie and rating data.
    
    - "25m": This is the latest stable version of the MovieLens dataset. It is
    recommended for research purposes.
    - "latest-small": This is a small subset of the latest version of the MovieLens
    dataset. It is changed and updated over time by GroupLens

In [20]:
# showing some examples from the dataset
# when the info object is passed, categorical variables are shown along with their respective integer representation
tfds.as_dataframe(ratings.take(6), ratings_info)

Unnamed: 0,movie_title,user_id
0,"b""One Flew Over the Cuckoo's Nest (1975)""",b'138'
1,b'Strictly Ballroom (1992)',b'92'
2,"b'Very Brady Sequel, A (1996)'",b'301'
3,b'Pulp Fiction (1994)',b'60'
4,b'Scream 2 (1997)',b'197'
5,b'Crash (1996)',b'601'


In [6]:
# preview the first four movie records; with movies_info the genre ids are displayed together with their labels
tfds.as_dataframe(movies.take(4), movies_info)

Unnamed: 0,movie_genres,movie_id,movie_title
0,4 (Comedy),b'1681',b'You So Crazy (1994)'
1,4 (Comedy) 7 (Drama),b'1457',b'Love Is All There Is (1996)'
2,1 (Adventure) 3 (Children),b'500',b'Fly Away Home (1996)'
3,0 (Action),b'838',b'In the Line of Duty 2 (1987)'


In [7]:
# selecting basic features: keep only the movie title and the user id of each rating
# NOTE: this rebinds `ratings` and `movies`, so the cell is meant to run once after loading
ratings = ratings.map(
    lambda rating: {
        'movie_title': rating['movie_title'],
        'user_id': rating['user_id'],
    }
)

# the movies dataset is reduced to the bare title
movies = movies.map(lambda movie: movie['movie_title'])

After selecting only the necessary features:

In [8]:
# preview the ratings again, now reduced to movie_title and user_id
tfds.as_dataframe(ratings.take(4), ratings_info)

Unnamed: 0,movie_title,user_id
0,"b""One Flew Over the Cuckoo's Nest (1975)""",b'138'
1,b'Strictly Ballroom (1992)',b'92'
2,"b'Very Brady Sequel, A (1996)'",b'301'
3,b'Pulp Fiction (1994)',b'60'


In [9]:
# preview the movies again, now reduced to the bare title
tfds.as_dataframe(movies.take(4), movies_info)

Unnamed: 0,Unnamed: 1
0,b'You So Crazy (1994)'
1,b'Love Is All There Is (1996)'
2,b'Fly Away Home (1996)'
3,b'In the Line of Duty 2 (1987)'


In [10]:
# string -> integer-index lookup layers (no mask token) for user ids and movie titles
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_title_vocabulary = tf.keras.layers.StringLookup(mask_token=None)

# learn each vocabulary from the data it will be applied to
user_ids_vocabulary.adapt(ratings.map(lambda rating: rating['user_id']))
movie_title_vocabulary.adapt(movies)

In [11]:
# first five entries of the user-id vocabulary; position 0 holds the '[UNK]' token
user_ids_vocabulary.get_vocabulary()[:5]

['[UNK]', '405', '655', '13', '450']

In [12]:
# first five entries of the movie-title vocabulary; position 0 holds the '[UNK]' token
movie_title_vocabulary.get_vocabulary()[:5]

['[UNK]',
 "Ulee's Gold (1997)",
 'That Darn Cat! (1997)',
 'Substance of Fire, The (1996)',
 'Sliding Doors (1998)']

In [13]:
# using the vocabulary to map movie titles to their integer indices
test_movies = ['That Darn Cat! (1997)', 'Substance of Fire, The (1996)']
movie_title_vocabulary(test_movies) # returns the respective indices

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([2, 3])>

In [14]:
# defining the model
class MovieLensModel(tfrs.Model):
  """Retrieval model that embeds users and movies with separate sub-models.

  The loss is produced by the supplied task from the two sets of embeddings.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval
  ):
    """Store the two embedding models and the task.

    Args:
      user_model: model applied to the 'user_id' feature.
      movie_model: model applied to the 'movie_title' feature.
      task: task object called with the user and movie embeddings to
        obtain the loss.
    """
    super().__init__()
    # Assigned in the same order as before: users, movies, task.
    self.user_model, self.movie_model, self.task = user_model, movie_model, task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the task loss for one batch of features.

    Args:
      features: mapping that provides the 'user_id' and 'movie_title' entries.
      training: accepted as part of the method signature; not used here.

    Returns:
      The value returned by the task for the user and movie embeddings.
    """
    query_embeddings = self.user_model(features['user_id'])
    candidate_embeddings = self.movie_model(features['movie_title'])
    return self.task(query_embeddings, candidate_embeddings)

In [15]:
# creating sequential models with embeddings: each one maps raw strings to
# vocabulary indices and then to a 64-dimensional embedding
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(
        input_dim=user_ids_vocabulary.vocabulary_size(),
        output_dim=64,
    ),
])

movie_model = tf.keras.Sequential([
    movie_title_vocabulary,
    tf.keras.layers.Embedding(
        input_dim=movie_title_vocabulary.vocabulary_size(),
        output_dim=64,
    ),
])

# retrieval task whose top-K metric uses the embeddings of all movies as candidates
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        candidates=movies.batch(128).map(movie_model)
    )
)

In [16]:
# creating the retrieval model and compiling it with the Adagrad optimizer
model = MovieLensModel(
    user_model=user_model,
    movie_model=movie_model,
    task=task,
)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

In [17]:
# train for 10 epochs on batches of 4096 ratings
model.fit(ratings.batch(4096), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f51d1c92c10>

In [18]:
# create an index from the trained model so that we can use it to make predictions;
# brute-force search sets up retrieval using the trained representations
index = tfrs.layers.factorized_top_k.BruteForce(query_model=model.user_model)
# each candidate batch is indexed as (titles, embeddings of those titles)
index.index_from_dataset(
    movies.batch(100).map(
        lambda title_batch: (title_batch, model.movie_model(title_batch))
    )
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f51d1eb6450>

In [21]:
# getting some recommendations
# The user id is bound once so the query and the printed label cannot disagree
# (previously user "53" was queried while the message said "user 42").
query_user_id = "53"
_, titles = index(np.array([query_user_id]))
# only the first three returned titles are shown
print(f"top k recommendations for user {query_user_id}: {titles[0,:3]}")

top k recommendations for user 42: [b'White Squall (1996)' b'River Wild, The (1994)'
 b'Fifth Element, The (1997)']
