# Package 

In [34]:
import tensorflow as tf
import tensorflow_datasets as tfd
# import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
from typing import Dict, Tuple, Text
import numpy as np
import tempfile
import os

# Config 

In [2]:
batch_size = 100

# Data process 

## Raw data 

In [3]:
ratings = tfd.load(name='movielens/100k-ratings', split='train')
movies = tfd.load(name='movielens/100k-movies', split='train')

In [4]:
ratings = ratings.map( lambda x: {
    "user_id": x['user_id']
    , 'movie_id': x['movie_id']
#     , 'user_rating': x['user_rating']
}
)

In [5]:
list(ratings.take(2).as_numpy_iterator())

[{'user_id': b'138', 'movie_id': b'357'},
 {'user_id': b'92', 'movie_id': b'709'}]

In [6]:
list(movies.take(2).as_numpy_iterator())

[{'movie_genres': array([4]),
  'movie_id': b'1681',
  'movie_title': b'You So Crazy (1994)'},
 {'movie_genres': array([4, 7]),
  'movie_id': b'1457',
  'movie_title': b'Love Is All There Is (1996)'}]

## Id map

In [8]:
movie_ids = movies.map(lambda x: x['movie_id'])
user_ids = ratings.map(lambda x: x['user_id'])


movie_titles = movie_ids.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[:10]

array([b'1', b'10', b'100', b'1000', b'1001', b'1002', b'1003', b'1004',
       b'1005', b'1006'], dtype=object)

## train data

In [9]:
# def get_feature_label(x: Dict[str, tf.Tensor]) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
#     label = x.pop('user_rating')
#     return x, label

# ds_train = ds_train.map(lambda x: get_feature_label(x))

In [10]:
# batch_size

In [11]:
ds_train = ratings.apply(
    tf.data.experimental.dense_to_ragged_batch(batch_size=32))

In [12]:
list(ds_train.take(2).as_numpy_iterator())

[{'user_id': array([b'138', b'92', b'301', b'60', b'197', b'601', b'710', b'833',
         b'916', b'940', b'611', b'707', b'699', b'16', b'314', b'217',
         b'276', b'510', b'757', b'881', b'880', b'797', b'188', b'246',
         b'445', b'91', b'372', b'891', b'71', b'279', b'688', b'59'],
        dtype=object),
  'movie_id': array([b'357', b'709', b'412', b'56', b'895', b'325', b'95', b'92',
         b'425', b'271', b'355', b'712', b'825', b'240', b'1150', b'684',
         b'124', b'294', b'265', b'465', b'823', b'243', b'392', b'202',
         b'433', b'182', b'56', b'116', b'285', b'638', b'309', b'491'],
        dtype=object)},
 {'user_id': array([b'56', b'854', b'615', b'639', b'699', b'195', b'676', b'279',
         b'634', b'505', b'617', b'666', b'416', b'655', b'293', b'350',
         b'404', b'28', b'428', b'733', b'354', b'486', b'409', b'582',
         b'253', b'354', b'663', b'669', b'535', b'560', b'7', b'223'],
        dtype=object),
  'movie_id': array([b'117', b

# Model Definition 

## Old model 

In [17]:
class MovieLenModel(tfrs.Model):
    def __init__(self):
        super().__init__()
        embedding_dimension = 24
        self.user_embedding = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=unique_user_ids, mask_token=None),
          # We add an additional embedding to account for unknown tokens.
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        self.movie_embedding = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=unique_movie_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])        
        self.task= tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
            movie_ids.batch(128).map(self.movie_embedding)
          )
        )
    
    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
        user_emb = self.user_embedding(features['user_id'])
        movie_emb =  self.movie_embedding(features['movie_id'])
        print(f"user: {user_emb.shape}")
        print(f"movie: {movie_emb.shape}")
        return self.task(user_emb, movie_emb)

In [13]:


# class MovielensModel(tfrs.Model):

#   def __init__(self, user_vocab, movie_vocab):
#     super().__init__()
#     embedding_dimension = 32
#     self.user_vocab = user_vocab
#     self.movie_vocab = movie_vocab
#     user_model = tf.keras.Sequential([
# #       tf.keras.layers.StringLookup(
# #           vocabulary=unique_user_ids, mask_token=None),
#       # We add an additional embedding to account for unknown tokens.
#       tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
#     ])
#     movie_model = tf.keras.Sequential([
# #       tf.keras.layers.StringLookup(
# #           vocabulary=unique_movie_titles, mask_token=None),
#       tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
#     ])
#     metrics = tfrs.metrics.FactorizedTopK(
#       candidates=movie_ids.batch(128).map(movie_model)
#     )
    
#     task = tfrs.tasks.Retrieval(
#       metrics=metrics
#     )
#     self.movie_model: tf.keras.Model = movie_model
#     self.user_model: tf.keras.Model = user_model
#     self.task: tf.keras.layers.Layer = task

#   def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
#     # We pick out the user features and pass them into the user model.
#     user_embeddings = self.user_model(self.user_vocab(features["user_id"]))
#     # And pick out the movie features and pass them into the movie model,
#     # getting embeddings back.
#     positive_movie_embeddings = self.movie_model(self.movie_vocab(features["movie_id"]))
#     print(f"user: {user_embeddings.shape}")
#     print(f"movie: {positive_movie_embeddings.shape}")
#     # The task computes the loss and the metrics.
#     return self.task(user_embeddings, positive_movie_embeddings)

## New model 

# Training 

In [21]:
model1 = MovieLenModel(
)

optimizor = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model1.compile(
               optimizer=optimizor
              )

model1.fit(ds_train, epochs=1)

user: (None, 24)
movie: (None, 24)
user: (None, 24)
movie: (None, 24)


<keras.callbacks.History at 0x7fd3ad0d6da0>

# Prediction 

In [22]:
model1.user_embedding

<keras.engine.sequential.Sequential at 0x7fd3ad25d4a8>

In [23]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model1.user_embedding)

In [24]:
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((movie_ids.batch(100), movie_ids.batch(100).map(model1.movie_embedding)))
)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fd3ad4134a8>

In [28]:
# Get recommendations.
scores, items = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :5]}")

Recommendations for user 42: [b'63' b'102' b'420' b'560' b'623']


In [30]:
scores

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[1.4527458, 1.4333963, 1.4160137, 1.4080179, 1.3786912, 1.3743391,
        1.3707472, 1.3624487, 1.3570994, 1.3432596]], dtype=float32)>

##  Save and deploy -> accurate search

In [None]:
tmp = '../../data/'

In [37]:
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model")

    # Save the index.
    tf.saved_model.save(index, path)

    # Load it back; can also be done in TensorFlow Serving.
    loaded = tf.saved_model.load(path)

    # Pass a user id in, get top predicted movie titles back.
    scores, titles = loaded(["42"])

    print(f"Recommendations: {titles[0][:3]}")
    print(scores)



INFO:tensorflow:Assets written to: /var/folders/60/6qphmx_d7x7_11vpj8524vf40000gn/T/tmpust1brl6/model/assets


INFO:tensorflow:Assets written to: /var/folders/60/6qphmx_d7x7_11vpj8524vf40000gn/T/tmpust1brl6/model/assets


Recommendations: [b'63' b'102' b'420']
tf.Tensor(
[[1.4527458 1.4333963 1.4160137 1.4080179 1.3786912 1.3743391 1.3707472
  1.3624487 1.3570994 1.3432596]], shape=(1, 10), dtype=float32)


## Save & deploy -> approximate search 

In [39]:
scann_index = tfrs.layers.factorized_top_k.ScaNN(model1.user_embedding)
scann_index.index_from_dataset(
  tf.data.Dataset.zip((movie_ids.batch(100), movie_ids.batch(100).map(model1.movie_embedding)))
)

ImportError: The scann library is not present. Please install it using `pip install scann` to use the ScaNN layer.