# Basic Info 

1. inbatch negative sampling
2. movies that have scores are positive samples; sampling negative samples

# Package 

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfd
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
from typing import Dict, Tuple, Text
import numpy as np
import tempfile
import os

# Config 

In [2]:
batch_size = 100

# Data process 

## Raw data 

In [3]:
raw_ratings = tfd.load(name='movielens/100k-ratings', split='train')
movies = tfd.load(name='movielens/100k-movies', split='train')

In [4]:
ratings = raw_ratings.map( lambda x: {
    "user_id": x['user_id']
    , 'movie_id': x['movie_id']
    , 'user_rating': x['user_rating']
    }
)

## Id map

In [5]:
movie_ids = movies.map(lambda x: x['movie_id'])
user_ids = ratings.map(lambda x: x['user_id'])


movie_titles = movie_ids.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[:10]

array([b'1', b'10', b'100', b'1000', b'1001', b'1002', b'1003', b'1004',
       b'1005', b'1006'], dtype=object)

# Single retrieve task  

## train data

In [17]:
ds_train = ratings.map( lambda x: {
        "user_id": x['user_id']
        , 'movie_id': x['movie_id']
#         , 'user_rating': x['user_rating']
        }
    ).apply(
        tf.data.experimental.dense_to_ragged_batch(batch_size=32))

In [18]:
list(ds_train.take(2).as_numpy_iterator())

[{'user_id': array([b'138', b'92', b'301', b'60', b'197', b'601', b'710', b'833',
         b'916', b'940', b'611', b'707', b'699', b'16', b'314', b'217',
         b'276', b'510', b'757', b'881', b'880', b'797', b'188', b'246',
         b'445', b'91', b'372', b'891', b'71', b'279', b'688', b'59'],
        dtype=object),
  'movie_id': array([b'357', b'709', b'412', b'56', b'895', b'325', b'95', b'92',
         b'425', b'271', b'355', b'712', b'825', b'240', b'1150', b'684',
         b'124', b'294', b'265', b'465', b'823', b'243', b'392', b'202',
         b'433', b'182', b'56', b'116', b'285', b'638', b'309', b'491'],
        dtype=object)},
 {'user_id': array([b'56', b'854', b'615', b'639', b'699', b'195', b'676', b'279',
         b'634', b'505', b'617', b'666', b'416', b'655', b'293', b'350',
         b'404', b'28', b'428', b'733', b'354', b'486', b'409', b'582',
         b'253', b'354', b'663', b'669', b'535', b'560', b'7', b'223'],
        dtype=object),
  'movie_id': array([b'117', b

## Model Definition 

In [51]:
class MovieLenModel(tfrs.Model):
    def __init__(self):
        super().__init__()
        embedding_dimension = 24
        self.user_embedding = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=unique_user_ids, mask_token=None),
          # We add an additional embedding to account for unknown tokens.
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        self.movie_embedding = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=unique_movie_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])        
        self.task= tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
            movie_ids.batch(128).map(self.movie_embedding)
          )
        )
    
    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
        user_emb = self.user_embedding(features['user_id'])
        movie_emb =  self.movie_embedding(features['movie_id'])
        print(f"user: {user_emb.shape}")
        print(f"movie: {movie_emb.shape}")
        return self.task(user_emb, movie_emb)

## Training 

In [53]:
model1 = MovieLenModel(
)

optimizor = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model1.compile(
               optimizer=optimizor
              )

model1.fit(ds_train, epochs=1)

user: (None, 24)
movie: (None, 24)
user: (None, 24)
movie: (None, 24)


<keras.callbacks.History at 0x7fd3ad390eb8>

## Prediction 

In [54]:
model1.user_embedding

<keras.engine.sequential.Sequential at 0x7fd3ad2a81d0>

In [55]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model1.user_embedding)

In [56]:
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((movie_ids.batch(100), movie_ids.batch(100).map(model1.movie_embedding)))
)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fd3ad2a85c0>

In [57]:
# Get recommendations.
scores, items = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :5]}")

Recommendations for user 42: [b'63' b'102' b'420' b'560' b'623']


In [30]:
scores

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[1.4527458, 1.4333963, 1.4160137, 1.4080179, 1.3786912, 1.3743391,
        1.3707472, 1.3624487, 1.3570994, 1.3432596]], dtype=float32)>

##  Save and deploy -> accurate search

In [None]:
tmp = '../../data/'

In [37]:
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model")

    # Save the index.
    tf.saved_model.save(index, path)

    # Load it back; can also be done in TensorFlow Serving.
    loaded = tf.saved_model.load(path)

    # Pass a user id in, get top predicted movie titles back.
    scores, titles = loaded(["42"])

    print(f"Recommendations: {titles[0][:3]}")
    print(scores)



INFO:tensorflow:Assets written to: /var/folders/60/6qphmx_d7x7_11vpj8524vf40000gn/T/tmpust1brl6/model/assets


INFO:tensorflow:Assets written to: /var/folders/60/6qphmx_d7x7_11vpj8524vf40000gn/T/tmpust1brl6/model/assets


Recommendations: [b'63' b'102' b'420']
tf.Tensor(
[[1.4527458 1.4333963 1.4160137 1.4080179 1.3786912 1.3743391 1.3707472
  1.3624487 1.3570994 1.3432596]], shape=(1, 10), dtype=float32)


## Save & deploy -> approximate search 

In [58]:
# scann_index = tfrs.layers.factorized_top_k.ScaNN(model1.user_embedding)
# scann_index.index_from_dataset(
#   tf.data.Dataset.zip((movie_ids.batch(100), movie_ids.batch(100).map(model1.movie_embedding)))
# )

# Muti-task learning

## Train data 

In [41]:
ds_train_mutitask = ratings.map( lambda x: {
        "user_id": x['user_id']
        , 'movie_id': x['movie_id']
        , 'user_rating': x['user_rating']
        }
    ).apply(
        tf.data.experimental.dense_to_ragged_batch(batch_size=32))

In [42]:
train = ds_train_mutitask.take(1_000).cache()
test = ds_train_mutitask.skip(1_000).take(800).cache()

## Model definition 

In [43]:
tf.metrics.MeanSquaredError()

<keras.metrics.MeanSquaredError at 0x7ff7f0e809e8>

In [44]:
class MultitaskMovieLenModel(tfrs.Model):
    def __init__(self, retrieve_weight:float, ranking_weight:float):
        super().__init__()
        embedding_dimension = 24
        self.user_embedding = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=unique_user_ids, mask_token=None),
          # We add an additional embedding to account for unknown tokens.
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        self.movie_embedding = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=unique_movie_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])   
        
        self.ranking_model = tf.keras.Sequential([
            tf.keras.layers.Dense(units=256, activation='relu')
            , tf.keras.layers.Dense(units=128, activation='relu')
            , tf.keras.layers.Dense(units=1, activation=None)
        ])
        
        self.retrieve_task= tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
            movie_ids.batch(128).map(self.movie_embedding)
          )
        )
        self.ranking_task = tfrs.tasks.Ranking(
            metrics=[tf.metrics.MeanSquaredError()]
            , loss=tf.keras.losses.MeanSquaredError()
        )
        self.retrieve_weight = retrieve_weight
        self.ranking_weight = ranking_weight
    
    def call(self, features: Dict[str, tf.Tensor]):
        user_emb = self.user_embedding(features['user_id'])
        movie_emb =  self.movie_embedding(features['movie_id'])
        ranking_score = self.ranking_model(
            tf.concat([user_emb, movie_emb], axis=1)
        )
        return (
            user_emb,
            movie_emb,
            ranking_score
        )
    
    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
        labels = features.pop('user_rating')
        user_emb, movie_emb, ranking_score = self(features)
        retrieve_loss = self.retrieve_task(user_emb, movie_emb)
        ranking_loss = self.ranking_task(labels=labels,predictions=ranking_score)
        return self.retrieve_weight*retrieve_loss + self.ranking_weight*ranking_loss

## training  & Eval

In [45]:
mutitaks_model = MultitaskMovieLenModel(retrieve_weight=1, ranking_weight=0)
optimizor = tf.keras.optimizers.Adagrad(learning_rate=0.1)

mutitaks_model.compile(optimizer=optimizor)

mutitaks_model.fit(train, epochs=1)



<keras.callbacks.History at 0x7ff7e71dd470>

In [47]:
mutitaks_model.evaluate(test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0021875000093132257,
 'factorized_top_k/top_5_categorical_accuracy': 0.013554687611758709,
 'factorized_top_k/top_10_categorical_accuracy': 0.029374999925494194,
 'factorized_top_k/top_50_categorical_accuracy': 0.14191406965255737,
 'factorized_top_k/top_100_categorical_accuracy': 0.24285155534744263,
 'mean_squared_error': 13.574176788330078,
 'loss': 108.55723571777344,
 'regularization_loss': 0,
 'total_loss': 108.55723571777344}

In [48]:
mutitaks_model = MultitaskMovieLenModel(retrieve_weight=0, ranking_weight=1)
optimizor = tf.keras.optimizers.Adagrad(learning_rate=0.1)

mutitaks_model.compile(optimizer=optimizor)

mutitaks_model.fit(train, epochs=1)
mutitaks_model.evaluate(test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0008203124743886292,
 'factorized_top_k/top_5_categorical_accuracy': 0.005898437462747097,
 'factorized_top_k/top_10_categorical_accuracy': 0.011601562611758709,
 'factorized_top_k/top_50_categorical_accuracy': 0.05000000074505806,
 'factorized_top_k/top_100_categorical_accuracy': 0.08902344107627869,
 'mean_squared_error': 0.9413306713104248,
 'loss': 0.6442892551422119,
 'regularization_loss': 0,
 'total_loss': 0.6442892551422119}

## Inference


In [50]:
trained_movie_embeddings, trained_user_embeddings, predicted_rating = mutitaks_model({
      "user_id": np.array(["42"]),
      "movie_id": np.array(['357'])
  })
print("Predicted rating:")
print(predicted_rating)

Predicted rating:
tf.Tensor([[4.253978]], shape=(1, 1), dtype=float32)
