# Package 

In [62]:
import tensorflow as tf
import tensorflow_datasets as tfd
import tensorflow_ranking as tfr
from typing import Dict, Tuple

# Config 

In [63]:
# Batch size shared by the notebook: used to batch the id streams when adapting
# the lookup layers, and as the batch size applied to each user's window of
# ratings in the group_by_window reduce function.
batch_size = 100

# Data processing

## Raw data 

In [64]:
# Load the 'train' split of the MovieLens 100k ratings and movies datasets.
ratings = tfd.load('movielens/100k-ratings', split='train')
movies = tfd.load('movielens/100k-movies', split='train')

In [65]:
# list(ratings.take(2).as_numpy_iterator())

In [66]:
def _select_rating_fields(x):
    """Keep only the user id, movie id and rating of a ratings record."""
    return {
        'user_id': x['user_id'],
        'movie_id': x['movie_id'],
        'user_rating': x['user_rating'],
    }


ratings = ratings.map(_select_rating_fields)

In [67]:
# Preview the first two records of the projected ratings dataset.
list(ratings.take(2).as_numpy_iterator())

[{'user_id': b'138', 'movie_id': b'357', 'user_rating': 4.0},
 {'user_id': b'92', 'movie_id': b'709', 'user_rating': 2.0}]

In [68]:
# Preview the first two records of the movies dataset.
list(movies.take(2).as_numpy_iterator())

[{'movie_genres': array([4]),
  'movie_id': b'1681',
  'movie_title': b'You So Crazy (1994)'},
 {'movie_genres': array([4, 7]),
  'movie_id': b'1457',
  'movie_title': b'Love Is All There Is (1996)'}]

## Id map

In [69]:
# Movie ids come from the movies dataset; the lookup layer learns its
# vocabulary from the batched id stream.
movie_ids = movies.map(lambda record: record['movie_id'])
movie_vocab = tf.keras.layers.StringLookup()

# User ids come from the ratings dataset.
user_ids = ratings.map(lambda record: record['user_id'])
user_vocab = tf.keras.layers.StringLookup()

movie_vocab.adapt(movie_ids.batch(batch_size))
user_vocab.adapt(user_ids.batch(batch_size))

In [70]:
# Display the adapted movie lookup layer (shows only its repr).
movie_vocab

<keras.layers.preprocessing.string_lookup.StringLookup at 0x7ff33565d9e8>

## Train data -> aggregated by user_id

In [71]:
def key_func(x):
    """Grouping key: the integer index of the record's user id."""
    return user_vocab(x['user_id'])


def reduce_func(key, dataset):
    """Batch the records collected for one key into chunks of batch_size."""
    return dataset.batch(batch_size)


# Number of records gathered per key before reduce_func is applied.
window_size = 1000

ds_train = ratings.group_by_window(
    key_func=key_func,
    reduce_func=reduce_func,
    window_size=window_size,
)

In [72]:
def get_feature_label(x: Dict[str, "tf.Tensor"]) -> Tuple[Dict[str, "tf.Tensor"], "tf.Tensor"]:
    """Split a record into its features and its rating label.

    The input mapping is left untouched: a new features dict is built instead
    of popping the label out of the caller's dict.

    Args:
        x: Mapping that contains a 'user_rating' entry plus feature entries.

    Returns:
        A ``(features, label)`` tuple where ``features`` holds every entry of
        ``x`` except 'user_rating' (original key order kept) and ``label`` is
        the 'user_rating' value.

    Raises:
        KeyError: If ``x`` has no 'user_rating' entry.
    """
    label = x['user_rating']
    features = {name: value for name, value in x.items() if name != 'user_rating'}
    return features, label

# Turn every grouped element into a (features, label) pair.
ds_train = ds_train.map(get_feature_label)

In [73]:
# Echo the configured batch size.
batch_size

100

In [74]:
# Combine 32 per-user lists into one ragged batch (lists may differ in length).
ragged_batcher = tf.data.experimental.dense_to_ragged_batch(batch_size=32)
ds_train = ds_train.apply(ragged_batcher)

In [77]:
# Inspect the first training batch: shape and a 3x3 corner of each tensor.
for features, labels in ds_train.take(1):
    for name, tensor in features.items():
        print(f"Shape of {name}: {tensor.shape}")
        print(f"Example values of {name}: {tensor[:3, :3].numpy()}")
        print()
    print(f"Shape of label: {labels.shape}")
    print(f"Example values of label: {labels[:3, :3].numpy()}")

Shape of user_id: (32, None)
Example values of user_id: [[b'405' b'405' b'405']
 [b'405' b'405' b'405']
 [b'405' b'405' b'405']]

Shape of movie_id: (32, None)
Example values of movie_id: [[b'530' b'98' b'1415']
 [b'1073' b'1268' b'1091']
 [b'202' b'1429' b'1308']]

Shape of label: (32, None)
Example values of label: [[1. 4. 1.]
 [1. 1. 1.]
 [4. 1. 1.]]


# Model Definition 

In [78]:
# Display the Keras Model base class that the model below subclasses.
tf.keras.Model

keras.engine.training.Model

In [79]:
# Size of the user vocabulary; this is the value the model passes as input_dim
# to the user embedding. vocabulary_size() is the accessor MovieLenModel uses;
# vocab_size() is its deprecated older spelling.
user_vocab.vocabulary_size()





944

In [80]:
class MovieLenModel(tf.keras.Model):
    """Scores (user, movie) pairs as the dot product of two learned embeddings.

    User and movie id strings are converted to integer indices by the supplied
    lookup layers, embedded, and multiplied element-wise; the products are
    summed over the embedding axis to give one score per pair.
    """

    def __init__(self, user_vocab, movie_vocab, embedding_dim: int = 24):
        """Create the embedding tables.

        Args:
            user_vocab: Lookup layer for user ids; its ``vocabulary_size()``
                sets the number of rows of the user embedding.
            movie_vocab: Lookup layer for movie ids; its ``vocabulary_size()``
                sets the number of rows of the movie embedding.
            embedding_dim: Width of both embedding tables. Defaults to 24,
                the value that was previously hard-coded.
        """
        super().__init__()
        self.user_embedding = tf.keras.layers.Embedding(
            input_dim=user_vocab.vocabulary_size(),
            output_dim=embedding_dim,
        )
        self.movie_embedding = tf.keras.layers.Embedding(
            input_dim=movie_vocab.vocabulary_size(),
            output_dim=embedding_dim,
        )
        self.user_vocab = user_vocab
        self.movie_vocab = movie_vocab

    def call(self, features: Dict[str, tf.Tensor]):
        """Compute one score per (user, movie) entry.

        Args:
            features: Mapping with 'user_id' and 'movie_id' entries. The
                reduction over axis 2 means the ids are expected to be 2-D
                (e.g. batch x list), so the embeddings are 3-D.

        Returns:
            The element-wise product of the user and movie embeddings summed
            over the embedding axis (axis 2).
        """
        user_emb = self.user_embedding(self.user_vocab(features['user_id']))
        movie_emb = self.movie_embedding(self.movie_vocab(features['movie_id']))
        # Dot product along the embedding axis.
        score = tf.math.reduce_sum(user_emb * movie_emb, axis=2)
        return score

In [81]:
# MovieLenModel()

# Training 

## pointwise loss 

In [82]:
# Pointwise setup: fresh model trained with a mean-squared-error loss,
# tracked with NDCG and MRR.
model1 = MovieLenModel(user_vocab=user_vocab, movie_vocab=movie_vocab)

optimizor = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tfr.keras.losses.MeanSquaredLoss(ragged=True)
metrics = [
    tfr.keras.metrics.NDCGMetric(ragged=True),
    tfr.keras.metrics.MRRMetric(ragged=True),
]

model1.compile(optimizer=optimizor, loss=loss, metrics=metrics)


In [83]:
# Train the pointwise model for four passes over the ragged training batches.
model1.fit(x=ds_train, epochs=4)

Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7ff335b302b0>

## pairwise loss

In [86]:
# Pairwise setup: fresh model trained with a pairwise logistic loss,
# tracked with NDCG and MRR.
model1 = MovieLenModel(user_vocab=user_vocab, movie_vocab=movie_vocab)

optimizor = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tfr.keras.losses.PairwiseLogisticLoss(ragged=True)
metrics = [
    tfr.keras.metrics.NDCGMetric(ragged=True),
    tfr.keras.metrics.MRRMetric(ragged=True),
]

model1.compile(optimizer=optimizor, loss=loss, metrics=metrics)
model1.fit(x=ds_train, epochs=4)

Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7ff3245c8438>

## listwise loss 

In [87]:
# Listwise setup: fresh model trained with the approximate-NDCG loss,
# tracked with NDCG and MRR.
model1 = MovieLenModel(user_vocab=user_vocab, movie_vocab=movie_vocab)

optimizor = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tfr.keras.losses.ApproxNDCGLoss(ragged=True)
metrics = [
    tfr.keras.metrics.NDCGMetric(ragged=True),
    tfr.keras.metrics.MRRMetric(ragged=True),
]

model1.compile(optimizer=optimizor, loss=loss, metrics=metrics)
model1.fit(x=ds_train, epochs=4)

Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7ff32438a5f8>