# Setup

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann
!pip install git+git://github.com/altair-viz/altair.git

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package does not exist, so skip the mount instead of aborting "Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/gdrive')

In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import manifold
from time import process_time
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
# import altair as alt

In [2]:
# Report whether TensorFlow can see at least one physical GPU device.
gpu = bool(tf.config.list_physical_devices('GPU'))
print(f"GPU is {'available' if gpu else 'NOT AVAILABLE'}")

GPU is available


In [3]:
def plot_pred_ratings(pred,y_true,alpha=0.1):
    """Scatter true and predicted values against their positional index.

    Args:
        pred: Sequence of predicted values (drawn in red).
        y_true: Sequence of ground-truth values.
        alpha: Opacity applied to the predicted points.
    """
    true_positions = range(len(y_true))
    pred_positions = range(len(pred))
    plt.scatter(true_positions, y_true, label='True')
    plt.scatter(pred_positions, pred, c='r', alpha=alpha, label='Pred')
    plt.legend()
    plt.show()

def plotting_history(rmse, val_rmse, loss, val_loss, loss_label='Loss'):
  """Plot training/validation RMSE and loss curves in two stacked panels.

  Args:
    rmse: Per-epoch training RMSE values.
    val_rmse: Per-epoch validation RMSE values.
    loss: Per-epoch training loss values.
    val_loss: Per-epoch validation loss values.
    loss_label: Y-axis label of the loss panel. The previous hard-coded
      'Cross Entropy' label was wrong for models trained with another loss
      (e.g. mean squared error), so the label is now configurable.
  """
  plt.figure(figsize=(8, 8))

  # Top panel: RMSE.
  plt.subplot(2, 1, 1)
  plt.plot(rmse, label='Training RMSE')
  plt.plot(val_rmse, label='Validation RMSE')
  plt.legend(loc='upper right')
  plt.ylabel('RMSE')
  plt.title('Training and Validation RMSE')

  # Bottom panel: loss.
  plt.subplot(2, 1, 2)
  plt.plot(loss, label='Training Loss')
  plt.plot(val_loss, label='Validation Loss')
  plt.legend(loc='upper right')
  plt.ylabel(loss_label)
  plt.title('Training and Validation Loss')
  plt.xlabel('epoch')
  plt.show()

In [4]:
# Load the MovieLens 100K datasets through TensorFlow Datasets.
# Ratings data (only the 'train' split is requested).
ratings = tfds.load('movielens/100k-ratings', split='train')
# Features of all the available movies.
movies = tfds.load('movielens/100k-movies', split='train')

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2021-10-15 15:20:11.091070: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-10-15 15:20:11.092108: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# Convert both datasets to DataFrames for inspection in the next cells.
ratings_df = tfds.as_dataframe(ratings)
movies_df = tfds.as_dataframe(movies)

In [None]:
# Summarize both frames via DataFrame.info().
ratings_df.info()
movies_df.info()

In [None]:
# Peek at the first rows of the ratings frame.
ratings_df.head()

In [None]:
# Pretty-print one raw rating record to see which features are available.
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

In [None]:
# Pretty-print one raw movie record, then its 'movie_genres' feature on its own.
for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)
  print(x['movie_genres'])

In [None]:
# Keep only the features the models use.
# NOTE: this cell rebinds `movies` and `ratings`, so it is not idempotent:
# after it runs, `movies` yields bare titles and x["movie_title"] no longer applies.
movies = movies.map(lambda x: x["movie_title"])

# Ratings keep the title, the user id and the rating itself (the ranking label).
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "user_rating": x["user_rating"]
})

In [None]:
# Check one rating record after the feature selection above.
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

In [None]:
# Check one movie element after the feature selection above (now just a title).
for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)

# 1.Splitting dataset

In [None]:
# Deterministic shuffle: fixed seed and no reshuffling between iterations, so
# take()/skip() below always see the same order and the splits stay disjoint.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100000, seed=42, reshuffle_each_iteration=False)

# 80% / 10% / 10% split of the 100,000 ratings.
# The previous test split, skip(60000).take(20000), overlapped the first
# 80,000 training examples and therefore leaked training data into the test set.
TRAIN_SIZE = 80_000
VAL_SIZE = 10_000
TEST_SIZE = 10_000

train_set = shuffled.take(TRAIN_SIZE)
val_set = shuffled.skip(TRAIN_SIZE).take(VAL_SIZE)
test_set = shuffled.skip(TRAIN_SIZE + VAL_SIZE).take(TEST_SIZE)

### 1.1 For Retrieval stage

In [None]:
# Batch the movie titles and the user ids of the ratings so they can be
# concatenated into NumPy arrays below.
movie_titles = movies.batch(1000)
user_ids = ratings.batch(1000000).map(lambda x: x["user_id"])

# Deduplicated vocabularies used by the lookup layers.
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# Number of distinct movie titles.
len(unique_movie_titles)

### 1.2 For Ranking stage

In [None]:
# Here the title vocabulary comes from the ratings (not from `movies`).
# NOTE: this rebinds `unique_movie_titles` / `unique_user_ids` from the
# retrieval cell above; later cells see these values when run top to bottom.
movie_titles = ratings.batch(1_000_000).map(lambda x: x["movie_title"])
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# 2. Implementing the Basic Retrieval model

In [None]:
# Width of the user and movie embedding vectors in the retrieval towers.
embedding_dim = 35

In [None]:
# Query tower: map a raw user id string to an index, then to an embedding.
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
  # One extra embedding row beyond the known vocabulary
  # (presumably for the out-of-vocabulary index — TODO confirm).
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dim)
])

## Candidate Tower

In [None]:
# Candidate tower: map a raw movie title to an index, then to an embedding.
movie_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_movie_titles, mask_token=None),
  # One extra embedding row beyond the known vocabulary
  # (presumably for the out-of-vocabulary index — TODO confirm).
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dim)
])

## Choosing metric

In [None]:
# Top-K retrieval metric; the candidates are every movie title embedded by
# `movie_model`, in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(movie_model)
)

## Choosing loss

In [None]:
# Retrieval task bundling the loss with the top-K metric defined above.
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

## Put into a full model: Basic Retrieval

In [None]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model built from a user tower and a movie tower."""

  def __init__(self, user_model, movie_model):
    """Store both towers and the retrieval task.

    Args:
      user_model: Model applied to the "user_id" feature.
      movie_model: Model applied to the "movie_title" feature.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    # NOTE: `task` is not a constructor argument; it is read from the
    # notebook-level variable of the same name.
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the user and the positive movie, then let the task score the pair.

    Args:
      features: Mapping that holds the "user_id" and "movie_title" features.
      training: Accepted for interface compatibility; not used here.

    Returns:
      The value returned by the retrieval task.
    """
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.movie_model(features["movie_title"])
    return self.task(query_vectors, candidate_vectors)

## Fitting and evaluating the model
The optimizer is Adagrad with learning_rate = 0.1

In [None]:
# Instantiate the retrieval model from the two towers and compile it with Adagrad.
# NOTE: a second class named MovielensModel is defined later in this notebook;
# this call needs the two-argument retrieval version defined just above.
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

Shuffle, batch, and cache the training and evaluation data.

In [None]:
# Shuffle, batch and cache the training data; batch and cache the test data.
# NOTE(review): cache() follows shuffle(), so the cached batches likely keep
# the order of the first pass on later epochs — confirm this is intended.
cached_train = train_set.shuffle(100000).batch(8192).cache()
cached_test = test_set.batch(4096).cache()

### Train the model

In [None]:
# Train the retrieval model for 10 epochs on the cached training batches.
model.fit(cached_train, epochs=10)

In [None]:
# Evaluate on the cached test batches; metrics come back as a dict.
model.evaluate(cached_test, return_dict=True)

## Making Prediction

In [None]:
# Build a brute-force top-K layer whose query side is the trained user tower,
# so it can be called with raw user ids.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Index every movie as a (title, embedding) pair so the lookup returns titles.
index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)

# Get recommendations for one user and show the first three titles.
user_id = str(42)
_, titles = index(tf.constant([user_id]))
print(f"Recommendations for user {user_id}: {titles[0, :3]}")

# 3.Implement the Ranking model

In [None]:
class RankingModel(tf.keras.Model):
  """Predicts a user's rating of a movie from learned user and movie embeddings.

  Both ids are embedded, the embeddings are concatenated and passed through a
  small MLP whose sigmoid output is rescaled to [rating_low, rating_high].
  """

  def __init__(self, embedding_dimension=35, rating_low=0.5, rating_high=5.0):
    """Build the embedding towers and the rating head.

    Args:
      embedding_dimension: Width of the user and movie embeddings.
      rating_low: Lower bound of the predicted rating range.
      rating_high: Upper bound of the predicted rating range.

    Raises:
      ValueError: If `rating_high` is not strictly greater than `rating_low`.
    """
    super().__init__()
    if rating_high <= rating_low:
      raise ValueError(
          f"rating_high ({rating_high}) must be greater than rating_low ({rating_low})")
    self.rating_low = rating_low
    self.rating_high = rating_high

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dropout(0.3),
      tf.keras.layers.Dense(128, activation="relu"),
      tf.keras.layers.Dropout(0.3),
      # Sigmoid keeps the raw output in (0, 1); `call` rescales it.
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])

  def call(self, inputs):
    """Return predicted ratings for a `(user_id, movie_title)` pair of inputs.

    Args:
      inputs: Tuple `(user_id, movie_title)` of string batches.

    Returns:
      The rating head's output rescaled to [rating_low, rating_high].
    """
    user_id, movie_title = inputs

    user_embedding = self.user_embeddings(user_id)
    movie_embedding = self.movie_embeddings(movie_title)

    unit_score = self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))
    # Map the (0, 1) sigmoid output onto the rating range.
    return unit_score * (self.rating_high - self.rating_low) + self.rating_low

__Test this ranking model__
+ __Inputs:__ user_id and movie_title
+ __Output:__ predicted rating of the user on the movie

In [None]:
# Smoke test: call a freshly constructed (untrained) model on one (user, movie) pair.
RankingModel()((["42"], ["One Flew Over the Cuckoo's Nest (1975)"]))

## Choosing loss function and metrics

Metrics: RMSE

## Put into a full model: Ranking

In [None]:
class MovielensModel(tfrs.models.Model):
  """Ranking model: wraps `RankingModel` with an MSE loss and an RMSE metric.

  NOTE: this class reuses the name of the retrieval model defined earlier in
  the notebook and replaces it once this cell has run.
  """

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    """Return predicted ratings for the "user_id"/"movie_title" features."""
    return self.ranking_model(
        (features["user_id"], features["movie_title"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the ranking loss (and update the metrics) for one batch.

    Args:
      features: Mapping with "user_id", "movie_title" and "user_rating".
      training: Whether the model runs in training mode. It is forwarded to
        the forward pass so that the Dropout layers are active while fitting.

    Returns:
      The value returned by the ranking task.
    """
    labels = features["user_rating"]
    # Build the model inputs without the label instead of popping it, so the
    # caller's dict is left untouched.
    model_inputs = {
        name: value for name, value in features.items() if name != "user_rating"
    }

    rating_predictions = self(model_inputs, training=training)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [None]:
# Build the ranking model and compile it with Adam.
ranking_model = MovielensModel()
# Alternative optimizer, kept for reference:
# ranking_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.09))
ranking_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# Callbacks driven by the validation loss.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.0001)
# NOTE(review): `early_stop` is defined here but the fit cell below only passes
# `reduce_lr` — confirm whether early stopping was meant to be enabled.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

In [None]:
# Re-create the cached pipelines for the ranking stage, now with a validation set.
# NOTE: `cached_train` / `cached_test` rebind the names used in the retrieval section.
cached_train = train_set.shuffle(100000).batch(8192).cache()
cached_test = test_set.batch(4096).cache()
cached_val = val_set.batch(4096).cache()

## Fitting and evaluating the Ranking model

### Train the model

In [None]:
# Train for up to 100 epochs, monitoring the validation set; only the
# learning-rate reduction callback is attached.
history_ranking = ranking_model.fit(cached_train, 
                                    validation_data=cached_val, 
                                    epochs=100, 
                                    callbacks = [reduce_lr])

In [None]:
# Evaluate the trained ranking model on the cached test batches.
ranking_model.evaluate(cached_test, return_dict=True)

In [None]:
# Pull the per-epoch curves out of the Keras History object and plot them.
metric_history = history_ranking.history

rmse, val_rmse = (
    metric_history[key]
    for key in ('root_mean_squared_error', 'val_root_mean_squared_error')
)
loss, val_loss = metric_history['loss'], metric_history['val_loss']

plotting_history(rmse, val_rmse, loss, val_loss)

### Saving the model

In [None]:
import datetime
import os

# Take a single timestamp: calling datetime.now() once per field could mix
# two different dates if the cell runs across midnight.
_now = datetime.datetime.now()
# Date tag in the form "<year>_<month>_<day>" (no zero padding).
curr_date = f"{_now.year}_{_now.month}_{_now.day}"
# Colab/Drive variant, kept for reference:
# model_folder = '/content/gdrive/MyDrive/Colab_Notebooks/XProject/model'
# model_path = os.path.join(model_folder, f'ranking_model_w_{curr_date}')
# Fixed local path used by the save and load cells below.
model_path = './ranking_model_w_2021_10_14.h5'

In [None]:
# Persist only the weights. `model_path` ends in ".h5" and the loading cell
# restores it with `load_weights(model_path)`, so the file must be a Keras
# weights file. `tf.saved_model.save` would instead create a SavedModel
# directory at that path, which `load_weights` cannot read.
ranking_model.save_weights(model_path)

### Loading the model

In [None]:
# Create a fresh, untrained instance of the ranking model.
loaded_ranking_model = MovielensModel()
# load_ranking_model = tf.saved_model.load(model_path)

# Compile and train for one epoch before restoring
# (presumably so the model's variables exist before load_weights — TODO confirm).
loaded_ranking_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
loaded_ranking_model.fit(cached_train, epochs=1)

# Restore the saved weights, replacing those from the one-epoch fit above.
loaded_ranking_model.load_weights(model_path)

In [None]:
# Evaluate the restored model on the same cached test batches.
loaded_ranking_model.evaluate(cached_test, return_dict=True)

## Plot the embedding space

In [None]:
#@title Functions for plotting
def visualize_movie_embeddings(data, x, y, genre_filter=None, genre_chart=None):
  """Build an interactive Altair scatter plot of projected movie embeddings.

  Args:
    data: DataFrame with the columns named by `x` and `y` plus a 'title'
      column (and a 'genre' column when `genre_filter` is used).
    x: Name of the column plotted on the x axis.
    y: Name of the column plotted on the y axis.
    genre_filter: Optional Altair selection used to colour points by genre.
      When omitted, a notebook-level `genre_filter` is used if one exists;
      otherwise all points get a single colour.
    genre_chart: Optional companion chart placed next to the scatter plot.
      When omitted, a notebook-level `genre_chart` is used if one exists;
      otherwise only the scatter plot is returned.

  Returns:
    The Altair chart.
  """
  # Imported here because the notebook-level `import altair as alt` is
  # commented out; this keeps the rest of the notebook usable without altair.
  import altair as alt

  # Fall back to notebook-level objects so existing three-argument calls keep
  # working wherever those globals are defined.
  if genre_filter is None:
    genre_filter = globals().get('genre_filter')
  if genre_chart is None:
    genre_chart = globals().get('genre_chart')

  # Highlight the point nearest to the mouse pointer.
  nearest = alt.selection(
      type='single', encodings=['x', 'y'], on='mouseover', nearest=True,
      empty='none')

  if genre_filter is not None:
    color = alt.condition(genre_filter, "genre", alt.value("whitesmoke"))
  else:
    color = alt.value("steelblue")

  base = alt.Chart().mark_circle().encode(
      x=x,
      y=y,
      color=color,
  ).properties(
      width=600,
      height=600,
      selection=nearest)
  # Show the title only for the point currently selected by `nearest`.
  text = alt.Chart().mark_text(align='left', dx=5, dy=-5).encode(
      x=x,
      y=y,
      text=alt.condition(nearest, 'title', alt.value('')))

  if genre_chart is None:
    return alt.layer(base, text, data=data)
  return alt.hconcat(alt.layer(base, text), genre_chart, data=data)

def tsne_movie_embeddings(model):
  """Visualizes the movie embeddings, projected using t-SNE with Cosine measure.

  Args:
    model: A trained ranking `MovielensModel`; its
      `ranking_model.movie_embeddings` tower (lookup layer followed by an
      embedding layer) supplies the titles and the embedding matrix.

  Returns:
    The chart produced by `visualize_movie_embeddings`.

  Raises:
    ValueError: If the embedding layer has no weights yet (model never called).
  """
  movie_tower = model.ranking_model.movie_embeddings
  lookup_layer = movie_tower.layers[0]
  embedding_layer = movie_tower.layers[-1]

  layer_weights = embedding_layer.get_weights()
  if not layer_weights:
    raise ValueError(
        "The movie embedding layer has no weights yet; train or call the model first.")
  embedding_matrix = layer_weights[0]
  # Row i of the embedding matrix belongs to vocabulary entry i of the lookup layer.
  titles = lookup_layer.get_vocabulary()

  tsne = sklearn.manifold.TSNE(
      n_components=2, perplexity=40, metric='cosine', early_exaggeration=10.0,
      init='pca', verbose=True, n_iter=400)

  print('Running t-SNE...')
  V_proj = tsne.fit_transform(embedding_matrix)

  # `movies` is a tf.data.Dataset in this notebook, so collect the projection
  # in a dedicated DataFrame instead of writing columns onto it.
  projected = pd.DataFrame({
      'title': titles,
      'x': V_proj[:, 0],
      'y': V_proj[:, 1],
  })
  return visualize_movie_embeddings(projected, 'x', 'y')

In [None]:
# Project and plot the movie embeddings of the trained ranking model.
tsne_movie_embeddings(ranking_model)

## Test the model

In [None]:
# Predict user 42's rating for a few movies and list them best-first.
ratings_pred = {}
test_movie_titles = ["M*A*S*H (1970)", "Dances with Wolves (1990)", "Speed (1994)"]
for movie in test_movie_titles:
  prediction = ranking_model({
      "user_id": np.array(["42"]),
      "movie_title": np.array([movie])
      })
  # Store a plain float so sorting compares numbers rather than tensors and
  # the printed output is a readable score instead of a tensor repr.
  ratings_pred[movie] = float(prediction.numpy().squeeze())

print("Ratings:")
for title, score in sorted(ratings_pred.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score:.3f}")

# 4.Adding extra features
Other features of the movie data:

+ Movie genres
+ User occupations
+ Movie title (Categorical)
+ User Id (Categorical)
+ Timestamp (Continuous)


## Turning categorical features into embeddings

### Define the vocabulary

In [None]:
# Reload the raw datasets under new names; the earlier `ratings` / `movies`
# were reduced to a few features by the map() calls near the top.
ratings_new = tfds.load('movielens/100k-ratings', split='train')
movies_new = tfds.load('movielens/100k-movies', split='train')

In [None]:
# Pretty-print one raw movie record from the reloaded dataset.
for i in movies_new.take(1).as_numpy_iterator():
  pprint.pprint(i)

In [None]:
# Create a lookup layer for movie titles (no vocabulary yet).
movie_title_lookup = tf.keras.layers.StringLookup()
# Build the vocabulary from the titles that occur in the ratings.
movie_title_lookup.adapt(ratings_new.map(lambda x: x["movie_title"]))

In [None]:
# Show the first ten entries of the learned vocabulary.
print(f"Vocabulary: {movie_title_lookup.get_vocabulary()[:10]}")

In [None]:
# The layer translates raw tokens into embedding ids.
# 'Daredevil' is presumably absent from the vocabulary, to show how an
# unknown title is handled — TODO confirm.
movie_title_lookup(["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)", 'Fargo (1996)', 'Daredevil'])

### Defining embedding layer

In [None]:
# Width of the feature embeddings in this section.
embed_dim = 35
movie_title_embedding = tf.keras.layers.Embedding(
    # One embedding row per entry of the lookup layer's vocabulary.
    input_dim=movie_title_lookup.vocabulary_size(), 
    output_dim=embed_dim
)

__Put into one layer__

In [None]:
# Chain lookup and embedding so raw titles map straight to vectors.
movie_title_model = tf.keras.Sequential([movie_title_lookup, movie_title_embedding])
# Plot the layer structure to "feature_embedding.png"
# (NOTE(review): plot_model likely needs pydot/graphviz installed — confirm).
tf.keras.utils.plot_model(movie_title_model, "feature_embedding.png", show_shapes=True)

In [None]:
# Embed a single title to check the combined model end to end.
movie_title_model(["Star Wars (1977)"])

### Applying on user id

In [None]:
# Same lookup + embedding recipe, applied to user ids.
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(ratings_new.map(lambda x: x["user_id"]))

# NOTE(review): 35 repeats the value of `embed_dim` defined above — consider reusing it.
user_id_embedding = tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 35)

user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

### Applying on movie genres

In [None]:
# Same lookup + embedding recipe, applied to movie genres.
# NOTE(review): if "movie_genres" holds integer genre ids rather than strings,
# StringLookup may be the wrong layer (IntegerLookup would fit) — verify the
# feature's dtype against the example printed earlier.
genre_lookup = tf.keras.layers.StringLookup()
genre_lookup.adapt(ratings_new.map(lambda x: x["movie_genres"]))

# NOTE(review): 35 repeats the value of `embed_dim` defined above — consider reusing it.
genre_embedding = tf.keras.layers.Embedding(genre_lookup.vocabulary_size(), 35)

genre_model = tf.keras.Sequential([genre_lookup, genre_embedding])