In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
%cd /content/drive/My Drive/data/

/content/drive/My Drive/data


In [3]:
import warnings
from ast import literal_eval
from datetime import datetime
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

tf.random.set_seed(42)
warnings.filterwarnings('ignore')

In [4]:
def get_text(text, obj='name'):
    text = literal_eval(text)

    if len(text) == 1:
        for i in text:
            return i[obj]
    else:
        s = []
        for i in text:
            s.append(i[obj])
        return ', '.join(s)

In [5]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
movies = pd.read_csv('movies_metadata.csv')
ratings_df = pd.read_csv('ratings_small.csv')

In [6]:
movies.drop(['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video'], axis=1, inplace=True)
movies.drop([19730, 29503, 35587], inplace=True)
movies['id'] = movies['id'].astype('int64')

df = movies.merge(keywords, on='id').merge(credits, on='id')
df['original_language'] = df['original_language'].fillna('')
df['runtime'] = df['runtime'].fillna(0)
df['tagline'] = df['tagline'].fillna('')
df.dropna(inplace=True)

df['genres'] = df['genres'].apply(get_text)
df['production_companies'] = df['production_companies'].apply(get_text)
df['production_countries'] = df['production_countries'].apply(get_text)
df['crew'] = df['crew'].apply(get_text)
df['spoken_languages'] = df['spoken_languages'].apply(get_text)
df['keywords'] = df['keywords'].apply(get_text)
df['characters'] = df['cast'].apply(get_text, obj='character')
df['actors'] = df['cast'].apply(get_text)

df.drop('cast', axis=1, inplace=True)
df = df[~df['original_title'].duplicated()]
df = df.reset_index(drop=True)

ratings_df['date'] = ratings_df['timestamp'].apply(
    lambda x: datetime.fromtimestamp(x))
ratings_df.drop('timestamp', axis=1, inplace=True)

ratings_df = ratings_df.merge(
    df[['id', 'original_title', 'genres', 'overview']], left_on='movieId', right_on='id', how='left')
ratings_df = ratings_df[~ratings_df['id'].isna()]
ratings_df.drop('id', axis=1, inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

movies_df = df[['id', 'original_title']]
movies_df.rename(columns={'id': 'movieId'}, inplace=True)

In [7]:
df.head(3)

Unnamed: 0,adult,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,...,revenue,runtime,spoken_languages,tagline,vote_average,vote_count,keywords,crew,characters,actors
0,False,30000000,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,United States of America,...,373554033.0,81.0,English,,7.7,5415.0,"jealousy, toy, boy, friendship, friends, rival...","John Lasseter, Joss Whedon, Andrew Stanton, Jo...","Woody (voice), Buzz Lightyear (voice), Mr. Pot...","Tom Hanks, Tim Allen, Don Rickles, Jim Varney,..."
1,False,65000000,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,...,262797249.0,104.0,"English, Français",Roll the dice and unleash the excitement!,6.9,2413.0,"board game, disappearance, based on children's...","Larry J. Franco, Jonathan Hensleigh, James Hor...","Alan Parrish, Samuel Alan Parrish / Van Pelt, ...","Robin Williams, Jonathan Hyde, Kirsten Dunst, ..."
2,False,0,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"Warner Bros., Lancaster Gate",United States of America,...,0.0,101.0,English,Still Yelling. Still Fighting. Still Ready for...,6.5,92.0,"fishing, best friend, duringcreditsstinger, ol...","Howard Deutch, Mark Steven Johnson, Mark Steve...","Max Goldman, John Gustafson, Ariel Gustafson, ...","Walter Matthau, Jack Lemmon, Ann-Margret, Soph..."


In [8]:
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,1,1371,2.5,2009-12-14 02:52:15,Rocky III,Drama,"Now the world champion, Rocky Balboa is living..."
1,1,1405,1.0,2009-12-14 02:53:23,Greed,"Drama, History",Greed is the classic 1924 silent film by Erich...
2,1,2105,4.0,2009-12-14 02:52:19,American Pie,"Comedy, Romance","At a high-school party, four friends find that..."


In [9]:
movies_df.head(3)

Unnamed: 0,movieId,original_title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men


In [10]:
ratings_df['userId'] = ratings_df['userId'].astype(str)

ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[['userId', 'original_title', 'rating']]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['original_title']]))

ratings = ratings.map(lambda x: {
    "original_title": x["original_title"],
    "userId": x["userId"],
    "rating": float(x["rating"])
})

movies = movies.map(lambda x: x["original_title"])
print(f'Total Data: {len(ratings)}')

Total Data: 43188


In [11]:
shuffled = ratings.shuffle(100000, seed=42, reshuffle_each_iteration=False)
train = ratings.take(35000)
test = ratings.skip(35000).take(7777)

In [12]:
movie_titles = movies.batch(512)
user_id = ratings.batch(512).map(lambda x: x["userId"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_id = np.unique(np.concatenate(list(user_id)))

print(f'Unique Movies: {len(unique_movie_titles)}')
print(f'Unique users: {len(unique_user_id)}')

Unique Movies: 42373
Unique users: 671


In [13]:
data_train = train.shuffle(100000).batch(128).cache()
data_test = test.batch(512).cache()

In [14]:
class MovieModel(tfrs.models.Model):
    def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
        super().__init__()
        embedding_dimension = 64
        
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_movie_titles) + 1, embedding_dimension)
        ])
        
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_id, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_user_id) + 1, embedding_dimension)
        ])
        
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(1),
        ])
        
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        user_embeddings = self.user_model(features['userId'])
        movie_embeddings = self.movie_model(features['original_title'])

        return (
            user_embeddings,
            movie_embeddings,
            self.rating_model(
                tf.concat([user_embeddings, movie_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        ratings = features.pop('rating')
        user_embeddings, movie_embeddings, rating_predictions = self(features)
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

In [15]:
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.07))
model.fit(data_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3b76aae2d0>

In [16]:
metrics = model.evaluate(data_test, return_dict=True)
print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")


Retrieval top-100 accuracy: 0.088
Ranking RMSE: 1.115


In [17]:
model.summary()

Model: "movie_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 64)                2711936   
                                                                 
 sequential_1 (Sequential)   (None, 64)                43008     
                                                                 
 sequential_2 (Sequential)   (None, 1)                 66049     
                                                                 
 ranking (Ranking)           multiple                  0         
                                                                 
 retrieval (Retrieval)       multiple                  1         
                                                                 
Total params: 2,820,994
Trainable params: 2,820,993
Non-trainable params: 1
_________________________________________________________________


In [18]:
def predict_movie(user, n):
  index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
  index.index_from_dataset(
      tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
      )
  _, titles = index(tf.constant([str(user)]))
    
  print(f'Top {n} recommendations for user {user}:')
  for i, title in enumerate(titles[0, :n].numpy()):
    print(f'{i + 1}. {title.decode("utf-8")}')

In [19]:
def predict_rating(user, movie):
  movie_embeddings, user_embeddings, predicted_rating = model({
      "userId": np.array([str(user)]),
      "original_title": np.array([movie])
      })
    
  print(f"Predicted rating for {movie}: {predicted_rating.numpy()[0][0]}")

In [20]:
predict_movie(5, 5)

Top 5 recommendations for user 5:
1. Ober
2. مسافر
3. New York Doll
4. The Searchers
5. The Garden of Eden


In [21]:
predict_rating(5, 'The Searchers')

Predicted rating for The Searchers: 3.4802255630493164
