In [None]:
# Install / upgrade TensorFlow and TensorFlow Recommenders for this session.
# NOTE(review): no versions are pinned, so every run may pick up a different
# release pair — consider pinning (and using `%pip`) once a known-good
# combination has been confirmed.
!pip install --upgrade tensorflow tensorflow_recommenders

In [None]:
# Mount Google Drive under /content/drive/ (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Switch to the Drive data folder; the CSV files are later read by bare filename.
%cd /content/drive/My Drive/data/

In [None]:
import warnings
from ast import literal_eval
from datetime import datetime
from typing import Dict, Text, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Fix TensorFlow's global random seed.
tf.random.set_seed(42)
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/TensorFlow warnings that may point at
# real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
def get_text(text, obj='name'):
    """Extract one field from a stringified collection of dicts.

    Args:
        text: String holding a Python literal (parsed with ``literal_eval``)
            whose items are dicts.
        obj: Key to read from every dict. Defaults to ``'name'``.

    Returns:
        The value stored under ``obj`` when the collection has exactly one
        item; otherwise the values of all items joined with ``', '`` (an
        empty string for an empty collection).
    """
    entries = literal_eval(text)

    # Anything other than a single item (including none) is joined.
    if len(entries) != 1:
        return ', '.join([entry[obj] for entry in entries])

    # Exactly one item: hand back its value as-is.
    return next(iter(entries))[obj]

In [None]:
# Load the four raw CSV files from the current working directory, in the
# order credits, keywords, movie metadata, ratings.
credits, keywords, movies, ratings_df = [
    pd.read_csv(csv_name)
    for csv_name in (
        'credits.csv',
        'keywords.csv',
        'movies_metadata.csv',
        'ratings_small.csv',
    )
]

In [None]:
# --- Movie metadata -------------------------------------------------------
# Work on a derived frame instead of mutating the raw `movies` frame in place.
movies_meta = movies.drop(
    columns=['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path',
             'status', 'title', 'video'])

# Keep only rows whose `id` is numeric, then cast it to int64 so it can be
# merged with the other tables. The original notebook dropped rows 19730,
# 29503 and 35587 by index before the cast; presumably those are the rows
# with a non-numeric `id` — TODO confirm against the CSV. Detecting them from
# the data keeps the cell working if the file is re-exported.
numeric_ids = pd.to_numeric(movies_meta['id'], errors='coerce')
movies_meta = movies_meta[numeric_ids.notna()].assign(
    id=numeric_ids.dropna().astype('int64'))

# Join metadata with keywords and credits, fill the columns that are allowed
# to be missing, and drop rows that still contain a null anywhere else.
df = (
    movies_meta
    .merge(keywords, on='id')
    .merge(credits, on='id')
    .fillna({'original_language': '', 'runtime': 0, 'tagline': ''})
    .dropna()
)

# Flatten the stringified list-of-dict columns into plain comma-separated text.
for column in ['genres', 'production_companies', 'production_countries',
               'crew', 'spoken_languages', 'keywords']:
    df[column] = df[column].apply(get_text)
df['characters'] = df['cast'].apply(get_text, obj='character')
df['actors'] = df['cast'].apply(get_text)

# Drop the raw cast column and keep the first row for every original title.
df = (
    df
    .drop(columns='cast')
    .drop_duplicates(subset='original_title')
    .reset_index(drop=True)
)

# --- Ratings --------------------------------------------------------------
# `ratings_df` is overwritten here, so re-running this cell requires
# re-running the CSV loading cell first.
# The timestamp is converted as seconds since the epoch, which gives naive UTC
# datetimes regardless of the machine's local timezone (the previous
# `datetime.fromtimestamp` call used local time).
# NOTE(review): this joins the ratings' `movieId` directly to the metadata
# `id`; if the two files use different id schemes (e.g. MovieLens vs TMDB ids)
# a mapping table would be needed first — verify.
ratings_df = (
    ratings_df
    .assign(date=pd.to_datetime(ratings_df['timestamp'], unit='s'))
    .drop(columns='timestamp')
    # An inner merge keeps only ratings whose movie survived the cleaning
    # above, which is what the former left-merge + NaN filter achieved.
    .merge(df[['id', 'original_title', 'genres', 'overview']],
           left_on='movieId', right_on='id', how='inner')
    .drop(columns='id')
    .reset_index(drop=True)
)

# Candidate table: one row per movie. `rename` returns a new frame, avoiding
# an in-place edit on a slice of `df`.
movies_df = df[['id', 'original_title']].rename(columns={'id': 'movieId'})

In [None]:
# Preview the first three rows of the merged movie table.
df.head(3)

In [None]:
# Preview the first three rows of the ratings table.
ratings_df.head(3)

In [None]:
# Preview the first three rows of the movie id/title table.
movies_df.head(3)

In [None]:
# User ids are fed to the model as strings, so store them as text.
ratings_df['userId'] = ratings_df['userId'].astype(str)

# Turn the two frames into tf.data pipelines.
# Note: from here on `ratings` and `movies` are tf.data.Dataset objects, not
# DataFrames (the frames remain available as `ratings_df` / `movies_df`).
ratings = tf.data.Dataset.from_tensor_slices(
    dict(ratings_df[['userId', 'original_title', 'rating']]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['original_title']]))

ratings = ratings.map(lambda x: {
    "original_title": x["original_title"],
    "userId": x["userId"],
    # Explicit cast instead of the Python builtin `float(...)`, which only
    # works on a symbolic tensor when AutoGraph rewrites the lambda.
    "rating": tf.cast(x["rating"], tf.float32),
})

# The candidate dataset yields bare title strings.
movies = movies.map(lambda x: x["original_title"])
print(f'Total Data: {len(ratings)}')

In [None]:
# Count missing values per column. This must run on the DataFrame: by this
# point `ratings` is a tf.data.Dataset, which has no `isnull` method.
ratings_df.isnull().sum()

In [None]:
# Derive the split from the real dataset size. The previous hard-coded
# 100_000 / 80_000 / 20_000 only fit a dataset of exactly 100k rows; with
# fewer than 80k rows the test split would have been empty.
TRAIN_FRACTION = 0.8
n_ratings = len(ratings)
n_train = int(n_ratings * TRAIN_FRACTION)

# A buffer covering the whole dataset gives a full shuffle; disabling
# reshuffling keeps `take` / `skip` disjoint across iterations.
shuffled = ratings.shuffle(
    max(n_ratings, 1), seed=42, reshuffle_each_iteration=False)

train = shuffled.take(n_train)
test = shuffled.skip(n_train)

In [None]:
def _distinct_values(batched_dataset):
    """Materialise every batch of the dataset and return its unique values."""
    return np.unique(np.concatenate(list(batched_dataset)))


# Batched views of the movie titles and of the user ids found in the ratings.
movie_titles = movies.batch(512)
user_id = ratings.batch(512).map(lambda x: x["userId"])

# Vocabularies used by the lookup layers of the model.
unique_movie_titles = _distinct_values(movie_titles)
unique_user_id = _distinct_values(user_id)

print(f'Unique Movies: {len(unique_movie_titles)}')
print(f'Unique users: {len(unique_user_id)}')

In [None]:
class MovieModel(tfrs.models.Model):
    """Two-tower recommender trained jointly on retrieval and rating prediction.

    A user tower and a movie tower each map a string id/title to an embedding.
    The two embeddings feed (a) a retrieval task and (b) a small dense network
    that predicts the rating. The final loss is a weighted sum of both task
    losses.

    The vocabularies (`unique_user_id`, `unique_movie_titles`) and the
    candidate dataset (`movies`) are read from the notebook's global scope.
    """

    def __init__(self, rating_weight: float, retrieval_weight: float,
                 embedding_dimension: int = 64) -> None:
        """Build both towers, the rating head and the two tasks.

        Args:
            rating_weight: Multiplier applied to the rating (MSE) loss.
            retrieval_weight: Multiplier applied to the retrieval loss.
            embedding_dimension: Width of the user and movie embeddings.
                Defaults to 64, the value previously hard-coded.
        """
        super().__init__()

        # Movie tower: title string -> integer index -> embedding.
        # The embedding table has one more row than the vocabulary.
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_movie_titles) + 1, embedding_dimension)
        ])

        # User tower: user id string -> integer index -> embedding.
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_id, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_user_id) + 1, embedding_dimension)
        ])

        # Rating head: concatenated embeddings -> single predicted rating.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1),
        ])

        # Rating task: mean squared error loss, reported as RMSE.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

        # Retrieval task: top-K metrics are computed against every movie in
        # the global `movies` dataset, embedded with the movie tower.
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(
        self, features: Dict[Text, tf.Tensor]
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Embed a batch and predict its ratings.

        Args:
            features: Mapping with at least the keys ``'userId'`` and
                ``'original_title'``.

        Returns:
            A 3-tuple ``(user_embeddings, movie_embeddings,
            rating_predictions)`` — in that order.
        """
        user_embeddings = self.user_model(features['userId'])
        movie_embeddings = self.movie_model(features['original_title'])

        return (
            user_embeddings,
            movie_embeddings,
            # The rating head sees both embeddings side by side.
            self.rating_model(
                tf.concat([user_embeddings, movie_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the weighted sum of the rating and retrieval losses.

        Args:
            features: Mapping with the keys ``'userId'``,
                ``'original_title'`` and ``'rating'``. It is not modified.
            training: Accepted for interface compatibility; not used here.

        Returns:
            ``rating_weight * rating_loss + retrieval_weight * retrieval_loss``.
        """
        # Read the label without popping it, so the caller's dict stays intact.
        labels = features['rating']
        model_inputs = {
            key: value for key, value in features.items() if key != 'rating'
        }

        user_embeddings, movie_embeddings, rating_predictions = self(model_inputs)
        rating_loss = self.rating_task(
            labels=labels,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

In [None]:
# Instantiate the model with both tasks weighted equally, then compile it
# with an Adagrad optimizer (learning rate 0.1).
model = MovieModel(
    retrieval_weight=1.0,
    rating_weight=1.0,
)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [None]:
# Train for 5 epochs on the training split, in batches of 128.
model.fit(train.batch(128), epochs=5)

In [None]:
# Evaluate on the held-out split; `return_dict=True` yields metrics keyed by name.
metrics = model.evaluate(test.batch(128), return_dict=True)

In [None]:
# Report one headline metric per task, rounded to three decimals.
print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")

In [None]:
def predict_movie(user, n):
    """Print the top-``n`` movie titles retrieved for ``user``.

    Builds a brute-force index over every title in the global ``movies``
    dataset (embedded with ``model.movie_model``) and queries it with the
    user's embedding. The index is rebuilt on every call.

    Args:
        user: User id; converted to ``str`` before the lookup.
        n: Number of recommendations to retrieve and print.
    """
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
    index.index_from_dataset(
        tf.data.Dataset.zip(
            (movies.batch(100), movies.batch(100).map(model.movie_model)))
    )
    # Ask the index for exactly n results. Relying on its default k would
    # silently cap the list, so a larger n printed fewer titles than the
    # header announced.
    _, titles = index(tf.constant([str(user)]), k=n)

    print(f'Top {n} recommendations for user {user}:')
    for rank, title in enumerate(titles[0].numpy(), start=1):
        print(f'{rank}. {title.decode("utf-8")}')

In [None]:
def predict_rating(user, movie):
    """Print the rating the model predicts for ``user`` on ``movie``.

    Args:
        user: User id; converted to ``str`` before the lookup.
        movie: Movie title, as stored in the ``original_title`` column.
    """
    # The model returns (user_embeddings, movie_embeddings, rating), in that
    # order. Only the rating is needed here, so the embeddings are discarded
    # rather than bound to (previously swapped) names.
    _, _, predicted_rating = model({
        "userId": np.array([str(user)]),
        "original_title": np.array([movie])
    })

    print(f"Predicted rating for {movie}: {predicted_rating.numpy()[0][0]}")

In [None]:
# Example: top 5 recommendations for user 5.
predict_movie(5, 5)

In [None]:
# Example: predicted rating of user 5 for 'The Searchers'.
predict_rating(5, 'The Searchers')