# Imports

In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reduce TensorFlow log noise via the C++ log-level variable and the Python logger.
# NOTE(review): tensorflow is already imported when this cell runs; the
# TF_CPP_MIN_LOG_LEVEL variable is usually set before the import — confirm it
# still takes effect here.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
tf.get_logger().setLevel("ERROR")   

# Ratings dataframe

In [3]:
# Location of the MovieLens ratings CSV. Set the MOVIELENS_CSV environment
# variable to point at another copy; when it is unset the original local path
# is used, so existing setups keep working.
ratings_csv_path = os.environ.get(
    "MOVIELENS_CSV", r'D:\repos\udemy\.dataset\movie_lens_data.csv'
)
ratings_df = pd.read_csv(ratings_csv_path)

In [4]:
# Preview the first rows of the freshly loaded ratings frame.
ratings_df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,7,357,One Flew Over the Cuckoo's Nest (1975),46.0,879024327,m,138,4,doctor,4.0,53211
1,25.0,4,709,Strictly Ballroom (1992),32.0,875654590,m,92,5,entertainment,2.0,80525
2,18.0,4,412,"Very Brady Sequel, A (1996)",24.0,882075110,m,301,17,student,4.0,55439
3,50.0,5,56,Pulp Fiction (1994),50.0,883326919,m,60,4,healthcare,4.0,6472
4,50.0,10,895,Scream 2 (1997),55.0,891409199,m,197,18,technician,3.0,75094


In [5]:
# Keep only the feature columns the models consume. `.copy()` makes the result
# an independent frame, so the column assignments performed later do not write
# into a slice of the raw data (which raises pandas' SettingWithCopyWarning).
ratings_df = ratings_df[['user_id', 'user_occupation_text', 'movie_id', 'movie_genres', 'user_gender', 'bucketized_user_age']].copy()

In [6]:
# Row and column counts after the column selection.
ratings_df.shape

(100000, 6)

In [7]:
# Preview the frame now that only the selected columns remain.
ratings_df.head()

Unnamed: 0,user_id,user_occupation_text,movie_id,movie_genres,user_gender,bucketized_user_age
0,138,doctor,357,7,m,45.0
1,92,entertainment,709,4,m,25.0
2,301,student,412,4,m,18.0
3,60,healthcare,56,5,m,50.0
4,197,technician,895,10,m,50.0


In [8]:
# Column dtypes as loaded, before any casting.
ratings_df.dtypes

user_id                   int64
user_occupation_text     object
movie_id                  int64
movie_genres              int64
user_gender              object
bucketized_user_age     float64
dtype: object

In [9]:
# Count the distinct (movie, user) pairs. A result equal to the row count
# means no user rated the same movie twice.
ratings_df[['movie_id', 'user_id']].drop_duplicates().shape[0]

100000

In [10]:
# Number of distinct users.
ratings_df['user_id'].nunique()

943

In [11]:
# Number of distinct movies.
ratings_df['movie_id'].nunique()

1682

In [12]:
# Distinct occupation labels, in order of first appearance.
ratings_df['user_occupation_text'].unique()

array(['doctor', 'entertainment', 'student', 'healthcare', 'technician',
       'artist', 'writer', 'engineer', 'administrator', 'librarian',
       'other', 'marketing', 'scientist', 'programmer', 'educator',
       'executive', 'retired', 'homemaker', 'salesman', 'lawyer', 'none'],
      dtype=object)

In [13]:
# Cast the id-like features to strings so they can feed StringLookup layers.
# The age bucket goes through int first so that 45.0 becomes "45", not "45.0".
# `assign` builds a new frame instead of setting attributes on the existing
# one, which avoids writing into a slice (SettingWithCopyWarning) and keeps the
# cell safe to re-run.
ratings_df = ratings_df.assign(
    user_id=ratings_df['user_id'].astype('str'),
    movie_id=ratings_df['movie_id'].astype('str'),
    movie_genres=ratings_df['movie_genres'].astype('str'),
    bucketized_user_age=ratings_df['bucketized_user_age'].astype('int').astype('str'),
)

In [14]:
# Re-check the column dtypes after the string casts.
ratings_df.dtypes

user_id                 object
user_occupation_text    object
movie_id                object
movie_genres            object
user_gender             object
bucketized_user_age     object
dtype: object

# Unique values

In [15]:
# Distinct values of every feature column; these arrays serve as the
# vocabularies of the StringLookup layers defined further down.
unique_user_ids = ratings_df['user_id'].unique()
unique_movies_ids = ratings_df['movie_id'].unique()
unique_movie_genres = ratings_df['movie_genres'].unique()
unique_user_occupation = ratings_df['user_occupation_text'].unique()
unique_user_gender = ratings_df['user_gender'].unique()
unique_age_bucket = ratings_df['bucketized_user_age'].unique()

In [16]:
# Dump the small vocabularies, one per line, as a sanity check.
print("Checking the unique values:")
print(
    unique_user_gender,
    unique_movie_genres,
    unique_user_occupation,
    unique_age_bucket,
    sep="\n",
)

Checking the unique values:
['m' 'f']
['7' '4' '5' '10' '2' '0' '1' '3' '16' '15' '14' '6' '9' '13' '19' '12'
 '17' '8' '18']
['doctor' 'entertainment' 'student' 'healthcare' 'technician' 'artist'
 'writer' 'engineer' 'administrator' 'librarian' 'other' 'marketing'
 'scientist' 'programmer' 'educator' 'executive' 'retired' 'homemaker'
 'salesman' 'lawyer' 'none']
['45' '25' '18' '50' '56' '35' '1']


# Data as tensor dataset

In [17]:
# Slice the frame column-wise so that each dataset element is one rating row
# expressed as a {column name: value} dict.
interactions_dict = dict(ratings_df.items())
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)
type(interactions)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

In [18]:
# One row per distinct (movie_id, movie_genres) pair forms the candidate set.
# The intermediate frame gets its own name so `movies_dict` is only ever a dict.
movie_features = ratings_df[['movie_id', 'movie_genres']].drop_duplicates()
movies_dict = {column: np.array(values) for column, values in movie_features.items()}
movies = tf.data.Dataset.from_tensor_slices(movies_dict)
type(movies)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

In [19]:
# Preview the frame once more for comparison with the dataset elements printed next.
ratings_df.head()

Unnamed: 0,user_id,user_occupation_text,movie_id,movie_genres,user_gender,bucketized_user_age
0,138,doctor,357,7,m,45
1,92,entertainment,709,4,m,25
2,301,student,412,4,m,18
3,60,healthcare,56,5,m,50
4,197,technician,895,10,m,50


In [20]:
# Show the first five interaction records as plain NumPy/Python values.
for record in interactions.take(5).as_numpy_iterator():
    print(record)

{'user_id': b'138', 'user_occupation_text': b'doctor', 'movie_id': b'357', 'movie_genres': b'7', 'user_gender': b'm', 'bucketized_user_age': b'45'}
{'user_id': b'92', 'user_occupation_text': b'entertainment', 'movie_id': b'709', 'movie_genres': b'4', 'user_gender': b'm', 'bucketized_user_age': b'25'}
{'user_id': b'301', 'user_occupation_text': b'student', 'movie_id': b'412', 'movie_genres': b'4', 'user_gender': b'm', 'bucketized_user_age': b'18'}
{'user_id': b'60', 'user_occupation_text': b'healthcare', 'movie_id': b'56', 'movie_genres': b'5', 'user_gender': b'm', 'bucketized_user_age': b'50'}
{'user_id': b'197', 'user_occupation_text': b'technician', 'movie_id': b'895', 'movie_genres': b'10', 'user_gender': b'm', 'bucketized_user_age': b'50'}


In [21]:
# Show the first five candidate movies as plain NumPy/Python values.
for movie in movies.take(5).as_numpy_iterator():
    print(movie)

{'movie_id': b'357', 'movie_genres': b'7'}
{'movie_id': b'709', 'movie_genres': b'4'}
{'movie_id': b'412', 'movie_genres': b'4'}
{'movie_id': b'56', 'movie_genres': b'5'}
{'movie_id': b'895', 'movie_genres': b'10'}


In [22]:
# Re-emit every record with an explicit, ordered set of feature keys.
interactions = interactions.map(
    lambda row: {
        feature: row[feature]
        for feature in (
            'user_id',
            'movie_id',
            'movie_genres',
            "user_occupation_text",
            "user_gender",
            "bucketized_user_age",
        )
    }
)

In [23]:
# Confirm the object type returned by map().
type(interactions)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [24]:
# Re-emit every candidate with an explicit, ordered pair of feature keys.
movies = movies.map(
    lambda row: {feature: row[feature] for feature in ('movie_id', 'movie_genres')}
)

In [25]:
# Confirm the object type returned by map().
type(movies)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [26]:
# Inspect the first five interaction records after the key projection.
for record in interactions.take(5).as_numpy_iterator():
    print(record)

{'user_id': b'138', 'movie_id': b'357', 'movie_genres': b'7', 'user_occupation_text': b'doctor', 'user_gender': b'm', 'bucketized_user_age': b'45'}
{'user_id': b'92', 'movie_id': b'709', 'movie_genres': b'4', 'user_occupation_text': b'entertainment', 'user_gender': b'm', 'bucketized_user_age': b'25'}
{'user_id': b'301', 'movie_id': b'412', 'movie_genres': b'4', 'user_occupation_text': b'student', 'user_gender': b'm', 'bucketized_user_age': b'18'}
{'user_id': b'60', 'movie_id': b'56', 'movie_genres': b'5', 'user_occupation_text': b'healthcare', 'user_gender': b'm', 'bucketized_user_age': b'50'}
{'user_id': b'197', 'movie_id': b'895', 'movie_genres': b'10', 'user_occupation_text': b'technician', 'user_gender': b'm', 'bucketized_user_age': b'50'}


In [27]:
# Inspect the first five candidate movies after the key projection.
for movie in movies.take(5).as_numpy_iterator():
    print(movie)

{'movie_id': b'357', 'movie_genres': b'7'}
{'movie_id': b'709', 'movie_genres': b'4'}
{'movie_id': b'412', 'movie_genres': b'4'}
{'movie_id': b'56', 'movie_genres': b'5'}
{'movie_id': b'895', 'movie_genres': b'10'}


In [28]:
# 80% of the rows go to training; int() truncates any fractional row.
trainset_size = int(0.8 * len(ratings_df))

In [29]:
# Fix the global TensorFlow seed so repeated runs behave the same way.
tf.random.set_seed(42)

# Shuffle once over the whole dataset. The buffer is as large as the frame, so
# elements are drawn from all rows rather than from a sliding window of the
# first few. `reshuffle_each_iteration` defaults to True, which would produce
# a new order on every pass; it is switched off so each iteration sees the
# same order and the take()/skip() split that follows stays disjoint.
ratings_dataset_shuffled = interactions.shuffle(
    len(ratings_df),
    seed=42,
    reshuffle_each_iteration=False,
)

# Train test split

In [30]:
# The first `trainset_size` shuffled rows train the model; the rest are held out.
# NOTE(review): only the test pipeline passes drop_remainder=True, so its final
# partial batch is discarded while the train pipeline keeps its own — confirm
# that this asymmetry is intended.
train = ratings_dataset_shuffled.take(trainset_size).batch(batch_size=2048)
test = ratings_dataset_shuffled.skip(trainset_size).batch(batch_size=2048, drop_remainder=True)

In [31]:
# Print one training batch to check the batched feature layout.
for batch in train.take(1).as_numpy_iterator():
    print(batch)

{'user_id': array([b'681', b'442', b'932', ..., b'305', b'269', b'115'], dtype=object), 'movie_id': array([b'898', b'367', b'484', ..., b'88', b'403', b'178'], dtype=object), 'movie_genres': array([b'7', b'4', b'9', ..., b'4', b'0', b'7'], dtype=object), 'user_occupation_text': array([b'marketing', b'student', b'educator', ..., b'programmer',
       b'librarian', b'engineer'], dtype=object), 'user_gender': array([b'f', b'm', b'm', ..., b'm', b'f', b'm'], dtype=object), 'bucketized_user_age': array([b'35', b'18', b'56', ..., b'18', b'25', b'25'], dtype=object)}


In [32]:
# Print one test batch to check the batched feature layout.
for batch in test.take(1).as_numpy_iterator():
    print(batch)

{'user_id': array([b'346', b'602', b'393', ..., b'195', b'450', b'42'], dtype=object), 'movie_id': array([b'211', b'678', b'135', ..., b'93', b'506', b'161'], dtype=object), 'movie_genres': array([b'4', b'7', b'7', ..., b'4', b'7', b'0'], dtype=object), 'user_occupation_text': array([b'other', b'other', b'student', ..., b'scientist', b'educator',
       b'administrator'], dtype=object), 'user_gender': array([b'm', b'f', b'm', ..., b'm', b'f', b'm'], dtype=object), 'bucketized_user_age': array([b'25', b'45', b'18', ..., b'35', b'35', b'25'], dtype=object)}


In [33]:
# Both datasets are batched, so these lengths are numbers of batches, not rows.
print("ratings_trainset size: %d" % len(train))
print("ratings_testset size: %d" % len(test))

ratings_trainset size: 40
ratings_testset size: 9


In [34]:
def inspect_embeddings(unique_user_ids, dims):
    """Build a small lookup + embedding pipeline for inspecting the id mapping.

    Args:
        unique_user_ids: vocabulary of string ids handed to `StringLookup`.
        dims: width of each embedding vector.

    Returns:
        A `(lookup, embedding_model)` tuple: the bare `StringLookup` layer and
        a `Sequential` that chains it with an `Embedding` layer.
    """
    lookup = tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
    # vocabulary_size() replaces the deprecated vocab_size() and already counts
    # the out-of-vocabulary index, so it is exactly the number of rows needed
    # (the same size as the `len(vocabulary) + 1` used by the model classes).
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=lookup.vocabulary_size(), output_dim=dims
    )
    embedding_model = tf.keras.Sequential([lookup, embedding_layer])
    return lookup, embedding_model

In [35]:
# Build a 32-dimensional inspection pipeline over the user-id vocabulary.
lookup, embedding_model = inspect_embeddings(unique_user_ids, dims=32)

In [36]:
# Look up a single user id given as a bytes literal: first the integer index
# from the lookup layer alone, then the embedding row from the full pipeline.
print(f"Mapped integer for user ids: {[b'346']} -> {lookup([b'346'])}")
print(f"Embedding output for user ids: {[b'346']} -> \n{embedding_model([b'346'])}")

Mapped integer for user ids: [b'346'] -> [187]
Embedding output for user ids: [b'346'] -> 
[[ 0.00860835  0.03325791 -0.02971803 -0.02075641 -0.00764825 -0.04563844
   0.01859008 -0.01827854 -0.03634527 -0.02486396  0.01325032  0.04104123
  -0.02533175  0.02478793 -0.01290418  0.02862498 -0.04919645  0.02861983
   0.03364236 -0.0482375  -0.03242952  0.04528551  0.01942979  0.03789414
  -0.02080848 -0.00676461  0.01499803 -0.0090589  -0.02230252 -0.00789442
  -0.03975085 -0.02763752]]


In [37]:
# Same lookup with a str instead of bytes, for comparison with the bytes case.
print(f"Mapped integer for user ids: {['346']} -> {lookup(['346'])}")
print(f"Embedding output for user ids: {['346']} -> \n{embedding_model(['346'])}")

Mapped integer for user ids: ['346'] -> [187]
Embedding output for user ids: ['346'] -> 
[[ 0.00860835  0.03325791 -0.02971803 -0.02075641 -0.00764825 -0.04563844
   0.01859008 -0.01827854 -0.03634527 -0.02486396  0.01325032  0.04104123
  -0.02533175  0.02478793 -0.01290418  0.02862498 -0.04919645  0.02861983
   0.03364236 -0.0482375  -0.03242952  0.04528551  0.01942979  0.03789414
  -0.02080848 -0.00676461  0.01499803 -0.0090589  -0.02230252 -0.00789442
  -0.03975085 -0.02763752]]


# Model

In [38]:
class UserModel(tf.keras.Model):
  """Query-side feature model.

  Each categorical user feature (id, occupation, gender, age bucket) runs
  through its own StringLookup + Embedding pipeline; `call` concatenates the
  four resulting vectors along axis 1.
  """

  def __init__(self):
    super().__init__()

    # Vocabularies come from the notebook-level `unique_*` arrays.
    self.user_embedding = self._string_embedding(unique_user_ids, 32)
    self.user_occupation_embedding = self._string_embedding(unique_user_occupation, 32)
    self.user_gender_embedding = self._string_embedding(unique_user_gender, 16)
    self.user_age_embedding = self._string_embedding(unique_age_bucket, 32)

  @staticmethod
  def _string_embedding(vocabulary, width):
    """Return a Sequential mapping raw strings to `width`-dimensional vectors."""
    string_to_index = tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None)
    # One row more than the vocabulary has entries.
    index_to_vector = tf.keras.layers.Embedding(len(vocabulary) + 1, width)
    return tf.keras.Sequential([string_to_index, index_to_vector])

  def call(self, inputs):
    # (pipeline, input key) pairs in the order they appear in the output.
    feature_pipelines = (
        (self.user_embedding, "user_id"),
        (self.user_occupation_embedding, "user_occupation_text"),
        (self.user_gender_embedding, "user_gender"),
        (self.user_age_embedding, "bucketized_user_age"),
    )
    embedded = [pipeline(inputs[key]) for pipeline, key in feature_pipelines]
    return tf.concat(embedded, axis=1)

In [39]:
class MovieModel(tf.keras.Model):
  """Candidate-side feature model.

  The movie id and the movie genre each run through their own StringLookup +
  Embedding pipeline; `call` concatenates the two vectors along axis 1.
  """

  def __init__(self):
    super().__init__()

    # Vocabularies come from the notebook-level `unique_*` arrays.
    self.movie_embedding = self._string_embedding(unique_movies_ids, 32)
    self.movie_genre_embedding = self._string_embedding(unique_movie_genres, 32)

  @staticmethod
  def _string_embedding(vocabulary, width):
    """Return a Sequential mapping raw strings to `width`-dimensional vectors."""
    string_to_index = tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None)
    # One row more than the vocabulary has entries.
    index_to_vector = tf.keras.layers.Embedding(len(vocabulary) + 1, width)
    return tf.keras.Sequential([string_to_index, index_to_vector])

  def call(self, inputs):
    id_vector = self.movie_embedding(inputs["movie_id"])
    genre_vector = self.movie_genre_embedding(inputs["movie_genres"])
    return tf.concat([id_vector, genre_vector], axis=1)

In [40]:
# Standalone MovieModel instance.
# NOTE(review): no later visible cell reads `movie_model` (MovielensModel
# builds its own MovieModel) — confirm whether this cell is still needed.
movie_model = MovieModel()  

In [41]:
class MovielensModel(tfrs.models.Model):
  """Two-tower retrieval model over the MovieLens interactions.

  A UserModel and a MovieModel are each followed by a Dense(128) projection.
  The retrieval task compares the two projections and reports FactorizedTopK
  metrics against the embedded `movies` dataset.
  """

  def __init__(self):
    super().__init__()
    self.query_model = tf.keras.Sequential([UserModel(), tf.keras.layers.Dense(128)])
    self.candidate_model = tf.keras.Sequential([MovieModel(), tf.keras.layers.Dense(128)])

    # Candidates for the top-k metrics: every movie pushed through the
    # candidate tower, 128 at a time.
    candidate_embeddings = movies.batch(128).map(self.candidate_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features, training=False):
    """Return the retrieval task's result for one batch of interaction features.

    The top-k metrics are only computed when `training` is false.
    """
    user_keys = ("user_id", "user_occupation_text", "user_gender", "bucketized_user_age")
    movie_keys = ("movie_id", "movie_genres")

    # Hand each tower only the features it consumes.
    query_embeddings = self.query_model({key: features[key] for key in user_keys})
    movie_embeddings = self.candidate_model({key: features[key] for key in movie_keys})

    return self.task(query_embeddings, movie_embeddings, compute_metrics=not training)

In [42]:
# Instantiate the two-tower model and attach an Adagrad optimizer (lr 0.1).
model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [151]:
# Train for 20 epochs over the batched training set. compute_loss skips the
# top-k metrics while training, so they are only computed during evaluation.
model.fit(train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x17505da0670>

In [152]:
# Evaluate on the held-out batches; return_dict=True yields the metrics keyed by name.
model.evaluate(test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.00032552084303461015,
 'factorized_top_k/top_5_categorical_accuracy': 0.0035807292442768812,
 'factorized_top_k/top_10_categorical_accuracy': 0.011067708022892475,
 'factorized_top_k/top_50_categorical_accuracy': 0.0967881977558136,
 'factorized_top_k/top_100_categorical_accuracy': 0.2119683176279068,
 'loss': 15058.53125,
 'regularization_loss': 0,
 'total_loss': 15058.53125}