## 1. 🚚 Data Loading & Preparation

Let's start by loading the pre-cleaned MovieLens ratings data and prepping it for modeling.

In [2]:
import pandas as pd

# Load cleaned ratings data
df = pd.read_csv("../data/movielens/cleaned_ratings.csv")
print("Sample of the dataset:")
print(df.head())

Sample of the dataset:
   user_id  movie_id  rating            timestamp                       title  \
0      196       242       3  1997-12-04 15:55:49                Kolya (1996)   
1      186       302       3  1998-04-04 19:22:22    L.A. Confidential (1997)   
2       22       377       1  1997-11-07 07:18:36         Heavyweights (1994)   
3      244        51       2  1997-11-27 05:02:03  Legends of the Fall (1994)   
4      166       346       1  1998-02-02 05:33:16         Jackie Brown (1997)   

  release_date  video_release_date  \
0  24-Jan-1997                 NaN   
1  01-Jan-1997                 NaN   
2  01-Jan-1994                 NaN   
3  01-Jan-1994                 NaN   
4  01-Jan-1997                 NaN   

                                            IMDb_URL  unknown  Action  ...  \
0    http://us.imdb.com/M/title-exact?Kolya%20(1996)        0       0  ...   
1  http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...        0       0  ...   
2  http://us.imdb.com/M/tit

### Explanation: 
> The dataaset contains user ratings for movies, with each row representing a rating containing the `user_id, movie_id, rating, timestamp, title`. I will be focusing on user, movie_id, and rating for the baseline model. 

## 2. Data Formatting for TensorFlow
Here we shuffle and split the data to ensure a fair evaluation and avoid overfitting. IDs are converted to strings so they can be embedded by the neural network.

In [5]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

import tensorflow as tf

# Ensure user and movie IDs are strings for embeddings
df["user_id"] = df["user_id"].astype(str)
df["movie_id"] = df["movie_id"].astype(str)

# Shuffle and split data
shuffled = df.sample(frac=1, random_state=42)
n = int(len(shuffled) * 0.8)
train = shuffled.iloc[:n]
test = shuffled.iloc[n:]

# Convert to tf.data.Dataset
train_ds = tf.data.Dataset.from_tensor_slices({
    "user_id": train["user_id"].values,
    "movie_id": train["movie_id"].values,
    "rating": train["rating"].astype(float).values,
})
test_ds = tf.data.Dataset.from_tensor_slices({
    "user_id": test["user_id"].values,
    "movie_id": test["movie_id"].values,
    "rating": test["rating"].astype(float).values,
})

## 3. Building a Two-Tower Deep Learning Model
We'll use a two-tower architecture, where one tower learns user representations and the other learns movie representations. Their similarity determines recommendations.

In [6]:
import tensorflow_recommenders as tfrs
import numpy as np

print("TF:", tf.__version__)
print("TFRS:", tfrs.__version__)

unique_user_ids = np.array(train["user_id"].astype(str).unique())
unique_movie_ids = np.array(train["movie_id"].astype(str).unique())

# Debug prints
print("Type of unique_movie_ids:", type(unique_movie_ids))
print("Length of unique_movie_ids:", len(unique_movie_ids))
print("Sample of unique_movie_ids:", unique_movie_ids[:10])
assert isinstance(unique_movie_ids, (np.ndarray, list)), f"Not array/list: {type(unique_movie_ids)}"
assert isinstance(unique_movie_ids[0], str), f"Not string: {type(unique_movie_ids[0])}"

# debugging candidates
candidates = tf.data.Dataset.from_tensor_slices(unique_movie_ids).batch(128)
print("candidates:", candidates)
for batch in candidates.take(1):
    print(batch)

class MovieLensModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        embedding_dim = 32

        self.user_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dim)
        ])
        self.movie_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_movie_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_movie_ids) + 1, embedding_dim)
        ])

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates
            )
        )

    def compute_loss(self, features, training=False):
        user_embeddings = self.user_model(features["user_id"])
        movie_embeddings = self.movie_model(features["movie_id"])
        return self.task(user_embeddings, movie_embeddings)

TF: 2.16.1
TFRS: v0.7.3
Type of unique_movie_ids: <class 'numpy.ndarray'>
Length of unique_movie_ids: 1651
Sample of unique_movie_ids: ['381' '602' '431' '875' '182' '1074' '286' '496' '15' '184']
candidates: <_BatchDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>
tf.Tensor(
[b'381' b'602' b'431' b'875' b'182' b'1074' b'286' b'496' b'15' b'184'
 b'864' b'568' b'1197' b'99' b'31' b'14' b'176' b'618' b'476' b'82' b'492'
 b'303' b'408' b'483' b'180' b'109' b'156' b'1281' b'2' b'979' b'214'
 b'1165' b'245' b'326' b'651' b'95' b'45' b'318' b'684' b'682' b'38'
 b'257' b'77' b'1091' b'647' b'230' b'636' b'276' b'386' b'519' b'1086'
 b'205' b'187' b'234' b'934' b'240' b'742' b'547' b'273' b'1448' b'195'
 b'204' b'569' b'235' b'810' b'1015' b'845' b'324' b'26' b'219' b'1016'
 b'28' b'628' b'1446' b'288' b'395' b'100' b'83' b'428' b'289' b'278'
 b'111' b'271' b'269' b'1142' b'175' b'473' b'125' b'121' b'190' b'610'
 b'11' b'50' b'844' b'298' b'919' b'762' b'97' b'1210'

2025-06-06 12:43:23.916398: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## 4. Training the Model
Training recommendation model based on training dataset. 

In [7]:
# Prepare batched, cached datasets for efficiency
cached_train = train_ds.shuffle(100_000).batch(8192).cache()
cached_test = test_ds.batch(4096).cache()

model = MovieLensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Train the model
model.fit(cached_train, epochs=5)

ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Load your data
df = pd.read_csv("../data/movielens/cleaned_ratings.csv")
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)

# Get unique IDs as arrays of strings
unique_user_ids = np.array(train["user_id"].astype(str).unique())
unique_movie_ids = np.array(train["movie_id"].astype(str).unique())

# Print debug info
print("unique_movie_ids type:", type(unique_movie_ids))
print("unique_movie_ids shape:", unique_movie_ids.shape)
print("First 10 movie ids:", unique_movie_ids[:10])
print("First element type:", type(unique_movie_ids[0]))

# Make tf.data.Dataset candidates
candidates = tf.data.Dataset.from_tensor_slices(unique_movie_ids).batch(128)

# Build datasets
train_ds = tf.data.Dataset.from_tensor_slices({
    "user_id": train["user_id"].astype(str).values,
    "movie_id": train["movie_id"].astype(str).values,
    "rating": train["rating"].astype(float).values,
})
cached_train = train_ds.shuffle(100_000).batch(8192).cache()

# Model definition
class MovieLensModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        embedding_dim = 32

        self.user_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dim)
        ])
        self.movie_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_movie_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_movie_ids) + 1, embedding_dim)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates
            )
        )

    def compute_loss(self, features, training=False):
        user_embeddings = self.user_model(features["user_id"])
        movie_embeddings = self.movie_model(features["movie_id"])
        return self.task(user_embeddings, movie_embeddings)

# Instantiate and train
model = MovieLensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(cached_train, epochs=1)

unique_movie_ids type: <class 'numpy.ndarray'>
unique_movie_ids shape: (1651,)
First 10 movie ids: ['381' '602' '431' '875' '182' '1074' '286' '496' '15' '184']
First element type: <class 'str'>


ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 