Data citation : https://www.kaggle.com/code/mitishaagarwal/netflix-prize-data

Original citation : https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data (Netflix Prize data)

# Neural Collaborative Filtering Recommendation System (Model)

In [None]:
import pandas as pd
# Load the pre-split ratings tables (train first, then test) from the dataset folder.
train_data, test_data = map(
    pd.read_csv,
    ('/content/datasets/train.csv', '/content/datasets/test.csv'),
)

In [None]:
# Read the movie catalogue. The file has no header row; the four title_* columns
# absorb extra comma-separated fields (presumably titles that themselves
# contain commas -- verify against the raw file).
title_parts = ['title', 'title_2', 'title_3', 'title_4']
movie_titles = pd.read_csv(
    "/content/datasets/movie_titles.csv",
    sep=',',
    header=None,
    names=['movie_id', 'year_of_release'] + title_parts,
    encoding="ISO-8859-1",
)

# Re-join all four fragments into a single 'combined_title' column. The parser
# splits on a bare ',' and keeps any whitespace that followed it, so joining
# with ',' (not ', ') rebuilds the title exactly as it appears in the file.
movie_titles['combined_title'] = movie_titles[title_parts].apply(
    lambda parts: ','.join(parts.dropna().astype(str)), axis=1
)

# Keep only movie_id, year_of_release and the combined title.
movie_titles = movie_titles.drop(columns=title_parts)

# Notebook display: (rows, columns) of the cleaned catalogue
movie_titles.shape

Tokenization took: 8.02 ms
Type conversion took: 12.23 ms
Parser memory cleanup took: 0.01 ms


(17770, 3)

In [None]:
# Report (rows, columns) for each split, train first.
for label, frame in (
    ('Dimensions of train dataset : ', train_data),
    ('Dimensions of test dataset : ', test_data),
):
    print(label, frame.shape)

Dimensions of train dataset :  (1048575, 4)
Dimensions of test dataset :  (1048575, 4)


In [None]:
# Diagnostics: how sparse is the user x movie rating matrix over train + test?
combined_df = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Count distinct users and movies, and the total number of rating rows.
num_users_combined = len(combined_df["userID"].unique())
num_movies_combined = len(combined_df["movieID"].unique())
num_ratings = len(combined_df["rating"])

# A fully observed matrix would hold one cell per (user, movie) pair.
num_elements = num_users_combined * num_movies_combined
percentage_filled = (num_ratings / num_elements) * 100

print("the number of unique users: ", num_users_combined)
print("the number of unique movies: ", num_movies_combined)
print("number of ratings: ", num_ratings)
print("the full rating matrix should have: ", num_elements, "elements")
print(percentage_filled, "% of the matrix is filled")

the number of unique users:  122565
the number of unique movies:  15882
number of ratings:  2097150
the full rating matrix should have:  1946577330 elements
0.1077352524186645 % of the matrix is filled


To address the high sparsity of the rating matrix (it is only about 0.108% filled), we use matrix factorisation
- The purpose is to estimate (predict) the missing values in the sparse user-item matrix
- We do this by decomposing the large user-item interaction matrix, R, into 2 lower-dimensional matrices (called embeddings), one for users and one for items
- The product of these 2 factor matrices is our approximation of R, which we can then use in our recommender system

In [None]:
import numpy as np

# Size of the latent space shared by the user and item factors.
latent_features = 10

# Distinct users / movies present in the training split.
num_users_training = len(train_data["userID"].unique())
num_items_training = len(train_data["movieID"].unique())

# Dense user x movie rating table; pairs without a rating are filled with 0.
user_item_matrix = (
    train_data
    .pivot(index='userID', columns='movieID', values='rating')
    .fillna(0)
)
# print(user_item_matrix)

# Random starting factors in [0, 1): the user matrix is drawn first, then the
# item matrix, so the random stream is consumed in the same order as before.
user_matrix = np.random.rand(num_users_training, latent_features)
print(user_matrix)

item_matrix = np.random.rand(num_items_training, latent_features)

[[0.17207838 0.42534637 0.65839395 ... 0.71906646 0.740008   0.59971561]
 [0.30066043 0.18271293 0.2342668  ... 0.43681656 0.45394934 0.75707883]
 [0.10661033 0.41688034 0.35936381 ... 0.69983677 0.66405643 0.7809003 ]
 ...
 [0.28409663 0.60286212 0.43296182 ... 0.42267257 0.06179583 0.89620821]
 [0.7094807  0.7603623  0.13455213 ... 0.09779999 0.09691884 0.80434778]
 [0.7164048  0.00929157 0.46637734 ... 0.6412743  0.27761569 0.52840397]]


In [None]:
# Small sample (first 5 + last 5 training rows) for checking the mechanics by eye.
smaller_data = pd.concat([train_data.head(), train_data.tail()], ignore_index=True)
# print(smaller_data)

# Dense user x movie table for the sample; unrated pairs are filled with 0.
smaller_matrix = smaller_data.pivot(index='userID', columns='movieID', values='rating').fillna(0)
print(smaller_matrix)
print("\n")

# Size the factor matrices from the sample itself instead of hard-coding 6 and
# 10, so they always line up with the rows/columns of smaller_matrix.
num_sample_users, num_sample_items = smaller_matrix.shape

# embedding for users: one row of latent features per sampled user
smaller_user_matrix = np.random.rand(num_sample_users, latent_features)
print(smaller_user_matrix)
print("\n")

# embedding for items: one row of latent features per sampled movie
smaller_item_matrix = np.random.rand(num_sample_items, latent_features)
print(smaller_item_matrix)
print("\n")
# Transposed view (latent_features x items), the orientation used when the
# user factors are multiplied by the item factors.
print(smaller_item_matrix.T)

movieID  329    1798   2400   8651   10341  10774  12779  13793  14660  15381
userID                                                                       
377114     0.0    0.0    0.0    0.0    0.0    0.0    2.0    0.0    0.0    0.0
510180     0.0    5.0    0.0    2.0    4.0    3.0    0.0    0.0    2.0    0.0
680148     0.0    0.0    4.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0
1192119    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    4.0
1788180    5.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0
2582768    0.0    0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0


[[0.97046727 0.87969962 0.43768047 0.90677333 0.79388762 0.20208995
  0.56778444 0.79197428 0.26369141 0.81666452]
 [0.51748121 0.18256488 0.36460127 0.79201763 0.13713189 0.17339275
  0.97686148 0.61063822 0.40581756 0.6808908 ]
 [0.44054541 0.60638376 0.2746372  0.1393762  0.23793504 0.57387056
  0.57284267 0.02913906 0.91922212 0.61174861]
 [0.04246027 0.36219835 0.976

In [None]:
# Selecting with a list of positions keeps the result two-dimensional:
# a one-row DataFrame laid out horizontally like the full matrix.
firstrow = smaller_matrix.iloc[[0], :]
print(firstrow)
print("\n")

# Selecting with a scalar position returns the same row as a Series,
# printed vertically with one movieID per line.
firstrow = smaller_matrix.iloc[0, :]
print(firstrow)
print("\n")

# Sketch (disabled): fetch the ratings given by every user to each movie.
# for item in firstrow.index:
        # rating = smaller_matrix[item]
        # print(rating)

movieID  329    1798   2400   8651   10341  10774  12779  13793  14660  15381
userID                                                                       
377114     0.0    0.0    0.0    0.0    0.0    0.0    2.0    0.0    0.0    0.0


movieID
329      0.0
1798     0.0
2400     0.0
8651     0.0
10341    0.0
10774    0.0
12779    2.0
13793    0.0
14660    0.0
15381    0.0
Name: 377114, dtype: float64




Testing the nn.Embedding layer to understand how it works and how to incorporate it into collaborative filtering

In [None]:
import torch
from torch import nn

# Embedding-table sizes: distinct users / movies in the training split.
num_users_training = len(train_data["userID"].unique())
num_items_training = len(train_data["movieID"].unique())

# testing
# The tables below are still randomly initialised; trained latent features
# would only be available after matrix factorization.
sample_ids = [1, 2, 3]

# user embeddings: look up three rows of the user table
user_embedding = nn.Embedding(num_users_training, latent_features)
user_indices = torch.tensor(sample_ids, dtype=torch.long)
user_embeds = user_embedding(user_indices)
print(user_embeds)
print("\n")

# item embeddings: look up three rows of the item table
item_embedding = nn.Embedding(num_items_training, latent_features)
item_indices = torch.tensor(sample_ids, dtype=torch.long)
item_embeds = item_embedding(item_indices)
print(item_embeds)
print("\n")

# Every (user, item) combination of the sampled indices, one pair per row.
all_user_item_pairs = torch.cartesian_prod(user_indices, item_indices)
# print(all_user_item_pairs)

pair_users, pair_items = all_user_item_pairs.unbind(dim=1)
all_user_embeds = user_embedding(pair_users)
all_item_embeds = item_embedding(pair_items)

# Side-by-side user and item vectors for each pair.
user_item_embeds = torch.cat([all_user_embeds, all_item_embeds], dim=1)

print(user_item_embeds)

tensor([[-0.0364,  1.0254, -1.6430,  0.6978,  0.1644, -2.1614,  0.9825,  0.9546,
         -0.5878,  0.4509],
        [-0.4772,  0.2392,  1.3269, -0.8494,  1.0262, -0.2117, -0.6728,  0.6724,
          2.1308, -2.0784],
        [ 2.1504, -0.0107,  1.9343,  0.7831,  1.1436, -0.1862, -0.0571, -1.2564,
          0.6780,  1.6813]], grad_fn=<EmbeddingBackward0>)


tensor([[-0.2547, -0.5119, -1.0710,  0.2244, -1.2941, -0.2673,  0.2925,  0.6664,
          0.1585,  1.3610],
        [ 1.4909,  1.3809, -0.4581, -1.9810, -0.3197,  0.9922,  0.6793, -1.1011,
          0.6199,  1.5650],
        [-0.1942, -1.1281,  0.8591, -0.5086,  0.8517, -0.2058, -0.1262, -0.8411,
         -0.0047, -0.4126]], grad_fn=<EmbeddingBackward0>)


tensor([[-0.0364,  1.0254, -1.6430,  0.6978,  0.1644, -2.1614,  0.9825,  0.9546,
         -0.5878,  0.4509, -0.2547, -0.5119, -1.0710,  0.2244, -1.2941, -0.2673,
          0.2925,  0.6664,  0.1585,  1.3610],
        [-0.0364,  1.0254, -1.6430,  0.6978,  0.1644, -2.1614,  0.9825, 

In [None]:
# testing
# Score every sampled user against every sampled item: (users x k) times (k x items).
mf_vector = user_embeds.mm(item_embeds.T)
print(mf_vector)

tensor([[ 3.2096, -1.5073, -3.4416],
        [-5.1236, -2.9736,  2.6791],
        [-2.3265,  4.5993,  2.2374]], grad_fn=<MmBackward0>)


In [None]:
# testing
import torch
from torch import nn

# Lookup table with one latent_features-wide row per training user.
user_embedding = nn.Embedding(
    num_embeddings=num_users_training,
    embedding_dim=latent_features,
)
print(user_embedding)

Embedding(9174, 10)


In [None]:
import torch
from torch import nn
# testing (embedding tables for matrix factorization)
user_embedding_mf = nn.Embedding(num_users_training, latent_features)
item_embedding_mf = nn.Embedding(num_items_training, latent_features)


print(user_embedding_mf)

# An embedding layer must be called with a tensor of row indices; calling it
# with no argument raises a TypeError. Look up every user row here.
all_user_indices = torch.arange(num_users_training)
user_embeds = user_embedding_mf(all_user_indices)

# Access the raw weight table (randomly initialised; nothing is trained yet)
user_embeddings = user_embedding_mf.weight.data

# Print the embeddings
print("User Embeddings:")
print(user_embeddings)


# print(num_users_training)
# print(latent_features)
# mf_vector = nn.Dropout(config.dropout_rate_mf)(mf_vector)

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader # Dataset is an abstract class

# Wraps parallel user / item / rating sequences as a PyTorch Dataset.
class Loader(Dataset):
  """Dataset yielding one (user, item, rating) triple per index.

  Args:
    users: sequence of integer user indices.
    items: sequence of integer item indices, aligned with ``users``.
    ratings: sequence of ratings, aligned with ``users``.
  """

  def __init__(self, users, items, ratings):
    # Indices are stored as int64 and ratings as float32.
    self.users = torch.tensor(users, dtype=torch.long)
    self.items = torch.tensor(items, dtype=torch.long)
    self.ratings = torch.tensor(ratings, dtype=torch.float32)

  def __len__(self):
    """Return the number of stored samples."""
    return len(self.users)

  def __getitem__(self, idx):
    """Return the sample at ``idx`` as a dict with keys users/items/ratings."""
    # The entries must be comma-separated; without the commas the dict
    # literal is a SyntaxError and the class cannot even be defined.
    return {
        'users': self.users[idx],
        'items': self.items[idx],
        'ratings': self.ratings[idx],
    }


In [None]:
import torch
from torch import nn

# Embedding-table sizes: distinct users and movies present in the training split.
num_users_training = train_data["userID"].nunique(dropna=False)
num_items_training = train_data["movieID"].nunique(dropna=False)

class MatrixFactorization(nn.Module):
  """Matrix-factorization scorer built from two embedding tables.

  Each user and each item owns one row of ``latent_features`` values; the
  score of a (user, item) pair is the dot product of their rows.
  """

  def __init__(self, num_users_training, num_items_training, latent_features=10):
    super().__init__()

    # The user table is created before the item table, as in the original.
    self.user_embedding = nn.Embedding(
        num_embeddings=num_users_training, embedding_dim=latent_features
    )
    self.item_embedding = nn.Embedding(
        num_embeddings=num_items_training, embedding_dim=latent_features
    )

  def forward(self, user, item):
    """Return the dot product of the user and item rows, summed over dim 1."""
    user_vecs = self.user_embedding(user)
    item_vecs = self.item_embedding(item)
    return (user_vecs * item_vecs).sum(dim=1)


    # leaky relu activation function
    # self.relu = nn.LeakyReLU()

    # container to store all the layers of the neural network
    # self.layer_container = nn.ModuleList()

    # initializing the weights of the user and item embedding layers with values sampled from a uniform distribution
    # on [0, 0.05]; because the distribution is uniform, every value in that range is equally likely
    # these starting weights are arbitrary and will be adjusted when we train our model
    # (the in-place initializer is `uniform_`, with a trailing underscore)
    # self.user_embedding.weight.data.uniform_(0, 0.05)
    # self.item_embedding.weight.data.uniform_(0, 0.05)

In [None]:
    # when training, call this function (it specifies how input data is processed through the layers of the network to produce an output)
    def forward(self, user_indices, item_indices):
      """Score (user, item) pairs from their concatenated embeddings.

      Looks up both embeddings, joins them along dim 1, then applies
      ``self.fc1``, the activation stored in ``self.relu`` and ``self.fc2``
      in that order. Returns the output of ``self.fc2``.
      """
      # Joined user/item vectors, one row per pair.
      features = torch.cat(
          [self.user_embedding(user_indices), self.item_embedding(item_indices)],
          dim=1,
      )

      # First layer followed by the activation, then the final layer.
      hidden = self.relu(self.fc1(features))
      return self.fc2(hidden)

In [None]:
import torch
from torch import nn

# Embedding-table sizes: distinct users / movies in the training split.
num_users_training = len(train_data["userID"].unique())
num_items_training = len(train_data["movieID"].unique())


# testing
# The tables are still randomly initialised; trained latent features would
# only be available after matrix factorization.
probe_ids = [1, 2, 3]

# user embeddings for the probe indices
user_embedding = nn.Embedding(num_users_training, latent_features)
user_indices = torch.tensor(probe_ids, dtype=torch.long)
user_embeds = user_embedding(user_indices)
print(user_embeds)
print("\n")

# item embeddings for the probe indices
item_embedding = nn.Embedding(num_items_training, latent_features)
item_indices = torch.tensor(probe_ids, dtype=torch.long)
item_embeds = item_embedding(item_indices)
print(item_embeds)
print("\n")

# All (user, item) combinations of the probe indices, one pair per row.
all_user_item_pairs = torch.cartesian_prod(user_indices, item_indices)
# print(all_user_item_pairs)

pair_users, pair_items = all_user_item_pairs.unbind(dim=1)
all_user_embeds = user_embedding(pair_users)
all_item_embeds = item_embedding(pair_items)

# Side-by-side user and item vectors for each pair.
user_item_embeds = torch.cat([all_user_embeds, all_item_embeds], dim=1)

# print(user_item_embeds)
# Element-wise product: row i pairs user i with item i, keeping one value per latent feature.
mf_vector = user_embeds * item_embeds
print(mf_vector)

# Matrix product: entry (i, j) is the dot product of user i with item j.
mf_vector = user_embeds.mm(item_embeds.T)
print(mf_vector)



In [None]:
import torch
from torch import nn
# testing (embedding lookup for one user/item pair, followed by a linear layer)
user_embedding_mf = nn.Embedding(num_users_training, latent_features)
item_embedding_mf = nn.Embedding(num_items_training, latent_features)
print(latent_features)
print(user_embedding_mf)
print(item_embedding_mf)
# Access the raw weight table (randomly initialised; nothing is trained yet)
user_embeddings = user_embedding_mf.weight.data

# Print the embeddings
print("User Embeddings:")
print(user_embeddings)

# Forward with a single user index. The last valid row is derived from the
# table size instead of being hard-coded, so the lookup cannot go out of range
# when the data changes, and the label reports the index actually used.
user_index = torch.tensor([num_users_training - 1])
user_embeds = user_embedding_mf(user_index)
print(f"User Embedding for index {user_index.item()}:")
print(user_embeds)

# Same for a single item index (last valid row of the item table).
item_index = torch.tensor([num_items_training - 1])
item_embeds = item_embedding_mf(item_index)
print(f"Item Embedding for index {item_index.item()}:")
print(item_embeds)

# Concatenate user and item embeddings
combined_embeds = torch.cat([user_embeds, item_embeds], dim=1)
print("Combined Embeddings:")
print(combined_embeds)

# Linear transformation from the concatenated vector (2 * latent_features) to one value
linear_layer = nn.Linear(latent_features * 2, 1)
print(linear_layer)

# Access layer parameters
print("Weight matrix shape:", linear_layer.weight.shape)
print("Bias vector shape:", linear_layer.bias.shape)

# Apply linear layer to the combined embeddings
output = linear_layer(combined_embeds)
print("Output after Linear Transformation:")
print(output)


10
Embedding(9174, 10)
Embedding(4633, 10)
User Embeddings:
tensor([[-0.7093, -1.4774,  0.7135,  ..., -1.1078,  1.1718, -0.4468],
        [-0.2315, -1.8767, -0.3631,  ..., -0.9548,  0.6520,  1.3159],
        [ 0.2791, -0.2546,  1.4355,  ..., -1.1245, -1.1717,  0.7856],
        ...,
        [-0.0841, -0.6173, -0.5962,  ..., -1.7418, -0.4762, -0.1816],
        [-2.9315,  0.2006, -0.6173,  ..., -0.4202, -0.7122, -0.3279],
        [ 0.2435,  0.5883, -0.7190,  ..., -1.2386, -0.4424,  0.1986]])
User Embedding for index 377114:
tensor([[ 0.2435,  0.5883, -0.7190, -0.7269, -0.3127, -0.2542, -0.9676, -1.2386,
         -0.4424,  0.1986]], grad_fn=<EmbeddingBackward0>)
Item Embedding for index 9173:
tensor([[ 0.1444,  0.8047,  0.0154,  1.3510,  1.8176,  0.7132, -0.8718,  1.0490,
         -1.3116,  0.3669]], grad_fn=<EmbeddingBackward0>)
Combined Embeddings:
tensor([[ 0.2435,  0.5883, -0.7190, -0.7269, -0.3127, -0.2542, -0.9676, -1.2386,
         -0.4424,  0.1986,  0.1444,  0.8047,  0.0154,  1.351

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn import model_selection, preprocessing
# mean_squared_error lives in sklearn.metrics, not in the top-level sklearn package
from sklearn.metrics import mean_squared_error
from torch.optim.lr_scheduler import StepLR
# Dataset provides a standard interface for accessing and working with data in PyTorch
# DataLoader depends on Dataset instances for its operation.
from torch.utils.data import Dataset, DataLoader


# Data Preprocessing
# One encoder per id column: each maps the raw ids to contiguous integers
# 0..n-1, which is what nn.Embedding expects as row indices.
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()

# `df` was never defined. Build it from the training ratings, renaming the id
# columns (userID / movieID) to the userId / movieId names the rest of this
# cell uses. rename() returns a new frame, so train_data itself is untouched.
df = train_data.rename(columns={'userID': 'userId', 'movieID': 'movieId'})
df['userId'] = lbl_user.fit_transform(df['userId'].values)
df['movieId'] = lbl_movie.fit_transform(df['movieId'].values)

# Dataset Definition
# Exposes aligned user / movie / rating sequences through the Dataset interface.
class MovieDataset(Dataset):
    """Dataset of (user, movie, rating) samples stored as tensors."""

    def __init__(self, users, movies, ratings):
        # Ids become int64 tensors; ratings become float32.
        self.users, self.movies = (
            torch.tensor(column, dtype=torch.long) for column in (users, movies)
        )
        self.ratings = torch.tensor(ratings, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the user tensor."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict keyed users/movies/ratings."""
        sample = {}
        sample['users'] = self.users[idx]
        sample['movies'] = self.movies[idx]
        sample['ratings'] = self.ratings[idx]
        return sample

# DataLoader Setup
# Hold out 10% of the rows for validation, stratified on the rating value so
# both parts keep the same rating distribution; the seed fixes the split.
split_options = dict(test_size=0.1, random_state=42, stratify=df['rating'].values)
df_train, df_valid = model_selection.train_test_split(df, **split_options)

# Wrap each part as a MovieDataset (training part first).
train_dataset, valid_dataset = (
    MovieDataset(
        users=part['userId'].values,
        movies=part['movieId'].values,
        ratings=part['rating'].values,
    )
    for part in (df_train, df_valid)
)

# Both loaders use the same batching settings.
loader_options = dict(batch_size=4, shuffle=True, num_workers=2)
train_loader = DataLoader(dataset=train_dataset, **loader_options)
validation_loader = DataLoader(dataset=valid_dataset, **loader_options)

# Model Definition
# inherits from the nn.Module class
class RecSysModel(nn.Module):
    """Rating predictor: user and movie embeddings feed a single linear layer.

    Args:
        n_users: number of distinct (label-encoded) users.
        n_movies: number of distinct (label-encoded) movies.
        embedding_dim: width of each embedding vector. Defaults to 32, the
            value that was previously hard-coded.
    """

    def __init__(self, n_users, n_movies, embedding_dim=32):
        super().__init__()
        self.user_embed = nn.Embedding(n_users, embedding_dim)
        self.movie_embed = nn.Embedding(n_movies, embedding_dim)
        # The linear layer takes the concatenated user and movie embeddings
        # (2 * embedding_dim inputs) and produces a single rating prediction.
        self.out = nn.Linear(2 * embedding_dim, 1)

    def forward(self, users, movies):
        """Return one predicted rating per (user, movie) pair, shape (batch, 1)."""
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        output = torch.cat([user_embeds, movie_embeds], dim=1)
        return self.out(output)

# Model Initialization
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The fitted encoders expose their distinct labels as `classes_` (one trailing
# underscore); `classes__` does not exist and raised an AttributeError.
model = RecSysModel(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_),
).to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters())

# Scheduler
# Each call to scheduler.step() counts one epoch; every step_size (3) calls the
# learning rate is multiplied by gamma (0.7).
scheduler = StepLR(optimizer, step_size=3, gamma=0.7)

# Loss Function that will be used is a mean-squared error
loss_func = nn.MSELoss()

# Training Loop
epochs = 1
total_loss = 0.0  # sum of per-batch mean losses since the last report
plot_steps = 5000  # report the running average loss every `plot_steps` samples
print_steps = 5000
step_cnt = 0  # number of samples processed so far
batches_since_report = 0  # number of batches accumulated in total_loss
next_report_at = plot_steps  # sample count at which the next report is due
all_losses_list = []

# Read the device from the model so every batch can be moved next to its
# weights (needed when the model lives on the GPU).
device = next(model.parameters()).device

# sets the model to training mode
model.train()

for epoch in range(epochs):

    # The loop variable is called `batch` so that it does not overwrite the
    # `train_data` DataFrame loaded at the top of the notebook.
    for batch in train_loader:
        users = batch["users"].to(device)
        movies = batch["movies"].to(device)
        # view(-1, 1) matches the model's (batch, 1) output for any batch size,
        # including a final batch that is shorter than the others.
        rating = batch["ratings"].to(device=device, dtype=torch.float32).view(-1, 1)

        output = model(users, movies)
        loss = loss_func(output, rating)  # mean squared error over this batch
        total_loss += loss.item()
        batches_since_report += 1

        # backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt += len(users)

        # Report the average of the per-batch losses gathered since the last
        # report. A threshold (rather than step_cnt % plot_steps == 0) still
        # fires when the batch size does not divide plot_steps.
        if step_cnt >= next_report_at:
            avg_loss = total_loss / batches_since_report
            print(f"epoch {epoch} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0.0
            batches_since_report = 0
            next_report_at += plot_steps

    # Advance the learning-rate schedule once per epoch; without this call the
    # StepLR scheduler never changes the learning rate.
    scheduler.step()

model_output_list = []  # predicted rating for every evaluated sample
target_rating_list = []  # actual rating for every evaluated sample
preview_batches = 5  # only the first few batches are printed, to keep the output short

# Read the device from the model so inputs can be moved next to its weights.
device = next(model.parameters()).device

model.eval()  # sets model to evaluation mode

with torch.no_grad():
    # Evaluate on the held-out validation split, not on the training data the
    # model has just been fitted to.
    for batch_idx, batched_data in enumerate(validation_loader):
        model_output = model(
            batched_data['users'].to(device),
            batched_data['movies'].to(device),
        )
        target_rating = batched_data['ratings']

        # Keep one entry per sample; averaging within each batch first would
        # hide part of the error and understate the RMSE.
        model_output_list.extend(model_output.view(-1).cpu().tolist())
        target_rating_list.extend(target_rating.view(-1).tolist())

        if batch_idx < preview_batches:
            print(f"model_output: {model_output}, target_rating: {target_rating}")

# RMSE is the square root of the MSE. Taking the root here avoids the
# `squared=False` argument, which recent scikit-learn releases no longer accept.
rms = mean_squared_error(target_rating_list, model_output_list) ** 0.5
print(f"rms: {rms}")