In [1]:
#Importing Google Drive in which datasets are stored
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from scipy.sparse.linalg import svds
import h5py
from scipy.sparse import csc_matrix
from time import time
import copy

# Data Pre-processing

In [3]:
#Function to import MovieLens_100K dataset
def load_data_100k(path='./', delimiter='\t'):

    #Loading ratings data
    ratings_data = pd.read_csv('/content/drive/MyDrive/Datasets/ratings.csv')

    #Creating dictionaries to map original user and movie IDs to zero-based indices
    user_id_to_index = {user_id: i for i, user_id in enumerate(ratings_data['userId'].unique())}
    movie_id_to_index = {movie_id: i for i, movie_id in enumerate(ratings_data['movieId'].unique())}

    num_users = len(user_id_to_index)  #Calculating number of users
    num_movies = len(movie_id_to_index)  #Calculating number of movies

    #Splitting the data into training and test sets
    train_data, test_data = train_test_split(ratings_data, test_size=0.2, random_state=42)

    #Initialization of train_ratings and test_ratings as two-dimensional arrays filled with zeros
    train_ratings = np.zeros((num_movies, num_users), dtype='float32')
    test_ratings = np.zeros((num_movies, num_users), dtype='float32')

    #Extraction of user, movie and ratings data from each data point in training dataset and and this extracted rating is stored in train_ratings matrix at corresponding user-movie position
    for index, row in train_data.iterrows():
        user_id = user_id_to_index[row['userId']]
        movie_id = movie_id_to_index[row['movieId']]
        rating = row['rating']

        train_ratings[movie_id, user_id] = rating

    #Extraction of user, movie and ratings data from each data point in test dataset and and this extracted rating is stored in test_ratings matrix at corresponding user-movie position
    for index, row in test_data.iterrows():
        user_id = user_id_to_index[row['userId']]
        movie_id = movie_id_to_index[row['movieId']]
        rating = row['rating']

        test_ratings[movie_id, user_id] = rating

    #Creating of binary masks for training and test datasets where 0 in this mask represents no rating and 1 in this mask reprsents that a rating exists
    train_masks = np.greater(train_ratings, 1e-12).astype('float32')
    test_masks = np.greater(test_ratings, 1e-12).astype('float32')

    #Displaying confirmating of datasets being loaded in matrices, number of users, number of movies, number of training ratings and number of test ratings
    print('Data matrix loaded')
    print('Number of users: {}'.format(num_users))
    print('Number of movies: {}'.format(num_movies))
    print('Number of training ratings:', train_data.shape[0])
    print('Number of test ratings:', test_data.shape[0])

    #Returning number of movies value, number of users value, train_ratings, test_ratings matrices and the two binary masks created above
    return num_movies, num_users, train_ratings, train_masks, test_ratings, test_masks

In [4]:
#Loading Data
path = '/content/drive/MyDrive/Datasets/MovieLens_100K/'
n_m, n_u, train_r, train_m, test_r, test_m = load_data_100k(path=path, delimiter='\t')

Data matrix loaded
Number of users: 610
Number of movies: 9724
Number of training ratings: 80668
Number of test ratings: 20168


# Training the Basic Model

In [5]:
#Separating Validation Data from Training Dataset
train_r, val_r, train_m, val_m = train_test_split(train_r, train_m, test_size=0.2, random_state=42)

class MatrixFactorization:
    #Initialization of class MatrixFactorization with hyperparameters number of latent factors, learning rate and number of training epochs
    def __init__(self, n_factors=10, learning_rate=0.01, num_epochs=100):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def fit(self, train_r, train_m):
        #Declaring self.n_users and self.n_items from the shape of train_r
        self.n_users, self.n_items = train_r.shape
        #Initialization of two matrices with random values which represent latent factors for users and items respectively
        self.P = np.random.rand(self.n_users, self.n_factors)
        self.Q = np.random.rand(self.n_items, self.n_factors)

        for epoch in range(self.num_epochs):
            for i in range(self.n_users):
                for j in range(self.n_items):
                    if train_m[i, j] == 1:  #Checking if rating exists at ith row and jth column by looking in binary mask matrix
                        eij = train_r[i, j] - np.dot(self.P[i, :], self.Q[j, :]) #Calculating the prediction error
                        #Updating the latent factors
                        for k in range(self.n_factors):
                            self.P[i, k] += self.learning_rate * (2 * eij * self.Q[j, k])
                            self.Q[j, k] += self.learning_rate * (2 * eij * self.P[i, k])

    #Function to predict the ratings
    def predict(self, data):
        user_indices, item_indices = data[:, 0].astype(int), data[:, 1].astype(int)
        predictions = np.dot(self.P, self.Q.T)
        return predictions[user_indices, item_indices]

In [6]:
#Creating the matrix factorization model and fitting it on the training data
model = MatrixFactorization(n_factors=10, learning_rate=0.01, num_epochs=20)
model.fit(train_r, train_m)

# Validation

In [7]:
#Making predictions on the validation and test datasets
val_pred = model.predict(val_r)
test_pred = model.predict(test_r)

#Defining a threshold to classify as like or not like
threshold = 3.5

#Initializing variables to keep track of correct and total predictions
correct_predictions = 0
total_predictions = 0

#Initialization of empty val_errors array to store validation dataset errors for each user-item pair
val_errors = []
for i in range(val_r.shape[0]):
    for j in range(val_r.shape[1]):
        if val_m[i, j]: #Checking if rating exists at ith row and jth column by looking in binary mask matrix
            actual_rating = val_r[i, j]
            predicted_rating = val_pred[i]
            val_errors.append((actual_rating - predicted_rating) ** 2) #Calculating the squared error between actual and predicted rating
            #Classifying the predicted rating as like (1) or not like (0) based on the threshold
            predicted_label = 1 if predicted_rating >= threshold else 0

            #Comparing the actual label with the predicted label
            if actual_rating >= threshold and predicted_label == 1:
                correct_predictions += 1
            elif actual_rating < threshold and predicted_label == 0:
                correct_predictions += 1

            total_predictions += 1

#Calculating the validation accuracy
validation_accuracy = correct_predictions / total_predictions

val_rmse = np.sqrt(np.mean(val_errors)) #Calcualting RMSE for validation dataset

#Initialization of empty test_errors array to store validation dataset errors for each user-item pair
test_errors = []
for i in range(test_r.shape[0]):
    for j in range(test_r.shape[1]):
        if test_m[i, j]: #Checking if rating exists at ith row and jth column by looking in binary mask matrix
            actual_rating = test_r[i, j]
            predicted_rating = test_pred[i]
            test_errors.append((actual_rating - predicted_rating) ** 2) #Calculating the squared error between actual and predicted rating

test_rmse = np.sqrt(np.mean(test_errors)) #Calcualting RMSE for test dataset

#Displaying validation accuracy, validation RMSE and test RMSE
print("Validation Accuracy:", validation_accuracy)
print("Validation RMSE:", val_rmse)
print("Test RMSE:", test_rmse)

Validation Accuracy: 0.5713220212567592
Validation RMSE: 1.6038169957528507
Test RMSE: 1.6039513725621861
