In [1]:
#Importing Google Drive in which datasets are stored
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Data Pre-processing

In [27]:
#Loading the data
ratings_data = pd.read_csv('/content/drive/MyDrive/Datasets/ratings.csv')
movies_data = pd.read_csv('/content/drive/MyDrive/Datasets/movies.csv')
ratings_data = pd.merge(ratings_data, movies_data[['movieId', 'genres']], on='movieId', how='left')

#Encoding the labels for user IDs, movie IDs and movie genres
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
genre_encoder = LabelEncoder()
ratings_data['userId'] = user_encoder.fit_transform(ratings_data['userId'])
ratings_data['movieId'] = movie_encoder.fit_transform(ratings_data['movieId'])
ratings_data['genres'] = genre_encoder.fit_transform(ratings_data['genres'])

In [28]:
#Separating first 25,00 rows of dataframe to train Matrix Factorisation Model
ratings_data2 = ratings_data.head(25000)

#Function to pre-process data for Matrix Factorisation Model
def load_data_100k(path='./', delimiter='\t'):

    #Creating dictionaries to map original user, movie IDs and genres to zero-based indices
    user_id_to_index = {user_id: i for i, user_id in enumerate(ratings_data2['userId'].unique())}
    movie_id_to_index = {movie_id: i for i, movie_id in enumerate(ratings_data2['movieId'].unique())}
    genre_id_to_index = {genre_id: i for i, genre_id in enumerate(ratings_data2['genres'].unique())}

    num_users = len(user_id_to_index)  #Calculating number of users
    num_movies = len(movie_id_to_index)  #Calculating number of movies
    num_genres = len(genre_id_to_index) #Calculating number of genres

    #Splitting the training and test data
    train_data, test_data = train_test_split(ratings_data2, test_size=0.2, random_state=24)

    #Initialization of train_ratings and test_ratings as three-dimensional arrays filled with zeros
    train_ratings = np.zeros((num_movies, num_users, num_genres), dtype='float32')
    test_ratings = np.zeros((num_movies, num_users, num_genres), dtype='float32')

    #Extraction of user, movie, genres and ratings data from each data point in training dataset and and this extracted rating is stored in train_ratings matrix at corresponding user-movie-genres position
    for index, row in train_data.iterrows():
        user_id = user_id_to_index[row['userId']]
        movie_id = movie_id_to_index[row['movieId']]
        genre_id = genre_id_to_index[row['genres']]
        rating = row['rating']

        train_ratings[movie_id, user_id] = rating

    #Extraction of user, movie, genres and ratings data from each data point in test dataset and and this extracted rating is stored in test_ratings matrix at corresponding user-movie-genres position
    for index, row in test_data.iterrows():
        user_id = user_id_to_index[row['userId']]
        movie_id = movie_id_to_index[row['movieId']]
        genre_id = genre_id_to_index[row['genres']]
        rating = row['rating']

        test_ratings[movie_id, user_id] = rating

    #Creating of binary masks for training and test datasets where 0 in this mask represents no rating and 1 in this mask reprsents that a rating exists
    train_masks = np.greater(train_ratings, 1e-12).astype('float32')
    test_masks = np.greater(test_ratings, 1e-12).astype('float32')

    #Displaying confirmating of datasets being loaded in matrices, number of users, number of movies, number of training ratings and number of test ratings
    print('Data matrix loaded')
    print('Number of users: {}'.format(num_users))
    print('Number of movies: {}'.format(num_movies))
    print('Number of training ratings:', train_data.shape[0])
    print('Number of test ratings:', test_data.shape[0])

    #Returning number of movies value, number of users value, train_ratings, test_ratings matrices, the two binary masks and training and test data created above
    return num_movies, num_users, num_genres, train_ratings, train_masks, test_ratings, test_masks, train_data, test_data

In [29]:
#Loading Data
path = '/content/drive/MyDrive/Datasets/MovieLens_100K/'
n_m, n_u, n_g, train_r, train_m, test_r, test_m, train_data2, test_data2 = load_data_100k(path=path, delimiter='\t')

Data matrix loaded
Number of users: 177
Number of movies: 5194
Number of training ratings: 20000
Number of test ratings: 5000


# Training the Neural Network and Matrix Factorisation Models

In [13]:
#Splitting the training and test data for neural network model
train_data, test_data = train_test_split(ratings_data, test_size=0.2, random_state=24)

#To ensure that rating is of type category
train_data['rating'] = train_data['rating'].astype('category')

#Creating the neural network collaborative filtering model
def create_model():
    #Defining the input layers
    user_input = Input(shape=(1,))
    movie_input = Input(shape=(1,))
    genre_input = Input(shape=(1,))

    #Defining embedding layers
    user_embedding = Embedding(input_dim=len(user_encoder.classes_), output_dim=20)(user_input)
    movie_embedding = Embedding(input_dim=len(movie_encoder.classes_), output_dim=20)(movie_input)
    genre_embedding = Embedding(input_dim=len(genre_encoder.classes_), output_dim=5)(genre_input)

    #Flattening the embedding layers
    user_flatten = Flatten()(user_embedding)
    movie_flatten = Flatten()(movie_embedding)
    genre_flatten = Flatten()(genre_embedding)

    #Concatenating the flattened embeddings
    concatenated = Concatenate()([user_flatten, movie_flatten, genre_flatten])

    #Defining the dense layers
    dense1 = Dense(128, activation='tanh')(concatenated)
    dense1 = BatchNormalization()(dense1)
    dense1 = Dropout(0.5)(dense1)

    dense2 = Dense(64, activation='sigmoid')(dense1)
    dense2 = BatchNormalization()(dense2)
    dense2 = Dropout(0.3)(dense2)

    dense3 = Dense(64, activation='relu')(dense2)
    dense3 = BatchNormalization()(dense3)
    dense3 = Dropout(0.3)(dense3)

    #Output layer for classification
    output = Dense(len(train_data['rating'].cat.categories), activation='softmax')(dense2)

    #Creating and compiling the model
    model = Model(inputs=[user_input, movie_input, genre_input], outputs=output)
    model.compile(optimizer=Adam(lr=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

#Instantiating the neural network model
model = create_model()



In [14]:
#Training the neural network model
model.fit([train_data['userId'], train_data['movieId'], train_data['genres']],
                         train_data['rating'].cat.codes,
                         epochs=50, batch_size=64, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7b24d036b460>

In [6]:
#Separating Validation Data from Training Dataset
train_r, val_r, train_m, val_m = train_test_split(train_r, train_m, test_size=0.2, random_state=42)

#Class for Matrix Factorisation Model
class MatrixFactorization:
    #Initialization of class MatrixFactorization with hyperparameters number of latent factors, learning rate and number of training epochs
    def __init__(self, n_factors=10, learning_rate=0.01, num_epochs=100):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def fit(self, train_r, train_m):
        #Declaring self.n_users, self.n_items and self.n_genres from the shape of train_r
        self.n_users, self.n_items, self.n_genres = train_r.shape

        #Initialization of three matrices with random values which represent latent factors for users, items and genres respectively
        self.P = np.random.rand(self.n_users, self.n_factors)
        self.Q = np.random.rand(self.n_items, self.n_factors)
        self.R = np.random.rand(self.n_genres, self.n_factors)

        for epoch in range(self.num_epochs):
            for i in range(self.n_users):
                for j in range(self.n_items):
                    for k in range(self.n_genres):
                        if train_m[i, j, k] == 1: #Checking if rating exists at [i,j,k] position by looking in binary mask matrix
                            eij = train_r[i, j, k] - np.sum(np.dot(self.P[i, :], self.Q[j, :]) * self.R[k, :]) #Calculating the prediction error
                            #Updating the latent factors
                            for l in range(self.n_factors):
                                self.P[i, l] += self.learning_rate * (2 * eij * self.Q[j, l] * self.R[k, l])
                                self.Q[j, l] += self.learning_rate * (2 * eij * self.P[i, l] * self.R[k, l])
                                self.R[k, l] += self.learning_rate * (2 * eij * self.P[i, l] * self.Q[j, l])

    #Function to predict the ratings
    def predict(self, data):
        user_indices, item_indices, genre_indices = data[:, 0].astype(int), data[:, 1].astype(int), data[:, 2].astype(int)
        predictions = np.zeros_like(data[:, 0], dtype=float)
        for idx in range(len(user_indices)):
            i, j, k = user_indices[idx], item_indices[idx], genre_indices[idx]
            predictions[idx] = np.sum(np.dot(self.P[i, :], self.Q[j, :]) * self.R[k, :])
        return predictions

In [7]:
#Creating the matrix factorization model and fitting it on the training data
model2 = MatrixFactorization(n_factors=10, learning_rate=0.01, num_epochs=20)
model2.fit(train_r, train_m)

# Validation

In [8]:
#Making predictions on the validation and test datasets
val_pred = model2.predict(val_r)
test_pred = model2.predict(test_r)

#Defining a threshold to classify as like or not like
threshold = 3.5

#Initializing variables to keep track of correct and total predictions
correct_predictions = 0
total_predictions = 0

#Initialization of empty val_errors array to store validation dataset errors for each user-item-genre pair
val_errors = []
for i in range(val_r.shape[0]):
    for j in range(val_r.shape[1]):
      for k in range(val_r.shape[2]):
        if val_m[i, j, k]: #Checking if rating exists at [i,j,k] by looking in binary mask matrix
            actual_rating = val_r[i, j, k]
            predicted_rating = val_pred[i]
            val_errors.append((actual_rating - predicted_rating) ** 2) #Calculating the squared error between actual and predicted rating
            #Classifying the predicted rating as like (1) or not like (0) based on the threshold
            predicted_label = 1 if predicted_rating >= threshold else 0

            #Comparing the actual label with the predicted label
            if actual_rating >= threshold and predicted_label == 1:
                correct_predictions += 1
            elif actual_rating < threshold and predicted_label == 0:
                correct_predictions += 1

            total_predictions += 1

#Calculating the validation accuracy
validation_accuracy = correct_predictions / total_predictions

val_rmse = np.sqrt(np.mean(val_errors)) #Calcualting RMSE for validation dataset

#Initialization of empty test_errors array to store validation dataset errors for each user-item pair
test_errors = []
for i in range(test_r.shape[0]):
    for j in range(test_r.shape[1]):
      for k in range(test_r.shape[2]):
        if test_m[i, j, k]: #Checking if rating exists at [i,j,k] by looking in binary mask matrix
              actual_rating = test_r[i, j, k]
              predicted_rating = test_pred[i]
              test_errors.append((actual_rating - predicted_rating) ** 2) #Calculating the squared error between actual and predicted rating

test_rmse = np.sqrt(np.mean(test_errors)) #Calcualting RMSE for test dataset

#Displaying validation accuracy, validation RMSE and test RMSE
print("Validation Accuracy:", validation_accuracy)
print("Validation RMSE:", val_rmse)
print("Test RMSE:", test_rmse)

Validation Accuracy: 0.5969917334825036
Validation RMSE: 1.6002473248403473
Test RMSE: 1.554342906088067
