<a href="https://colab.research.google.com/github/yckamra/MovieLens25m-Recommender/blob/main/Benchmarking_vFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports, using IPython shell escapes.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones that produced the saved outputs — consider pinning them.
# NOTE(review): the final message is printed unconditionally; it does not check
# whether the pip commands actually succeeded.
print("Installing project dependencies...")
!pip install numpy pandas scipy scikit-learn # Basic libraries we want
!pip install implicit # Installs implicit library for matrix factorization
#!pip install torch
print("Dependencies installed successfully.")

Installing project dependencies...
Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Dependencies installed successfully.


In [None]:
import math
from google.colab import auth
from google.cloud import storage
import zipfile
import os
import pickle
import pandas as pd
import numpy as np
import implicit
import json
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, csr_matrix # Compressed Sparse Row
import random

In [None]:
def precision_k(predictionList, actualList, k):
  """Precision@k for one user.

  predictionList -- ranked movieIds (best first); repeated IDs keep their first position
  actualList     -- movieIds considered relevant for the user
  k              -- cut-off applied to the de-duplicated predictions

  Returns (number of relevant movies among the top-k predictions) / k,
  or 0 when k <= 0. The denominator is always k, even if fewer than k
  unique predictions are available.
  """
  # dict.fromkeys drops repeated IDs while keeping first-seen order
  top_predictions = set(list(dict.fromkeys(predictionList))[:k])
  relevant_movies = dict.fromkeys(actualList)

  # Guard against dividing by zero (or a meaningless negative cut-off)
  if k <= 0:
    return 0

  hits = sum(1 for movie in relevant_movies if movie in top_predictions)
  return hits / k

In [None]:
def recall_k(predictionList, actualList, k):
  """Recall@k for one user.

  predictionList -- ranked movieIds (best first); repeated IDs keep their first position
  actualList     -- movieIds considered relevant for the user
  k              -- cut-off applied to the de-duplicated predictions

  Returns (number of relevant movies found in the top-k predictions) /
  (number of unique relevant movies), or 0 when there are no relevant movies.
  """
  # dict.fromkeys drops repeated IDs while keeping first-seen order
  top_predictions = set(list(dict.fromkeys(predictionList))[:k])
  relevant_movies = list(dict.fromkeys(actualList))

  # Nothing to recall -> avoid dividing by zero
  if not relevant_movies:
    return 0

  found = [movie for movie in relevant_movies if movie in top_predictions]
  return len(found) / len(relevant_movies)

In [None]:
def f1_k(predictionList, actualList, k):
  """F1@k: harmonic mean of precision_k and recall_k for one user.

  Returns 0.0 when both precision and recall are zero (the harmonic mean
  would otherwise divide by zero).
  """
  precision = precision_k(predictionList, actualList, k)
  recall = recall_k(predictionList, actualList, k)

  # Both zero -> harmonic mean is undefined; report 0.0
  if not (precision or recall):
    return 0.0

  return 2 * ((precision * recall) / (precision + recall))

In [None]:
def DCG_k(predictionList, actualList, k):
  """Discounted Cumulative Gain at k with binary relevance.

  predictionList -- ranked movieIds (best first); repeated IDs keep their first position
  actualList     -- movieIds considered relevant for the user
  k              -- cut-off; only the first k unique predictions are scored

  A prediction at 1-based rank i contributes 1 / log2(i + 1) when it is in
  actualList and 0 otherwise. Returns 0 when k <= 0 or there are no
  predictions. Graded relevance (e.g. the rating itself) could replace the
  binary gain for finer-grained rankings.
  """
  # A non-positive cut-off scores nothing
  if k <= 0:
    return 0

  # dict.fromkeys drops repeated IDs while keeping first-seen order
  ranked_predictions = list(dict.fromkeys(predictionList))[:k]
  # Set membership is O(1); scanning the actual list once per rank was O(len(actualList))
  relevant_movies = set(actualList)

  sum_of_DCG = 0
  for rank, movie in enumerate(ranked_predictions, start=1):
    relevance = 1 if movie in relevant_movies else 0
    sum_of_DCG += relevance / math.log2(rank + 1)

  return sum_of_DCG

In [None]:
def IDCG_k(predictionList, actualList, k):
  """Ideal DCG at k with binary relevance.

  The ideal ranking places every relevant movie first, so the result is the
  sum of 1 / log2(i + 1) for i = 1 .. min(k, number of unique relevant movies).
  predictionList is accepted for signature symmetry with DCG_k but is not used.
  """
  unique_relevant = len(dict.fromkeys(actualList))
  ideal_depth = min(k, unique_relevant) # cannot place more hits than relevant movies exist

  ideal_total = 0
  # log_argument runs over i + 1 for ranks i = 1 .. ideal_depth
  for log_argument in range(2, ideal_depth + 2):
    ideal_total += 1 / math.log2(log_argument)

  return ideal_total

In [None]:
def NDCG_k(predictionList, actualList, k):
  """Normalized Discounted Cumulative Gain at k: DCG_k divided by IDCG_k.

  Returns 0 when the ideal DCG is 0 (no relevant movies or a non-positive k).
  """
  achieved_gain = DCG_k(predictionList, actualList, k)
  ideal_gain = IDCG_k(predictionList, actualList, k)

  # An ideal gain of 0 would make the ratio undefined
  return achieved_gain / ideal_gain if ideal_gain != 0 else 0

In [None]:
def contains_relevant_item(predictionList, actualList, k):
  """Return True when at least one of the top-k unique predictions is in actualList.

  predictionList -- ranked movieIds (best first); repeated IDs keep their first position
  actualList     -- movieIds considered relevant for the user
  k              -- cut-off applied to the de-duplicated predictions
  """
  # dict.fromkeys drops repeated IDs while keeping first-seen order
  shortlist = frozenset(list(dict.fromkeys(predictionList))[:k])
  return any(movie in shortlist for movie in dict.fromkeys(actualList))

In [None]:
def mean_precision_k(allUsers, allPredictions, k):
  """Average precision_k over every user in allUsers.

  allUsers       -- dict of userId -> user object exposing get_sorted_movies(threshold)
  allPredictions -- dict of userId -> ranked list of predicted movieIds
  k              -- cut-off passed to precision_k

  A user's relevant movies are those returned by get_sorted_movies(4.0).
  Returns 0 when allUsers is empty.
  """
  user_count = 0
  precision_total = 0

  for user_id in allUsers: # the same userId keys both dictionaries
    recommended = allPredictions[user_id]
    liked_movies = allUsers[user_id].get_sorted_movies(4.0)
    precision_total += precision_k(recommended, liked_movies, k)
    user_count += 1

  return precision_total / user_count if user_count else 0

In [None]:
def mean_recall_k(allUsers, allPredictions, k):
  """Average recall_k over every user in allUsers.

  allUsers       -- dict of userId -> user object exposing get_sorted_movies(threshold)
  allPredictions -- dict of userId -> ranked list of predicted movieIds
  k              -- cut-off passed to recall_k

  A user's relevant movies are those returned by get_sorted_movies(4.0).
  Returns 0 when allUsers is empty.
  """
  user_count = 0
  recall_total = 0

  for user_id in allUsers: # the same userId keys both dictionaries
    recommended = allPredictions[user_id]
    liked_movies = allUsers[user_id].get_sorted_movies(4.0)
    recall_total += recall_k(recommended, liked_movies, k)
    user_count += 1

  return recall_total / user_count if user_count else 0

In [None]:
def mean_f1_k(allUsers, allPredictions, k):
  """Average f1_k over every user in allUsers.

  allUsers       -- dict of userId -> user object exposing get_sorted_movies(threshold)
  allPredictions -- dict of userId -> ranked list of predicted movieIds
  k              -- cut-off passed to f1_k

  A user's relevant movies are those returned by get_sorted_movies(4.0).
  Returns 0 when allUsers is empty.
  """
  user_count = 0
  f1_total = 0

  for user_id in allUsers: # the same userId keys both dictionaries
    recommended = allPredictions[user_id]
    liked_movies = allUsers[user_id].get_sorted_movies(4.0)
    f1_total += f1_k(recommended, liked_movies, k)
    user_count += 1

  return f1_total / user_count if user_count else 0

In [None]:
def mean_NDCG_k(allUsers, allPredictions, k):
  """Average NDCG_k over every user in allUsers.

  allUsers       -- dict of userId -> user object exposing get_sorted_movies(threshold)
  allPredictions -- dict of userId -> ranked list of predicted movieIds
  k              -- cut-off passed to NDCG_k

  A user's relevant movies are those returned by get_sorted_movies(4.0).
  Returns 0 when allUsers is empty.
  """
  user_count = 0
  ndcg_total = 0

  for user_id in allUsers: # the same userId keys both dictionaries
    recommended = allPredictions[user_id]
    liked_movies = allUsers[user_id].get_sorted_movies(4.0)
    ndcg_total += NDCG_k(recommended, liked_movies, k)
    user_count += 1

  return ndcg_total / user_count if user_count else 0

In [None]:
def total_hit_rate(allUsers, allPredictions, k):
  """Count the users whose top-k predictions contain at least one relevant movie.

  allUsers       -- dict of userId -> user object exposing get_sorted_movies(threshold)
  allPredictions -- dict of userId -> ranked list of predicted movieIds
  k              -- cut-off passed to contains_relevant_item

  Returns (number_of_users_with_a_hit, number_of_users). Despite the name,
  the first element is a raw count, not a ratio; divide it by the second
  element to obtain the hit rate.
  """
  per_user_hit = [
    contains_relevant_item(allPredictions[user_id], allUsers[user_id].get_sorted_movies(4.0), k)
    for user_id in allUsers
  ]
  # Summing booleans counts the True entries
  return sum(per_user_hit), len(per_user_hit)

In [None]:
def evaluate_model(allUsers, allPredictions, k):
  """Compute every ranking metric for one set of predictions.

  allUsers       -- dict of userId -> user object exposing get_sorted_movies(threshold)
  allPredictions -- dict of userId -> ranked list of predicted movieIds
  k              -- cut-off used by every metric

  Returns (precision, recall, f1, NDCG, hit_count, num_users), where the
  first four values are per-user means and hit_count is the number of users
  with at least one relevant movie in their top k.
  """
  # Evaluated in the same order as the returned tuple
  averaged_metrics = [
    metric(allUsers, allPredictions, k)
    for metric in (mean_precision_k, mean_recall_k, mean_f1_k, mean_NDCG_k)
  ]
  hit_rate, num_users = total_hit_rate(allUsers, allPredictions, k)

  return (*averaged_metrics, hit_rate, num_users)

In [None]:
# Authenticate the Colab user (presumably required for the bucket copies below — confirm).
auth.authenticate_user()

# Select the Google Cloud project used by the following gsutil commands.
!gcloud config set project movielens-recommender-451017

# Copy the pickled movie and user dictionaries plus the collaborative-filtering
# artifacts from the movielens-data bucket into /content/.
!gsutil cp gs://movielens-data/allMovies_data.pkl /content/
!gsutil cp gs://movielens-data/allUsers_data.pkl /content/
!gsutil cp -r gs://movielens-data/collaborative_filtering /content/

Updated property [core/project].
Copying gs://movielens-data/allMovies_data.pkl...
- [1 files][ 19.0 MiB/ 19.0 MiB]                                                
Operation completed over 1 objects/19.0 MiB.                                     
Copying gs://movielens-data/allUsers_data.pkl...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

- [1 files][995.9 MiB/995.9 MiB]                                                
Operation completed over 1 objects/995.9 MiB.                                    
Copying gs://movielens-data/collaborative_filtering/matrix_factorization_model.pkl...
Copying gs://movielens-data/collaborative_filtering/movie_id_map.pkl...
Copying gs://movielens-data/collaborative_filtering/test_matrix_csr.pkl...
\ [3 files][129.1 MiB/129.1 MiB]                                

In [None]:
# List /content/ with human-readable sizes to confirm the downloads are present.
!ls -lh /content/

total 1015M
-rw-r--r-- 1 root root  19M Jun 29 13:43 allMovies_data.pkl
-rw-r--r-- 1 root root 996M Jun 29 13:44 allUsers_data.pkl
drwxr-xr-x 2 root root 4.0K Jun 29 13:44 collaborative_filtering
drwxr-xr-x 1 root root 4.0K Jun 26 13:35 sample_data


In [None]:
class Movie:
  """Container for one movie's metadata and aggregated rating statistics.

  Attributes:
      ID                -- movie identifier
      title             -- movie title
      genre_list        -- list of genres
      tag_list          -- list of user tags (may contain repeats)
      number_of_ratings -- total number of times this movie has been rated
      sum_of_ratings    -- summation of all ratings from all users for this movie
      year              -- release year
      imdbId            -- IMDb identifier
      tmdbId            -- TMDB identifier
      actors_list       -- list of actors
      tmdb_tag_list     -- list of TMDB tags
  """

  def __init__(self):
    self.ID = None
    self.title = None
    self.genre_list = []
    self.tag_list = []
    self.number_of_ratings = 0
    self.sum_of_ratings = 0
    self.year = None
    self.imdbId = None
    self.tmdbId = None
    self.actors_list = []
    self.tmdb_tag_list = []

  def set_tmdb_tag_list(self, tags):
    self.tmdb_tag_list = tags

  def add_to_tmdb_tag_list(self, tags):
    # Concatenation builds a new list, so the caller's `tags` list is never mutated
    self.tmdb_tag_list = self.tmdb_tag_list + tags

  def get_tmdb_tag_list(self):
    return self.tmdb_tag_list

  def set_actors_list(self, actors_list):
    self.actors_list = actors_list

  def add_actor_to_actors_list(self, actor):
    self.actors_list.append(actor)

  def get_actors_list(self):
    return self.actors_list

  def set_imdbId(self, imbdId):
    # The parameter keeps its historical (misspelled) name so keyword callers still work;
    # the value is now stored on the correctly spelled attribute read by get_imdbId.
    self.imdbId = imbdId

  def get_imdbId(self):
    if self.imdbId is None:
      # The setter used to write to a misspelled `imbdId` attribute; objects created
      # (or pickled) with that version may still carry the value there.
      return getattr(self, "imbdId", None)
    return self.imdbId

  def set_tmdbId(self, tmdbId):
    self.tmdbId = tmdbId

  def get_tmdbId(self):
    return self.tmdbId

  def set_year(self, year):
    self.year = year

  def get_year(self):
    return self.year

  def get_tag_list(self):
    return self.tag_list

  def add_tag_to_tag_list(self, tag):
    self.tag_list.append(tag)

  def get_ID(self):
    return self.ID

  def set_ID(self, ID):
    self.ID = ID

  def set_title(self, title):
    self.title = title

  def get_title(self):
    return self.title

  def set_genre_list(self, genre_list):
    self.genre_list = genre_list

  def add_genre_to_genre_list(self, genre):
    self.genre_list.append(genre)

  def get_genre_list(self):
    return self.genre_list

  def get_number_of_ratings(self):
    return self.number_of_ratings

  def set_number_of_ratings(self, number):
    self.number_of_ratings = number

  def add_to_number_of_ratings(self):
    self.number_of_ratings += 1

  def get_sum_of_ratings(self):
    return self.sum_of_ratings

  def set_sum_of_ratings(self, number):
    self.sum_of_ratings = number

  def add_rating_to_sum_of_ratings(self, rating):
    self.sum_of_ratings += rating

  def calculate_average_rating(self):
    """Return sum_of_ratings / number_of_ratings, or 0 for a movie without ratings."""
    if self.number_of_ratings == 0:
      return 0
    else:
      return self.sum_of_ratings / self.number_of_ratings

In [None]:
class User:
  """One MovieLens user and the ratings they have given.

  Attributes:
      userID     -- ID in the MovieLens 25m dataset
      ratings_df -- pandas DataFrame holding this user's rows from ratings.csv
                    (the methods below read its "movieId" and "rating" columns)
  """

  def __init__(self, userID=None, ratings_df=None):
    self.userID = userID
    self.ratings_df = ratings_df if ratings_df is not None else pd.DataFrame()

  def get_userID(self):
    return self.userID

  def set_userID(self, ID : int):
    assert isinstance(ID, int), "ID must be an integer"
    self.userID = int(ID)

  def add_row_to_ratings_df(self, row):
    """Append one rating row unless the movie has already been rated.

    row -- mapping (dict or pandas Series) containing at least "movieId"

    Returns True when the row was added and False when a row with the same
    movieId already exists.
    """
    # A freshly created User holds a column-less DataFrame; indexing "movieId"
    # on it would raise KeyError, so check that the column exists first.
    already_rated = (
      "movieId" in self.ratings_df.columns
      and row["movieId"] in self.ratings_df["movieId"].values
    )
    if already_rated:
      return False

    new_row_df = pd.DataFrame([row])
    if self.ratings_df.empty:
      # Avoid concatenating with an empty frame (pandas deprecates that for dtype inference)
      self.ratings_df = new_row_df.reset_index(drop=True)
    else:
      self.ratings_df = pd.concat([self.ratings_df, new_row_df], ignore_index=True)

    return True

  def get_sorted_movies(self, rating_threshold):
    """Return the movieIds this user rated >= rating_threshold, highest rating first.

    The order among movies with equal ratings is whatever sort_values produces
    with its default sort algorithm.
    """
    filtered_df = self.ratings_df[self.ratings_df['rating'] >= rating_threshold]
    sorted_df = filtered_df.sort_values(by='rating', ascending=False)

    df_list = list(sorted_df["movieId"])

    return df_list

In [None]:
# Loading pickle file
file_path = '/content/allMovies_data.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    allMovies = pickle.load(f)
print(f"\nMovie dictionary loaded from {file_path}")


Movie dictionary loaded from /content/allMovies_data.pkl


In [None]:
# Verify loaded data
print(f"Loaded movie '1' title: {allMovies[1].get_title()}")
print(f"Loaded movie '1' genre list: {allMovies[1].get_genre_list()}")
print(f"Loaded movie '1' tags: {allMovies[1].get_tag_list()}")

Loaded movie '1' title: Toy Story (1995)
Loaded movie '1' genre list: ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']
Loaded movie '1' tags: ['Owned', 'imdb top 250', 'Pixar', 'Pixar', 'time travel', 'children', 'comedy', 'funny', 'witty', 'rated-G', 'animation', 'Pixar', 'computer animation', 'good cartoon chindren', 'pixar', 'friendship', 'bright', 'DARING RESCUES', 'fanciful', 'HEROIC MISSION', 'humorous', 'light', 'rousing', 'TOYS COME TO LIFE', 'UNLIKELY FRIENDSHIPS', 'warm', 'witty', 'animation', 'Disney', 'friendship', 'pixar', 'boy', 'boy next door', 'bullying', 'friends', 'friendship', 'jealousy', 'martial arts', 'mission', 'neighborhood', 'new toy', 'pixar', 'rescue', 'resourcefulness', 'rivalry', 'toy', 'toy comes to life', 'walkie talkie', 'clever', 'animation', 'clever', 'friendship', 'funny', 'Tom Hanks', 'witty', 'pixar', 'Pixar', 'toys', 'animation', 'fun', 'children', 'pixar', 'Tom Hanks', 'dolls', 'National Film Registry', 'adventure', 'animated', 'animati

In [None]:
# Loading pickle file
file_path = '/content/allUsers_data.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    allUsers = pickle.load(f)
print(f"\nUser dictionary loaded from {file_path}")


User dictionary loaded from /content/allUsers_data.pkl


In [None]:
# Verify loaded data
print(f"Loaded user '1' ratings_df: {allUsers[1].ratings_df.head()}")
print(f"Loaded user '1' movie list: {allUsers[1].get_sorted_movies(4.0)}")

Loaded user '1' ratings_df:    userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510
Loaded user '1' movie list: [296, 307, 665, 1237, 3569, 2632, 2692, 8154, 7361, 32591, 3949, 4325, 5767, 4144, 6016, 6711, 8327, 7234, 7940, 27266, 4973, 2351, 2843, 6370, 1653, 1088, 1250, 2573, 3448, 5147, 4703, 5952, 5878, 6377, 7209, 7365, 8360, 8973, 8786]


In [None]:
def popularity_based_recommendation(movie_dictionary):
  """Rank every movie by its plain average rating.

  movie_dictionary -- dict whose values expose get_ID() and calculate_average_rating()

  Returns a tuple of movie IDs, highest average rating first; ties are broken
  by the larger movie ID first. An empty dictionary yields an empty tuple.
  The average is unweighted, so a movie with a single top rating outranks a
  widely rated movie with a slightly lower mean.
  """
  scored_movies = [
    (movie.calculate_average_rating(), movie.get_ID())
    for movie in movie_dictionary.values()
  ]
  # Sorting (rating, id) pairs in reverse orders by rating first, then by id
  scored_movies.sort(reverse=True)

  return tuple(movie_id for _, movie_id in scored_movies)

def weighted_rating_popularity_based_recommendation(movie_dictionary, threshold):
  """Rank every movie by a weighted rating.

  WR = (v / (v + m)) * R + (m / (v + m)) * C, where
      v -- the movie's number of ratings
      m -- threshold, the number of ratings a movie needs before its own
           average dominates the global mean
      R -- the movie's average rating
      C -- the mean of all ratings across all movies

  movie_dictionary -- dict whose values expose get_ID(), get_number_of_ratings(),
                      get_sum_of_ratings() and calculate_average_rating()
  threshold        -- m in the formula; the original author suggests deriving it
                      from ratings_df.groupby('movieId').size().quantile(0.80),
                      with higher values (e.g. 20) typically advised

  Returns a tuple of movie IDs, highest weighted rating first; ties are broken
  by the larger movie ID first. An empty dictionary yields an empty tuple.
  """
  total_votes = 0
  total_rating_sum = 0
  for movie in movie_dictionary.values():
    votes = movie.get_number_of_ratings()
    if votes != 0:
      total_votes += votes
      total_rating_sum += movie.get_sum_of_ratings()

  # With no ratings at all there is no global mean; fall back to 0 instead of dividing by zero
  global_mean_rating = total_rating_sum / total_votes if total_votes else 0

  scored_movies = []
  for movie in movie_dictionary.values():
    votes = movie.get_number_of_ratings()
    denominator = votes + threshold
    if denominator == 0:
      # v = 0 and m = 0: the formula is undefined; its limit for m -> 0 with v = 0 is C
      weighted_rating = global_mean_rating
    else:
      weighted_rating = ((votes / denominator) * movie.calculate_average_rating()) + ((threshold / denominator) * global_mean_rating)
    scored_movies.append((weighted_rating, movie.get_ID()))

  # Sorting (score, id) pairs in reverse orders by score first, then by id
  scored_movies.sort(reverse=True)

  return tuple(movie_id for _, movie_id in scored_movies)

In [None]:
# Preview the ten highest-ranked movies from the plain average-rating model.
print("-----Popularity Model-----")
movies_recommended = popularity_based_recommendation(allMovies)
sneak_peak_popularity = movies_recommended[:10]
for ID in sneak_peak_popularity:
  print(f"title: {allMovies[ID].get_title()}")
  print(f"number of ratings: {allMovies[ID].get_number_of_ratings()}")
  print(f"average rating: {allMovies[ID].calculate_average_rating()}\n")

# Same preview for the weighted model; 20 is the vote threshold (m) of the weighted-rating formula.
# Note the printed "average rating" is still the plain mean, not the weighted score used for ranking.
print("-----Weighted Popularity Model-----")
movies_recommended_weighted = weighted_rating_popularity_based_recommendation(allMovies, 20)
sneak_peak_weighted_popularity = movies_recommended_weighted[:10]
for ID in sneak_peak_weighted_popularity:
  print(f"title: {allMovies[ID].get_title()}")
  print(f"number of ratings: {allMovies[ID].get_number_of_ratings()}")
  print(f"average rating: {allMovies[ID].calculate_average_rating()}\n")

-----Popularity Model-----
title: Santosh Subramaniam (2008)
number of ratings: 1
average rating: 5.0

title: Screen Play (1992)
number of ratings: 1
average rating: 5.0

title: Zana (2019)
number of ratings: 1
average rating: 5.0

title: Punk the Capital: Building a Sound Movement
number of ratings: 1
average rating: 5.0

title: Kaithi (2019)
number of ratings: 2
average rating: 5.0

title: War (2019)
number of ratings: 1
average rating: 5.0

title: Rolli and the Golden Key (2013)
number of ratings: 1
average rating: 5.0

title: The Head (2004)
number of ratings: 1
average rating: 5.0

title: Everything Is Terrible! Presents: The Great Satan (2017)
number of ratings: 1
average rating: 5.0

title: The Fallen of World War II (2015)
number of ratings: 1
average rating: 5.0

-----Weighted Popularity Model-----
title: Planet Earth II (2016)
number of ratings: 1124
average rating: 4.483096085409253

title: Planet Earth (2006)
number of ratings: 1747
average rating: 4.464796794504865

title:

The next cell loads the following pickled artifacts for collaborative filtering:
1. user_id_map
2. movie_id_map
3. user_item_matrix_csr
4. train_matrix_csr
5. test_matrix_csr
6. matrix_factorization_model

In [None]:
# Loading matrix factorization pickle file
file_path = '/content/collaborative_filtering/matrix_factorization_model.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    matrix_factorization_model = pickle.load(f)
print(f"\nMatrix factorization model loaded from {file_path}")

# Loading movie_id_map pickle file
file_path = '/content/collaborative_filtering/movie_id_map.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    movie_id_map = pickle.load(f)
print(f"\nmovie_id_map loaded from {file_path}")

# Loading user_id_map pickle file
file_path = '/content/collaborative_filtering/user_id_map.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    user_id_map = pickle.load(f)
print(f"\nuser_id_map loaded from {file_path}")

# Loading train_matrix_csr pickle file
file_path = '/content/collaborative_filtering/train_matrix_csr.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    train_matrix_csr = pickle.load(f)
print(f"\ntrain_matrix_csr loaded from {file_path}")

# Loading test_matrix_csr pickle file
file_path = '/content/collaborative_filtering/test_matrix_csr.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    test_matrix_csr = pickle.load(f)
print(f"\ntest_matrix_csr loaded from {file_path}")

# Loading user_item_matrix_csr pickle file
file_path = '/content/collaborative_filtering/user_item_matrix_csr.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    user_item_matrix_csr = pickle.load(f)
print(f"\nuser_item_matrix_csr loaded from {file_path}")


Matrix factorization model loaded from /content/collaborative_filtering/matrix_factorization_model.pkl

movie_id_map loaded from /content/collaborative_filtering/movie_id_map.pkl

user_id_map loaded from /content/collaborative_filtering/user_id_map.pkl

train_matrix_csr loaded from /content/collaborative_filtering/train_matrix_csr.pkl

test_matrix_csr loaded from /content/collaborative_filtering/test_matrix_csr.pkl

user_item_matrix_csr loaded from /content/collaborative_filtering/user_item_matrix_csr.pkl


In [None]:
def get_cosine_similarity_userX_to_allUsers(userId, user_id_map, user_item_matrix_csr):
  """Cosine similarity between one user's row and every row of the user-item matrix.

  userId               -- key into user_id_map
  user_id_map          -- dict of userId -> row index in user_item_matrix_csr
  user_item_matrix_csr -- matrix with one row per user

  Returns a 1-D array with one similarity per row. The user's own entry is
  overwritten with -1 so the user never ranks as their own nearest neighbour.
  """
  own_row = user_id_map[userId]
  # Slicing (rather than indexing) keeps the target as a 1-row 2-D matrix
  scores = cosine_similarity(user_item_matrix_csr[own_row : own_row + 1], user_item_matrix_csr).flatten()
  scores[own_row] = -1

  return scores

def get_top_X_similar_users(userId, user_id_map, user_item_matrix_csr, X):
  """Return the userIds of the X users most similar to userId, most similar first.

  userId               -- key into user_id_map
  user_id_map          -- dict of userId -> row index in user_item_matrix_csr
  user_item_matrix_csr -- matrix with one row per user
  X                    -- number of neighbours to return
  """
  # Invert the map so matrix row indices can be translated back to userIds
  row_to_user_id = {row: mapped_user for mapped_user, row in user_id_map.items()}
  scores = get_cosine_similarity_userX_to_allUsers(userId, user_id_map, user_item_matrix_csr)
  # argsort is ascending; reverse it and keep the first X row indices
  best_rows = np.argsort(scores)[::-1][:X]
  return [row_to_user_id[row] for row in best_rows]

# Smoke test for userId 1: print the similarity vector, then the 10 most similar userIds
print(get_cosine_similarity_userX_to_allUsers(1, user_id_map, user_item_matrix_csr))
print(get_top_X_similar_users(1, user_id_map, user_item_matrix_csr, 10))

[-1.          0.04086293  0.06130627 ...  0.01936879  0.04149169
  0.07084903]
[88296, 144596, 77503, 140236, 81513, 10563, 97640, 87389, 161802, 67907]


In [None]:
# Store our user and movie abstract feature (latent) matrices
user_factors = matrix_factorization_model.user_factors # Shape is num_users by num_factors
movie_factors = matrix_factorization_model.item_factors # Shape is num_items by num_factors

print(f"Learned User Factors Shape: {user_factors.shape}")
print(f"Learned Movie Factors Shape: {movie_factors.shape}")

Learned User Factors Shape: (162541, 50)
Learned Movie Factors Shape: (59047, 50)


In [None]:
def evaluate_popularity_based_recommender(allUsers, allMovies, k):
  """Evaluate the plain average-rating recommender and print its metrics.

  allUsers  -- dict of userId -> user object to evaluate against
  allMovies -- movie dictionary passed to popularity_based_recommendation
  k         -- cut-off used by every metric

  Returns (precision, recall, f1, NDCG, hit_count, num_users).
  """
  ranking = popularity_based_recommendation(allMovies)
  # The model is not personalised: every user receives the very same ranking object
  allPredictions = dict.fromkeys(allUsers, ranking)

  metrics = evaluate_model(allUsers, allPredictions, k)
  precision, recall, f1, NDCG, hit_rate, num_users = metrics

  print("---Popularity Based Recommender---")
  labels = ("precision", "recall", "f1-score", "NDCG", "hit rate", "number of users")
  for label, value in zip(labels, metrics):
    print(f"{label}: {value}")

  return precision, recall, f1, NDCG, hit_rate, num_users

def evaluate_weighted_popularity_based_recommender(allUsers, allMovies, threshold, k):
  """Evaluate the weighted-rating recommender and print its metrics.

  allUsers  -- dict of userId -> user object to evaluate against
  allMovies -- movie dictionary passed to weighted_rating_popularity_based_recommendation
  threshold -- vote threshold forwarded to the recommender
  k         -- cut-off used by every metric

  Returns (precision, recall, f1, NDCG, hit_count, num_users).
  """
  ranking = weighted_rating_popularity_based_recommendation(allMovies, threshold)
  # The model is not personalised: every user receives the very same ranking object
  allPredictions = dict.fromkeys(allUsers, ranking)

  metrics = evaluate_model(allUsers, allPredictions, k)
  precision, recall, f1, NDCG, hit_rate, num_users = metrics

  print("---Weighted Popularity Based Recommender---")
  labels = ("precision", "recall", "f1-score", "NDCG", "hit rate", "number of users")
  for label, value in zip(labels, metrics):
    print(f"{label}: {value}")

  return precision, recall, f1, NDCG, hit_rate, num_users

def evaluate_cosine_similarity(sample_allUsers, allUsers, allMovies, user_id_map, user_item_matrix_csr, k):
  """Evaluate a nearest-neighbour recommender and print its metrics.

  Each sampled user is recommended the movies that their single most similar
  user (by cosine similarity) rated at least 4.0, highest rating first.

  sample_allUsers      -- dict of userId -> user object to evaluate
  allUsers             -- dict of userId -> user object used to look up the neighbour
  allMovies            -- not used by this function
  user_id_map          -- dict of userId -> row index in user_item_matrix_csr
  user_item_matrix_csr -- matrix with one row per user
  k                    -- cut-off used by every metric

  Returns (precision, recall, f1, NDCG, hit_count, num_users).
  """
  allPredictions = {
    userId: allUsers[
      get_top_X_similar_users(userId, user_id_map, user_item_matrix_csr, 1)[0]
    ].get_sorted_movies(4.0)
    for userId in sample_allUsers
  }

  metrics = evaluate_model(sample_allUsers, allPredictions, k)
  precision, recall, f1, NDCG, hit_rate, num_users = metrics

  print("---Cosine Similarity Recommender---")
  labels = ("precision", "recall", "f1-score", "NDCG", "hit rate", "number of users")
  for label, value in zip(labels, metrics):
    print(f"{label}: {value}")

  return precision, recall, f1, NDCG, hit_rate, num_users

def evaluate_matrix_factorization_model(sample_allUsers, user_id_map, movie_id_map, user_factors, movie_factors, allMovies, k):
  """Evaluate the matrix-factorization recommender and print its metrics.

  Every movie present in both allMovies and movie_id_map is scored for each
  sampled user with the dot product of the user's and the movie's latent
  vectors; movies are then ranked by descending score (ties: larger movieId first).

  sample_allUsers -- dict of userId -> user object to evaluate
  user_id_map     -- dict of userId -> row index in user_factors
  movie_id_map    -- dict of movieId -> row index in movie_factors
  user_factors    -- array of shape (num_users, num_factors)
  movie_factors   -- array of shape (num_items, num_factors)
  allMovies       -- dict keyed by movieId; only its keys are used
  k               -- cut-off used by every metric

  Returns (precision, recall, f1, NDCG, hit_count, num_users).

  NOTE(review): movies the user already rated are not excluded from the
  ranking, and the ground truth comes from each user's full rating history —
  confirm this does not overlap with the data the model was trained on.
  """
  # Candidate movies and their factor rows are identical for every user, so build them once
  candidate_movie_ids = [movieId for movieId in allMovies if movieId in movie_id_map]
  candidate_ids_array = np.asarray(candidate_movie_ids)
  candidate_rows = np.asarray([movie_id_map[movieId] for movieId in candidate_movie_ids], dtype=np.intp)
  candidate_factors = np.asarray(movie_factors)[candidate_rows]
  user_factor_matrix = np.asarray(user_factors)

  allPredictions = {}
  for userId in sample_allUsers:
    # One matrix-vector product replaces one np.dot call per movie.
    # Scores may differ from per-pair dot products in the last floating-point bits.
    scores = candidate_factors @ user_factor_matrix[user_id_map[userId]]
    # lexsort uses its last key as the primary key; reversing the ascending order
    # yields descending score with ties broken by descending movieId
    ranking = np.lexsort((candidate_ids_array, scores))[::-1]
    allPredictions[userId] = candidate_ids_array[ranking].tolist()

  precision, recall, f1, NDCG, hit_rate, num_users = evaluate_model(sample_allUsers, allPredictions, k)
  print("---Matrix Factorization Recommender---")
  print(f"precision: {precision}")
  print(f"recall: {recall}")
  print(f"f1-score: {f1}")
  print(f"NDCG: {NDCG}")
  print(f"hit rate: {hit_rate}")
  print(f"number of users: {num_users}")
  return precision, recall, f1, NDCG, hit_rate, num_users

In [None]:
threshold = 20 # Weighted popularity model uses this
k = 20
SAMPLE_SEED = 42 # Fixed seed so the same users are sampled on every run

# Keep only users with at least k movies rated >= 4.0, so every evaluated user
# has enough relevant movies for the top-k metrics
allUsers_rating_restricted = {}
for userId in allUsers:
  if len(allUsers[userId].get_sorted_movies(4.0)) >= k:
    allUsers_rating_restricted[userId] = allUsers[userId]

num_users_to_sample = 1000
available_users = len(allUsers_rating_restricted)
# Ensure we don't try to sample more users than available
if num_users_to_sample > available_users:
    # Report the requested size before it is overwritten with the available count
    print(f"Warning: Desired sample size ({num_users_to_sample}) is greater than available restricted users. Sampling all {available_users} users.")
    num_users_to_sample = available_users

# Get the list of user IDs from the restricted set
restricted_user_ids = list(allUsers_rating_restricted.keys())

# Randomly select N user IDs from the restricted set; a dedicated seeded generator
# keeps the sample reproducible without touching the global random state
sampled_user_ids = random.Random(SAMPLE_SEED).sample(restricted_user_ids, num_users_to_sample)

# Reconstruct the 'sample_allUsers' dictionary containing only the sampled user objects
sample_allUsers = {userId: allUsers_rating_restricted[userId] for userId in sampled_user_ids}
# Print a short summary instead of dumping every User object's repr
print(f"Sampled {len(sample_allUsers)} of {available_users} eligible users (seed={SAMPLE_SEED})")

{71651: <__main__.User object at 0x7ab89a91a790>, 15139: <__main__.User object at 0x7ab8b1872710>, 153231: <__main__.User object at 0x7ab87a39c5d0>, 58744: <__main__.User object at 0x7ab89f5569d0>, 21381: <__main__.User object at 0x7ab8af1c3710>, 42385: <__main__.User object at 0x7ab8a7500d50>, 100707: <__main__.User object at 0x7ab88de69150>, 88887: <__main__.User object at 0x7ab894449510>, 81260: <__main__.User object at 0x7ab896f89150>, 101596: <__main__.User object at 0x7ab88d7aec50>, 30907: <__main__.User object at 0x7ab8ab88ab90>, 140866: <__main__.User object at 0x7ab87edde0d0>, 111502: <__main__.User object at 0x7ab889cc9090>, 66882: <__main__.User object at 0x7ab89c5149d0>, 64581: <__main__.User object at 0x7ab89d194b10>, 129122: <__main__.User object at 0x7ab8833d56d0>, 85819: <__main__.User object at 0x7ab895640190>, 70266: <__main__.User object at 0x7ab89b0f9790>, 79322: <__main__.User object at 0x7ab897c1fc10>, 15525: <__main__.User object at 0x7ab8b14ccc10>, 154146: <__ma

In [None]:
# Inspect the sample: its size, the sampled userIds, and each user's movies rated >= 4.0.
# NOTE(review): this prints one list per sampled user, which produces very long
# output — consider limiting it to the first few users.
print(len(sample_allUsers))
print(sample_allUsers.keys())
for userId in sample_allUsers:
  print(sample_allUsers[userId].get_sorted_movies(4.0))

1000
dict_keys([71651, 15139, 153231, 58744, 21381, 42385, 100707, 88887, 81260, 101596, 30907, 140866, 111502, 66882, 64581, 129122, 85819, 70266, 79322, 15525, 154146, 10228, 137232, 126909, 120197, 6414, 101855, 69704, 113080, 150579, 34573, 108168, 159816, 57730, 112514, 27252, 92605, 90307, 8920, 112219, 107501, 80014, 152353, 101935, 130118, 121883, 15116, 83046, 55814, 18731, 71787, 81509, 42466, 94899, 53309, 111765, 150075, 127395, 105808, 153730, 65470, 51268, 94323, 162079, 53585, 45606, 142521, 58667, 150786, 77438, 138194, 33651, 77514, 35446, 134561, 95859, 48673, 2973, 141355, 76468, 55148, 132285, 117126, 137833, 3191, 110401, 7450, 26325, 52774, 61392, 398, 106610, 90107, 146792, 55924, 52246, 73588, 25421, 25304, 82551, 35338, 28641, 30745, 5567, 138388, 149009, 96047, 62257, 113157, 133768, 62424, 33009, 101940, 123894, 112204, 86141, 128600, 10507, 53816, 116052, 146686, 152159, 122791, 42492, 123829, 127881, 51602, 136823, 139000, 26249, 124003, 157998, 4957, 14610

In [None]:
# Benchmark all four recommenders on the same user sample with the same cut-off k.
# Each call prints its own metrics; the tuple returned by the last call is also
# shown as the cell's output.
evaluate_popularity_based_recommender(sample_allUsers, allMovies, k)
evaluate_weighted_popularity_based_recommender(sample_allUsers, allMovies, threshold, k)
evaluate_cosine_similarity(sample_allUsers, allUsers, allMovies, user_id_map, user_item_matrix_csr, k)
evaluate_matrix_factorization_model(sample_allUsers, user_id_map, movie_id_map, user_factors, movie_factors, allMovies, k)

---Popularity Based Recommender---
precision: 0.0
recall: 0.0
f1-score: 0.0
NDCG: 0.0
hit rate: 0
number of users: 1000
---Weighted Popularity Based Recommender---
precision: 0.13534999999999978
recall: 0.03934000491895977
f1-score: 0.054100287488269096
NDCG: 0.12819776412283698
hit rate: 790
number of users: 1000
---Cosine Similarity Recommender---
precision: 0.4635000000000002
recall: 0.1808938457766022
f1-score: 0.23593938434488718
NDCG: 0.47956873025321833
hit rate: 997
number of users: 1000
---Matrix Factorization Recommender---
precision: 0.6039
recall: 0.22302923237117717
f1-score: 0.29453959550428555
NDCG: 0.6400806950814482
hit rate: 999
number of users: 1000


(0.6039,
 0.22302923237117717,
 0.29453959550428555,
 0.6400806950814482,
 999,
 1000)