<a href="https://colab.research.google.com/github/yckamra/MovieLens25m-Recommender/blob/main/Benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages this notebook relies on via shell `pip` calls.
# NOTE(review): no versions are pinned here, so a fresh run may pick up different
# releases than the ones used when the notebook was last executed — consider pinning.
print("Installing project dependencies...")
!pip install numpy pandas scipy scikit-learn # Basic libraries we want
!pip install implicit # Installs implicit library for matrix factorization
#!pip install torch
print("Dependencies installed successfully.")

Installing project dependencies...
Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Dependencies installed successfully.


In [2]:
import math
from google.colab import auth
from google.cloud import storage
import zipfile
import os
import pickle
import pandas as pd
import numpy as np
import implicit
import json

In [3]:
def precision_k(predictionList, actualList, k): # predictionList and actualList is List of movieIds
  """Precision@k: share of the top-k recommendations that the user actually has.

  Args:
    predictionList: Ranked movie IDs (best first); repeated IDs keep only their
      first occurrence.
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank; also the denominator, even when fewer than k distinct
      predictions are available.

  Returns:
    0 when k <= 0, otherwise hits / k.
  """
  # dict.fromkeys drops duplicates while keeping first-seen order.
  top_k = list(dict.fromkeys(predictionList))[:k]
  relevant = list(dict.fromkeys(actualList))

  # Guard against dividing by zero (or a negative cut-off).
  if k <= 0:
    return 0

  recommended = set(top_k)
  hits = sum(1 for movie in relevant if movie in recommended)

  return hits / k

In [4]:
def recall_k(predictionList, actualList, k):
  """Recall@k: share of the user's relevant movies found in the top-k recommendations.

  Args:
    predictionList: Ranked movie IDs (best first); repeated IDs keep only their
      first occurrence.
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank applied to the de-duplicated predictions.

  Returns:
    0 when there are no relevant movies, otherwise hits / number of distinct
    relevant movies.
  """
  # dict.fromkeys drops duplicates while keeping first-seen order.
  top_k = list(dict.fromkeys(predictionList))[:k]
  relevant = list(dict.fromkeys(actualList))

  # Guard against dividing by zero.
  if not relevant:
    return 0

  recommended = set(top_k)
  hits = len([movie for movie in relevant if movie in recommended])

  return hits / len(relevant)

In [5]:
def f1_k(predictionList, actualList, k):
  """F1@k: harmonic mean of precision@k and recall@k.

  Args:
    predictionList: Ranked movie IDs (best first).
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank passed through to precision_k and recall_k.

  Returns:
    0 when precision and recall are both zero (no hits, k <= 0, or no relevant
    movies), otherwise 2 * precision * recall / (precision + recall).
  """
  precision = precision_k(predictionList, actualList, k)
  recall = recall_k(predictionList, actualList, k)

  # Both metrics are non-negative, so a zero sum means there were no hits at all.
  # The harmonic mean is undefined there; report 0 instead of dividing by zero.
  if precision + recall == 0:
    return 0

  f1_score = 2 * ((precision * recall) / (precision + recall))

  return f1_score

In [6]:
def DCG_k(predictionList, actualList, k): # This form of DCG uses binary relevance (1 if rated and 0 if not), and could use ratings for improved rankings
  """Discounted Cumulative Gain at k with binary relevance.

  Each of the first k distinct predictions contributes 1 / log2(rank + 1) when
  it appears in actualList and 0 otherwise (rank starts at 1).

  Args:
    predictionList: Ranked movie IDs (best first); repeated IDs keep only their
      first occurrence.
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank; clamped to the number of distinct predictions.

  Returns:
    The summed discounted gain (0 when nothing is scored).
  """
  # dict.fromkeys drops duplicates while keeping first-seen order.
  top_k = list(dict.fromkeys(predictionList))[:k]

  # A set gives constant-time membership checks; the previous list lookup
  # re-scanned every relevant movie for each ranked prediction.
  relevant = set(actualList)

  # There may be fewer than k distinct predictions to score.
  cutoff = min(k, len(top_k))

  return sum(
    (1 if top_k[rank - 1] in relevant else 0) / math.log2(rank + 1)
    for rank in range(1, cutoff + 1) # If cutoff = 5, then rank = 1, 2, 3, 4, 5
  )

In [7]:
def IDCG_k(predictionList, actualList, k):
  """Ideal DCG at k under binary relevance.

  The best possible ranking places every relevant movie first, so the ideal
  score is the sum of 1 / log2(rank + 1) over the first min(k, number of
  distinct relevant movies) ranks.

  Args:
    predictionList: Not used; present so the signature mirrors DCG_k.
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank.

  Returns:
    The ideal discounted gain (0 when there is nothing to rank).
  """
  relevant_count = len(dict.fromkeys(actualList)) # Distinct relevant movies

  cutoff = k
  if cutoff > relevant_count:
    cutoff = relevant_count

  # Every ideal position holds a relevant movie, so each gain is 1.
  return sum(1 / math.log2(rank + 1) for rank in range(1, cutoff + 1))

In [8]:
def NDCG_k(predictionList, actualList, k): # Normalized Discounted Cumulative Gain
  """NDCG@k: DCG_k divided by IDCG_k for the same inputs.

  Args:
    predictionList: Ranked movie IDs (best first).
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank.

  Returns:
    0 when the ideal gain is 0, otherwise DCG / IDCG.
  """
  gain = DCG_k(predictionList, actualList, k)
  ideal_gain = IDCG_k(predictionList, actualList, k)

  # An ideal gain of zero would make the ratio undefined.
  return 0 if ideal_gain == 0 else gain / ideal_gain

In [9]:
def contains_relevant_item(predictionList, actualList, k):
  """Report whether any of the top-k recommendations is a relevant movie.

  Args:
    predictionList: Ranked movie IDs (best first); repeated IDs keep only their
      first occurrence.
    actualList: Movie IDs that count as relevant for the user.
    k: Cut-off rank applied to the de-duplicated predictions.

  Returns:
    True if at least one relevant movie is among the top-k, else False.
  """
  # dict.fromkeys drops duplicates while keeping first-seen order.
  recommended = set(list(dict.fromkeys(predictionList))[:k])
  relevant = dict.fromkeys(actualList)

  return any(movie in recommended for movie in relevant)

In [10]:
def mean_precision_k(allUsers, allPredictions, k):
  """Average precision_k over every user in allUsers.

  Args:
    allUsers: Dictionary of (userId, userObject) pairs; each user object's
      `.movies` attribute is passed to precision_k as the relevant list.
      NOTE(review): the User class defined in this notebook only sets `userID`
      and `ratings_df` — confirm the objects passed here really expose `.movies`.
    allPredictions: Dictionary of (userId, predictionList) pairs, indexed with
      the same user IDs as allUsers.
    k: Cut-off rank.

  Returns:
    0 when allUsers is empty, otherwise the mean of the per-user precisions.
  """
  per_user = [
    precision_k(allPredictions[user_id], allUsers[user_id].movies, k)
    for user_id in allUsers
  ]

  if not per_user:
    return 0

  return sum(per_user) / len(per_user)

In [11]:
def mean_recall_k(allUsers, allPredictions, k):
  """Average recall_k over every user in allUsers.

  Args:
    allUsers: Dictionary of (userId, userObject) pairs; each user object's
      `.movies` attribute is passed to recall_k as the relevant list.
      NOTE(review): the User class defined in this notebook only sets `userID`
      and `ratings_df` — confirm the objects passed here really expose `.movies`.
    allPredictions: Dictionary of (userId, predictionList) pairs, indexed with
      the same user IDs as allUsers.
    k: Cut-off rank.

  Returns:
    0 when allUsers is empty, otherwise the mean of the per-user recalls.
  """
  per_user = [
    recall_k(allPredictions[user_id], allUsers[user_id].movies, k)
    for user_id in allUsers
  ]

  if not per_user:
    return 0

  return sum(per_user) / len(per_user)

In [12]:
def mean_f1_k(allUsers, allPredictions, k):
  """Average f1_k over every user in allUsers.

  Args:
    allUsers: Dictionary of (userId, userObject) pairs; each user object's
      `.movies` attribute is passed to f1_k as the relevant list.
      NOTE(review): the User class defined in this notebook only sets `userID`
      and `ratings_df` — confirm the objects passed here really expose `.movies`.
    allPredictions: Dictionary of (userId, predictionList) pairs, indexed with
      the same user IDs as allUsers.
    k: Cut-off rank.

  Returns:
    0 when allUsers is empty, otherwise the mean of the per-user F1 scores.
  """
  per_user = [
    f1_k(allPredictions[user_id], allUsers[user_id].movies, k)
    for user_id in allUsers
  ]

  if not per_user:
    return 0

  return sum(per_user) / len(per_user)

In [13]:
def mean_NDCG_k(allUsers, allPredictions, k):
  """Average NDCG_k over every user in allUsers.

  Args:
    allUsers: Dictionary of (userId, userObject) pairs; each user object's
      `.movies` attribute is passed to NDCG_k as the relevant list.
      NOTE(review): the User class defined in this notebook only sets `userID`
      and `ratings_df` — confirm the objects passed here really expose `.movies`.
    allPredictions: Dictionary of (userId, predictionList) pairs, indexed with
      the same user IDs as allUsers.
    k: Cut-off rank.

  Returns:
    0 when allUsers is empty, otherwise the mean of the per-user NDCG scores.
  """
  per_user = [
    NDCG_k(allPredictions[user_id], allUsers[user_id].movies, k)
    for user_id in allUsers
  ]

  if not per_user:
    return 0

  return sum(per_user) / len(per_user)

In [14]:
def total_hit_rate(allUsers, allPredictions, k):
  """Count the users whose top-k recommendations contain a relevant movie.

  Args:
    allUsers: Dictionary of (userId, userObject) pairs; each user object's
      `.movies` attribute is passed to contains_relevant_item.
      NOTE(review): the User class defined in this notebook only sets `userID`
      and `ratings_df` — confirm the objects passed here really expose `.movies`.
    allPredictions: Dictionary of (userId, predictionList) pairs, indexed with
      the same user IDs as allUsers.
    k: Cut-off rank.

  Returns:
    A (hits, num_users) tuple. Despite the function name this is a raw count,
    not a ratio; the caller divides if it wants a rate.
  """
  outcomes = [
    contains_relevant_item(allPredictions[user_id], allUsers[user_id].movies, k)
    for user_id in allUsers
  ]

  hits = sum(1 for outcome in outcomes if outcome)

  return hits, len(outcomes)

In [15]:
def evaluate_model(allUsers, allPredictions, k):
  """Run every benchmark metric for one set of predictions.

  Args:
    allUsers: Dictionary of (userId, userObject) pairs.
    allPredictions: Per-user predictions, indexed by user ID by the metric
      functions called below.
    k: Cut-off rank shared by all metrics.

  Returns:
    A tuple (precision, recall, f1, NDCG, hit_rate, num_users) where the first
    four are per-user means and the last two come from total_hit_rate.
  """
  mean_metrics = (mean_precision_k, mean_recall_k, mean_f1_k, mean_NDCG_k)
  mean_scores = tuple(metric(allUsers, allPredictions, k) for metric in mean_metrics)

  hit_rate, num_users = total_hit_rate(allUsers, allPredictions, k)

  return (*mean_scores, hit_rate, num_users)

In [16]:
# Authenticate this Colab session before running the gcloud/gsutil commands below.
auth.authenticate_user()

# Select the Google Cloud project used by the following gcloud/gsutil calls.
!gcloud config set project movielens-recommender-451017

# Copy the pickled movie and user dictionaries from the bucket into /content/.
!gsutil cp gs://movielens-data/allMovies_data.pkl /content/
!gsutil cp gs://movielens-data/allUsers_data.pkl /content/

Updated property [core/project].
Copying gs://movielens-data/allMovies_data.pkl...
/ [1 files][ 19.0 MiB/ 19.0 MiB]                                                
Operation completed over 1 objects/19.0 MiB.                                     
Copying gs://movielens-data/allUsers_data.pkl...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1 files][995.9 MiB/995.9 MiB]   40.1 MiB/s                                   
Operation completed over 1 objects/995.9 MiB.                                    


In [17]:
# List /content/ with sizes to confirm both pickle files were downloaded.
!ls -lh /content/

total 1015M
-rw-r--r-- 1 root root  19M Jun 17 16:55 allMovies_data.pkl
-rw-r--r-- 1 root root 996M Jun 17 16:55 allUsers_data.pkl
drwxr-xr-x 1 root root 4.0K Jun 16 13:32 sample_data


In [18]:
class Movie:
  """Container for one movie's metadata and aggregate rating counters.

  Attributes:
    ID: Movie identifier.
    title: Movie title.
    genre_list: List of genres.
    tag_list: List of tags (may contain repeats; tags are appended as given).
    number_of_ratings: Total amount of times this movie has been rated.
    sum_of_ratings: Summation of all the ratings from all users for this movie.
    year: Release year.
    imdbId: IMDb identifier.
    tmdbId: TMDB identifier.
    actors_list: List of actors.
    tmdb_tag_list: List of TMDB tags.
  """

  def __init__(self):
    self.ID = None
    self.title = None
    self.genre_list = []
    self.tag_list = []
    self.number_of_ratings = 0
    self.sum_of_ratings = 0
    self.year = None
    self.imdbId = None
    self.tmdbId = None
    self.actors_list = []
    self.tmdb_tag_list = []

  def set_tmdb_tag_list(self, tags):
    """Replace the TMDB tag list with `tags`."""
    self.tmdb_tag_list = tags

  def add_to_tmdb_tag_list(self, tags):
    """Extend the TMDB tags with the list `tags` (builds a new list)."""
    self.tmdb_tag_list = self.tmdb_tag_list + tags

  def get_tmdb_tag_list(self):
    """Return the TMDB tag list."""
    return self.tmdb_tag_list

  def set_actors_list(self, actors_list):
    """Replace the actors list with `actors_list`."""
    self.actors_list = actors_list

  def add_actor_to_actors_list(self, actor):
    """Append a single actor to the actors list."""
    self.actors_list.append(actor)

  def get_actors_list(self):
    """Return the actors list."""
    return self.actors_list

  def set_imdbId(self, imbdId):
    """Store the IMDb identifier.

    The parameter keeps its historical (misspelled) name so keyword callers
    still work. The value is now written to `imdbId`; it used to be written to
    a misspelled `imbdId` attribute, which get_imdbId never read.
    """
    self.imdbId = imbdId

  def get_imdbId(self):
    """Return the IMDb identifier, or None when it was never set."""
    if self.imdbId is None:
      # Instances populated (or pickled) before the setter typo was fixed hold
      # the value under the misspelled `imbdId` attribute; fall back to it.
      return getattr(self, "imbdId", None)
    return self.imdbId

  def set_tmdbId(self, tmdbId):
    """Store the TMDB identifier."""
    self.tmdbId = tmdbId

  def get_tmdbId(self):
    """Return the TMDB identifier."""
    return self.tmdbId

  def set_year(self, year):
    """Store the release year."""
    self.year = year

  def get_year(self):
    """Return the release year."""
    return self.year

  def get_tag_list(self):
    """Return the tag list."""
    return self.tag_list

  def add_tag_to_tag_list(self, tag):
    """Append a single tag to the tag list."""
    self.tag_list.append(tag)

  def get_ID(self):
    """Return the movie identifier."""
    return self.ID

  def set_ID(self, ID):
    """Store the movie identifier."""
    self.ID = ID

  def set_title(self, title):
    """Store the movie title."""
    self.title = title

  def get_title(self):
    """Return the movie title."""
    return self.title

  def set_genre_list(self, genre_list):
    """Replace the genre list with `genre_list`."""
    self.genre_list = genre_list

  def add_genre_to_genre_list(self, genre):
    """Append a single genre to the genre list."""
    self.genre_list.append(genre)

  def get_genre_list(self):
    """Return the genre list."""
    return self.genre_list

  def get_number_of_ratings(self):
    """Return how many ratings have been counted for this movie."""
    return self.number_of_ratings

  def set_number_of_ratings(self, number):
    """Overwrite the rating count."""
    self.number_of_ratings = number

  def add_to_number_of_ratings(self):
    """Increase the rating count by one."""
    self.number_of_ratings += 1

  def get_sum_of_ratings(self):
    """Return the running sum of all ratings."""
    return self.sum_of_ratings

  def set_sum_of_ratings(self, number):
    """Overwrite the running sum of ratings."""
    self.sum_of_ratings = number

  def add_rating_to_sum_of_ratings(self, rating):
    """Add one rating value to the running sum."""
    self.sum_of_ratings += rating

  def calculate_average_rating(self):
    """Return sum_of_ratings / number_of_ratings, or 0 for an unrated movie."""
    if self.number_of_ratings == 0:
      return 0
    else:
      return self.sum_of_ratings / self.number_of_ratings

In [19]:
class User:
  """One MovieLens user together with the ratings they submitted.

  Attributes:
    userID: ID in MovieLens 25m dataset.
    ratings_df: pandas dataframe containing the user's rows within the
      MovieLens 25m ratings.csv (an empty DataFrame when none were given).
  """

  def __init__(self, userID=None, ratings_df=None):
    self.userID = userID
    self.ratings_df = ratings_df if ratings_df is not None else pd.DataFrame()

  def get_userID(self):
    """Return the user's ID."""
    return self.userID

  def set_userID(self, ID : int):
    """Store the user's ID as a plain Python int.

    NumPy integers (what pandas returns when an ID is read out of a DataFrame)
    are accepted in addition to built-in ints; anything else fails the assertion.
    """
    assert isinstance(ID, (int, np.integer)), "ID must be an integer"
    self.userID = int(ID)

  def add_row_to_ratings_df(self, row):
    """Append one rating row unless that movie is already rated.

    Args:
      row: Mapping or Series with at least a "movieId" entry.

    Returns:
      False when a row for the same movieId already exists (nothing changes),
      True when the row was added. Success used to return None.
    """
    new_row = pd.DataFrame([row])

    # A fresh user starts with an empty frame that has no "movieId" column to
    # check against (looking it up raised KeyError), so the first row simply
    # becomes the frame.
    if self.ratings_df.empty:
      self.ratings_df = new_row.reset_index(drop=True)
      return True

    if row["movieId"] in self.ratings_df["movieId"].values:
      return False

    self.ratings_df = pd.concat([self.ratings_df, new_row], ignore_index=True)

    return True

In [20]:
# Load the pickled movie dictionary that was copied into /content/.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load files that come from a trusted source.
# NOTE(review): presumably the pickle stores Movie instances (the next cell calls
# Movie getters on the entries), so the Movie class must already be defined — confirm.
file_path = '/content/allMovies_data.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    allMovies = pickle.load(f)
print(f"\nMovie dictionary loaded from {file_path}")


Movie dictionary loaded from /content/allMovies_data.pkl


In [21]:
# Verify loaded data: print title, genres and tags of the entry stored under key 1.
print(f"Loaded movie '1' title: {allMovies[1].get_title()}")
print(f"Loaded movie '1' genre list: {allMovies[1].get_genre_list()}")
print(f"Loaded movie '1' tags: {allMovies[1].get_tag_list()}")

Loaded movie '1' title: Toy Story (1995)
Loaded movie '1' genre list: ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']
Loaded movie '1' tags: ['Owned', 'imdb top 250', 'Pixar', 'Pixar', 'time travel', 'children', 'comedy', 'funny', 'witty', 'rated-G', 'animation', 'Pixar', 'computer animation', 'good cartoon chindren', 'pixar', 'friendship', 'bright', 'DARING RESCUES', 'fanciful', 'HEROIC MISSION', 'humorous', 'light', 'rousing', 'TOYS COME TO LIFE', 'UNLIKELY FRIENDSHIPS', 'warm', 'witty', 'animation', 'Disney', 'friendship', 'pixar', 'boy', 'boy next door', 'bullying', 'friends', 'friendship', 'jealousy', 'martial arts', 'mission', 'neighborhood', 'new toy', 'pixar', 'rescue', 'resourcefulness', 'rivalry', 'toy', 'toy comes to life', 'walkie talkie', 'clever', 'animation', 'clever', 'friendship', 'funny', 'Tom Hanks', 'witty', 'pixar', 'Pixar', 'toys', 'animation', 'fun', 'children', 'pixar', 'Tom Hanks', 'dolls', 'National Film Registry', 'adventure', 'animated', 'animati

In [25]:
# Load the pickled user dictionary that was copied into /content/.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load files that come from a trusted source.
# NOTE(review): presumably the pickle stores User instances (the next cell reads
# `.ratings_df` on an entry), so the User class must already be defined — confirm.
file_path = '/content/allUsers_data.pkl'
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    allUsers = pickle.load(f)
print(f"\nUser dictionary loaded from {file_path}")


User dictionary loaded from /content/allUsers_data.pkl


In [26]:
# Verify loaded data: show the first rows of the ratings frame stored under key 1.
print(f"Loaded user '1' ratings_df: {allUsers[1].ratings_df.head()}")

Loaded movie '1' ratings_df:    userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [None]:
def popularity_based_recommendation(movie_dictionary):
  """Rank every movie by its plain average rating, best first.

  Args:
    movie_dictionary: Dictionary whose values expose get_ID() and
      calculate_average_rating().

  Returns:
    A tuple of movie IDs in descending order of average rating; ties are broken
    by descending movie ID. An empty dictionary yields an empty tuple (the
    previous zip-based unpacking raised ValueError in that case).
  """
  scored = [
    (movie.calculate_average_rating(), movie.get_ID())
    for movie in movie_dictionary.values()
  ]

  # Tuples compare element-wise: rating first, then ID as the tie-breaker.
  scored.sort(reverse=True)

  return tuple(movie_id for _, movie_id in scored) # Returns the movie IDs in descending order (most popular first)

def weighted_rating_popularity_based_recommendation(movie_dictionary, threshold):
  """Rank every movie by a vote-weighted rating, best first.

  Weighted Rating: WR = (v/(v+m))R + (m/(v+m))C, where v is the movie's number
  of ratings, R its average rating, m the threshold and C the mean of all
  ratings across all movies.

  Args:
    movie_dictionary: Dictionary whose values expose get_ID(),
      get_number_of_ratings(), get_sum_of_ratings() and
      calculate_average_rating().
    threshold: The number of ratings a movie needs to be considered and not just
      averaged out (m). It should be
      ratings_df.groupby('movieId').size().quantile(0.80), so a higher threshold
      is typically advised (ie. 20).

  Returns:
    A tuple of movie IDs in descending order of weighted rating; ties are broken
    by descending movie ID. An empty dictionary yields an empty tuple.
  """
  movies = list(movie_dictionary.values())

  rated_movies = [movie for movie in movies if movie.get_number_of_ratings() != 0]
  number_of_ratings = sum(movie.get_number_of_ratings() for movie in rated_movies)
  sum_of_ratings = sum(movie.get_sum_of_ratings() for movie in rated_movies)

  # With no ratings anywhere there is no meaningful global mean; use 0 instead
  # of dividing by zero.
  global_mean_rating = sum_of_ratings / number_of_ratings if number_of_ratings else 0

  scored = []
  for movie in movies:
    curr_movie_num_ratings = movie.get_number_of_ratings()
    total_weight = curr_movie_num_ratings + threshold

    if total_weight == 0:
      # Unrated movie with a zero threshold: the formula is undefined, so fall
      # back to the global mean.
      weighted_rating = global_mean_rating
    else:
      weighted_rating = ((curr_movie_num_ratings / total_weight) * movie.calculate_average_rating()) + ((threshold / total_weight) * global_mean_rating)

    scored.append((weighted_rating, movie.get_ID()))

  # Tuples compare element-wise: weighted rating first, then ID as the tie-breaker.
  scored.sort(reverse=True)

  return tuple(movie_id for _, movie_id in scored) # Returns the movie IDs in descending order (most popular first)

In [None]:
# Preview the ten highest-ranked movies from each popularity baseline.

# Baseline 1: rank by plain average rating.
print("-----Popularity Model-----")
movies_recommended = popularity_based_recommendation(allMovies)
sneak_peak_popularity = movies_recommended[:10]
for ID in sneak_peak_popularity:
  print(f"title: {allMovies[ID].get_title()}")
  print(f"number of ratings: {allMovies[ID].get_number_of_ratings()}")
  print(f"average rating: {allMovies[ID].calculate_average_rating()}\n")

# Baseline 2: rank by weighted rating; 20 is the minimum-votes threshold.
print("-----Weighted Popularity Model-----")
movies_recommended_weighted = weighted_rating_popularity_based_recommendation(allMovies, 20)
sneak_peak_weighted_popularity = movies_recommended_weighted[:10]
for ID in sneak_peak_weighted_popularity:
  print(f"title: {allMovies[ID].get_title()}")
  print(f"number of ratings: {allMovies[ID].get_number_of_ratings()}")
  print(f"average rating: {allMovies[ID].calculate_average_rating()}\n")

-----Popularity Model-----
title: Santosh Subramaniam (2008)
number of ratings: 1
average rating: 5.0

title: Screen Play (1992)
number of ratings: 1
average rating: 5.0

title: Zana (2019)
number of ratings: 1
average rating: 5.0

title: Punk the Capital: Building a Sound Movement
number of ratings: 1
average rating: 5.0

title: Kaithi (2019)
number of ratings: 2
average rating: 5.0

title: War (2019)
number of ratings: 1
average rating: 5.0

title: Rolli and the Golden Key (2013)
number of ratings: 1
average rating: 5.0

title: The Head (2004)
number of ratings: 1
average rating: 5.0

title: Everything Is Terrible! Presents: The Great Satan (2017)
number of ratings: 1
average rating: 5.0

title: The Fallen of World War II (2015)
number of ratings: 1
average rating: 5.0

-----Weighted Popularity Model-----
title: Planet Earth II (2016)
number of ratings: 1124
average rating: 4.483096085409253

title: Planet Earth (2006)
number of ratings: 1747
average rating: 4.464796794504865

title: