# Netflix Collaborative Recommender System

Using item-item collaborative filtering method it returns the rating of any user for any movie which they have not watched yet and based on the predicted rating number we can take action of wheater to recommend or not.

It consists of three main modules.

* Dataset: A module for loading the required .csv data and merging them.

* Algorithm: A module for implementing recommendation algorithm.

* Evaluation: Testing it for random users (movies).

In [30]:
pip install gdown

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Loading the required libraries

In [28]:
import math
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
# Importing install the package 'gdown' using the 'conda' package manager.
# !conda install -y gdown

In [32]:
# Download files from Google Drive using gdown

!gdown --id 1hxRFl0Z9QFvW6ecL5olvn8SLmA09d8aq
!gdown --id 1J2njpXmp8twgx_9LLqcNzp_PRX6IZbq5

Downloading...
From (uriginal): https://drive.google.com/uc?id=1hxRFl0Z9QFvW6ecL5olvn8SLmA09d8aq
From (redirected): https://drive.google.com/uc?id=1hxRFl0Z9QFvW6ecL5olvn8SLmA09d8aq&confirm=t&uuid=cf346163-30a2-4b8a-b9d7-137387db9bba
To: c:\Users\DELL\Downloads\Netflix_Dataset_Rating.csv

  0%|          | 0.00/249M [00:00<?, ?B/s]
  0%|          | 524k/249M [00:00<01:08, 3.64MB/s]
  1%|          | 1.57M/249M [00:00<00:37, 6.66MB/s]
  1%|          | 2.62M/249M [00:00<00:30, 8.01MB/s]
  1%|▏         | 3.67M/249M [00:00<00:28, 8.54MB/s]
  2%|▏         | 4.72M/249M [00:00<00:27, 8.90MB/s]
  2%|▏         | 5.77M/249M [00:00<00:27, 8.95MB/s]
  3%|▎         | 6.82M/249M [00:00<00:25, 9.36MB/s]
  3%|▎         | 7.86M/249M [00:00<00:25, 9.37MB/s]
  4%|▎         | 8.91M/249M [00:01<00:25, 9.46MB/s]
  4%|▍         | 9.96M/249M [00:01<00:24, 9.57MB/s]
  4%|▍         | 11.0M/249M [00:01<00:24, 9.54MB/s]
  5%|▍         | 12.1M/249M [00:01<00:24, 9.60MB/s]
  5%|▌         | 13.1M/249M [00:01<00:24, 9.5

In [17]:
df_rating = pd.read_csv('c:/Users/DELL/Downloads/Netflix_Dataset_Rating.csv')
len(df_rating)

17337458

In [19]:
df_movie = pd.read_csv('c:/Users/DELL/Downloads/Netflix_Dataset_Movie.csv')
len(df_movie)

17770

In [20]:
# Merging into one dataframe for building a Recommendation System
merged_df = pd.merge(df_rating, df_movie, on='Movie_ID')
merged_df.head()

Unnamed: 0,User_ID,Rating,Movie_ID,Year,Name
0,712664,5,3,1997,Character
1,1331154,4,3,1997,Character
2,2632461,3,3,1997,Character
3,44937,5,3,1997,Character
4,656399,4,3,1997,Character


In [21]:
merged_df.shape

(17337458, 5)

In [22]:
# Creating a user-item utility matrix 
utility_matrix = merged_df.pivot_table(index='User_ID', columns='Movie_ID', values='Rating')

# Total number of unique users and 
print(utility_matrix.shape)

(143458, 1350)


In [23]:
utility_matrix

Movie_ID,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,3.0,,,...,3.0,,,,,,,,,
7,,5.0,,,,,4.0,5.0,,,...,3.0,,,5.0,,,,,,
79,,,,,,,,3.0,,,...,4.0,,,,,,4.0,,,
97,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,,5.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649370,,,,,,,,,,,...,,,,,,,,,,
2649378,,,,,,,3.0,3.0,,,...,,,,,,,,,,
2649388,,,,,,,,3.0,,,...,3.0,,,3.0,,3.0,,,,
2649426,,,,4.0,,,4.0,4.0,,,...,,,,,,,,,,


In [24]:
# Imputing the null entries by 0 for ease in calculations
utility_matrix_filled = utility_matrix.fillna(0)
utility_matrix_filled

Movie_ID,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,5.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,...,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649378,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2649388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,3.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0
2649426,0.0,0.0,0.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# It returns pairwise cosine similarity scores 
def calculate_movie_similarity(matrix, movie_id):
    if movie_id not in matrix.columns:
        return f"Movie ID {movie_id} not found in the utility matrix."
    
    movie_column = matrix[movie_id]
    similarity_scores = cosine_similarity([movie_column.values], matrix.T.values)
    similarity_scores = similarity_scores[0]
    
    similarity_dict = {}
    for i, score in enumerate(similarity_scores):
        if i == movie_column.name or score == 0:
            continue
        similarity_dict[matrix.columns[i]] = round(score, 6)
    
    return similarity_dict

# Example usage
movie_id = 3
similarity_scores = calculate_movie_similarity(utility_matrix_filled, movie_id)
print(similarity_scores)

{3: 1.0, 8: 0.051845, 16: 0.019816, 18: 0.07973, 26: 0.018302, 28: 0.03405, 30: 0.073136, 32: 0.009928, 33: 0.013621, 44: 0.056888, 45: 0.024461, 46: 0.018762, 47: 0.067278, 48: 0.013927, 52: 0.070662, 55: 0.025136, 56: 0.092463, 57: 0.088239, 58: 0.020741, 68: 0.010233, 76: 0.022929, 77: 0.023557, 78: 0.016215, 79: 0.078845, 83: 0.0744, 84: 0.01698, 97: 0.16269, 104: 0.025748, 108: 0.061476, 110: 0.080922, 111: 0.022162, 118: 0.026729, 122: 0.02201, 127: 0.023314, 133: 0.007406, 138: 0.013472, 143: 0.060706, 148: 0.023748, 152: 0.032593, 156: 0.093591, 165: 0.030643, 166: 0.046179, 167: 0.112365, 171: 0.044105, 173: 0.026196, 175: 0.080787, 178: 0.045113, 180: 0.039852, 181: 0.031737, 185: 0.015357, 187: 0.039078, 188: 0.016262, 189: 0.026927, 191: 0.051105, 197: 0.038376, 199: 0.095107, 201: 0.052175, 208: 0.017178, 209: 0.012365, 213: 0.089426, 215: 0.015737, 216: 0.032572, 223: 0.028885, 225: 0.008545, 232: 0.026415, 238: 0.023404, 239: 0.007957, 240: 0.053707, 241: 0.096902, 242: 

In [27]:

def calculate_weighted_rating(matrix, user_id, movie_id, n):
    if movie_id not in matrix.columns:
        return f"Movie ID {movie_id} not found in the utility matrix."
    
    rating = matrix.loc[user_id, movie_id]
    
    if rating == 0:
        similarity_scores = calculate_movie_similarity(matrix, movie_id)
        top_movies = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)[:n]
        
        p_values = [score for _, score in top_movies]
        a_values = [movie for movie, _ in top_movies]
        
        valid_movies = all(movie in matrix.columns for movie in a_values)
        if valid_movies:
            r = sum(p * matrix.loc[user_id, a] for p, a in zip(p_values, a_values)) / sum(p_values)
            
            matrix.loc[user_id, movie_id] = r
            return math.ceil(r)
        else:
            return "Unable to calculate weighted rating due to missing movie data."
    
    else:
        return rating

# Example usage
user_id = 1427
movie_id = 83
N = 4
weighted_rating = calculate_weighted_rating(utility_matrix_filled, user_id, movie_id, N)

if isinstance(weighted_rating, str):
    print(weighted_rating)
else:
    print(f"The user with User_ID = {user_id} would rate Movie_ID = {movie_id} as {math.ceil(weighted_rating)}.")
        
'''Here based on the user 1427's ratings on the set of movies they have already watched,
 this would be rated as 1 on the scale of 1 to 5 () by them as per this recommendation algo'''

The user with User_ID = 1427 would rate Movie_ID = 83 as 3.


"Here based on the user 1427's ratings on the set of movies they have already watched,\n this would be rated as 1 on the scale of 1 to 5 () by them as per this recommendation algo"