## **Movies Recommendation: Neighborhood-Based Collaborative filtering**

### **Import necessary library**

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import pairwise_distances
import math
from utilities import *
from collaborative_filtering import NeighborhoodCF, build_utility_matrix, find_nonzero_mean_ratings, normalize

In [2]:
# import pandas as pd

# part_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]

# # Read the first file
# data = pd.read_csv('resources/ratings/users_ratings_part1.csv', header=None)

# # Loop through other files
# for id in part_ids:
#     file_path = f'resources/ratings/users_ratings_part{id}.csv'
#     try:
#         tmp = pd.read_csv(file_path, on_bad_lines="skip", header=None)  # Skip problematic rows
#         data = pd.concat([data, tmp], ignore_index=True)
#     except Exception as e:
#         print(f"Error reading file {file_path}: {e}")

# data.columns =['UserID', 'MovieID', 'Rating']

data = pd.read_csv('resources/ratings/users_ratings_part1.csv', header=None)
data.columns =['UserID', 'MovieID', 'Rating']

In [3]:
data = data.sort_values(by=['UserID', 'MovieID'])
data = data.drop_duplicates()
data = data.reset_index()
data = data.drop(columns=['index'])

In [4]:
data.head()

Unnamed: 0,UserID,MovieID,Rating
0,9423,27977,8
1,9423,31381,9
2,9423,32138,10
3,9423,33467,10
4,9423,38650,10


In [5]:
data.tail()

Unnamed: 0,UserID,MovieID,Rating
54972,186180127,468569,10
54973,186180127,1255953,10
54974,186180127,9362722,10
54975,186180127,11329280,10
54976,186180127,15239678,10


In [6]:
data.shape

(54977, 3)

In [7]:
# Count the number of interactions for each movie
movie_counts =data['MovieID'].value_counts()

# Filter out movies appear < 10 times 
movies_to_keep = movie_counts[movie_counts >= 10].index
data = data[data['MovieID'].isin(movies_to_keep)]

# Count the number of interactions for each user
user_counts = data['UserID'].value_counts()

# Filter out users appear < 10 times 
users_to_keep = user_counts[user_counts >= 10].index 
data = data[data['UserID'].isin(users_to_keep)]

In [8]:
num_users = data.UserID.unique().shape[0]
num_movies = data.MovieID.unique().shape[0]
num_users, num_movies

(267, 910)

### **Split data**

In [9]:
def split_data(df, test_size=0.2, random_state=42):
    np.random.seed(random_state)
    
    # Preprocessing Step: Remove movies with frequency = 1
    movie_counts = df['MovieID'].value_counts()
    movies_to_keep = movie_counts[movie_counts > 1].index
    df = df[df['MovieID'].isin(movies_to_keep)].reset_index(drop=True)
    
    # Initialize sets to keep track of test indices
    test_indices = set()
    
    # Ensure each user has at least one entry in test set
    for user_id, group in df.groupby('UserID'):
        if len(group) >= 1:
            test_count = max(1, int(len(group) * test_size))
            test_sample = np.random.choice(group.index, size=test_count, replace=False)
            test_indices.update(test_sample)
    
    # Ensure each movie has at least one entry in test set
    for movie_id, group in df.groupby('MovieID'):
        # Find indices not already in test set
        remaining_indices = list(set(group.index) - test_indices)
        if remaining_indices:
            test_sample = np.random.choice(remaining_indices, size=1, replace=False)
            test_indices.update(test_sample)
    
    # Create test and train datasets
    test_df = df.loc[list(test_indices)].reset_index(drop=True)
    train_df = df.drop(list(test_indices)).reset_index(drop=True)
    
    # Optional: Verify that both train and test have the same users and movies
    common_users = set(train_df['UserID']).intersection(set(test_df['UserID']))
    common_movies = set(train_df['MovieID']).intersection(set(test_df['MovieID']))
    
    train_df = train_df[train_df['UserID'].isin(common_users) & train_df['MovieID'].isin(common_movies)].reset_index(drop=True)
    test_df = test_df[test_df['UserID'].isin(common_users) & test_df['MovieID'].isin(common_movies)].reset_index(drop=True)
    
    return train_df, test_df

# Example usage:
# Assuming your dataframe is named `df` and has columns 'UserID' and 'MovieID'
# train, test = split_data(df, test_size=0.2, random_state=42)


In [10]:
train_data, test_data = split_data(data)

In [11]:
train_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,9423,27977,8
1,9423,31381,9
2,9423,32138,10
3,9423,33467,10
4,9423,38650,10


In [12]:
train_data.tail()

Unnamed: 0,UserID,MovieID,Rating
28376,186180127,172495,10
28377,186180127,407887,10
28378,186180127,468569,10
28379,186180127,9362722,10
28380,186180127,15239678,10


In [13]:
num_users = train_data.UserID.unique().shape[0]
num_movies = train_data.MovieID.unique().shape[0]
num_users, num_movies

(267, 910)

In [14]:
test_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,106825954,365748,1
1,9423,53125,9
2,9423,52618,8
3,9423,53291,7
4,106825954,372784,8


In [15]:
test_data.tail()

Unnamed: 0,UserID,MovieID,Rating
8134,106825954,209144,9
8135,106825954,234215,7
8136,106825954,245429,9
8137,106825954,268380,8
8138,106825954,290334,7


In [16]:
num_users = test_data.UserID.unique().shape[0]
num_movies = test_data.MovieID.unique().shape[0]
num_users, num_movies

(267, 910)

In [17]:
movies = train_data.MovieID.drop_duplicates()
movies = pd.DataFrame(movies, columns=['MovieID'])
movies = movies.sort_values('MovieID', ascending=True)
movies = movies[['MovieID']]
movies = movies.reset_index()

In [18]:
users = train_data.UserID.drop_duplicates()
users = pd.DataFrame(users, columns=['UserID'])
users = users.sort_values('UserID', ascending=True)
users = users[['UserID']]
users = users.reset_index()

### **Build Utility Matrix**

In [19]:
utility_matrix = build_utility_matrix(train_data)
utility_matrix.shape

(267, 910)

## **User-User CF**

### **User-User CF with cosine similarity**

In [20]:
UUCF_cosine = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20)

In [21]:
cosine_uu_predicted_ratings, length = UUCF_cosine.predict_ratings()
length

267

In [22]:
recommendation = UUCF_cosine.recommend(id=9423, predicted_ratings=cosine_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend movies for user 9423 and predicted ratings:')
recommendation

Recommend movies for user 9423 and predicted ratings:


[(47478, 8), (60196, 8), (71315, 8), (110912, 8), (245429, 8)]

In [23]:
# UUCF_cosine.print_recommendation(predicted_ratings=cosine_uu_predicted_ratings, movies=movies, users=users)

### **User-User CF with pearson correlation**

In [24]:
UUCF_pearson = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20, cosine=False)

In [25]:
pearson_uu_predicted_ratings, length = UUCF_pearson.predict_ratings()
length

  c /= stddev[:, None]
  c /= stddev[None, :]


267

In [26]:
recommendation = UUCF_pearson.recommend(id=108786, predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend movies for user 108786 and predicted ratings:')
recommendation

Recommend movies for user 108786 and predicted ratings:


[(33467, 8), (34583, 8), (50083, 8), (68646, 8), (78748, 8)]

In [27]:
# UUCF_pearson.print_recommendation(predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users)

## **Item-Item CF** 

### **Item-Item CF with cosine similarity**

In [28]:
IICF_cosine = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20, uu_cf=False)

In [29]:
cosine_ii_predicted_ratings, length = IICF_cosine.predict_ratings()
length

910

In [30]:
recommendation = IICF_cosine.recommend(id=27977, predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend users for movie 27977 and predicted ratings:')
recommendation

Recommend users for movie 27977 and predicted ratings:


[(24619523, 10),
 (36672583, 10),
 (61541609, 10),
 (65718654, 10),
 (127969012, 10)]

In [31]:
# IICF_cosine.print_recommendation(predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users)

### **Item-Item CF with pearson correlation**

In [32]:
IICF_pearson = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20, uu_cf=False, cosine=False)

In [33]:
pearson_ii_predicted_ratings, length = IICF_pearson.predict_ratings()
length  

910

In [34]:
recommendation = IICF_pearson.recommend(id=27977, predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend users for movie 27977 and predicted ratings:')
recommendation

Recommend users for movie 27977 and predicted ratings:


[(24619523, 10),
 (36672583, 10),
 (61541609, 10),
 (65718654, 10),
 (127969012, 10)]

In [35]:
# IICF_pearson.print_recommendation(predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users)