## **Movies Recommendation: Neighborhood-Based Collaborative filtering**

### **Import necessary library**

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import pairwise_distances
import math
from utilities import *
from neighborhood_based_cf import NeighborhoodCF, build_utility_matrix, find_nonzero_mean_ratings, normalize

In [2]:
# import pandas as pd

# part_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]

# # Read the first file
# data = pd.read_csv('resources/ratings/users_ratings_part1.csv', header=None)

# # Loop through other files
# for id in part_ids:
#     file_path = f'resources/ratings/users_ratings_part{id}.csv'
#     try:
#         tmp = pd.read_csv(file_path, on_bad_lines="skip", header=None)  # Skip problematic rows
#         data = pd.concat([data, tmp], ignore_index=True)
#     except Exception as e:
#         print(f"Error reading file {file_path}: {e}")

# data.columns =['UserID', 'MovieID', 'Rating']

data = pd.read_csv('resources/ratings/users_ratings_part1.csv', header=None)
data.columns =['UserID', 'MovieID', 'Rating']

In [3]:
data = data.sort_values(by=['UserID', 'MovieID'])
data = data.drop_duplicates()
data = data.reset_index()
data = data.drop(columns=['index'])

In [4]:
data.head()

Unnamed: 0,UserID,MovieID,Rating
0,9423,27977,8
1,9423,31381,9
2,9423,32138,10
3,9423,33467,10
4,9423,38650,10


In [5]:
data.tail()

Unnamed: 0,UserID,MovieID,Rating
54972,186180127,468569,10
54973,186180127,1255953,10
54974,186180127,9362722,10
54975,186180127,11329280,10
54976,186180127,15239678,10


In [6]:
data.shape

(54977, 3)

### **Split data**

In [7]:
def split_data(df, test_size=0.2, random_state=42): 
    np.random.seed(random_state)
    test_indices = []

    # Group by UserID to ensure each user has data in both train and test sets 
    for user_id, group in df.groupby('UserID'): 
        group_indices = group.index.tolist()
        test_count = max(1, int(len(group_indices) * test_size))
        test_indices.extend(np.random.choice(group_indices, size=test_count, replace=False))
    
    test_df = df.loc[test_indices].reset_index(drop=True)
    train_df = df.drop(test_indices).reset_index(drop=True)

    return train_df, test_df

In [8]:
train_data, test_data = split_data(data)

In [9]:
train_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,9423,27977,8
1,9423,31381,9
2,9423,32138,10
3,9423,33467,10
4,9423,38650,10


In [10]:
train_data.tail()

Unnamed: 0,UserID,MovieID,Rating
44043,186180127,407887,10
44044,186180127,468569,10
44045,186180127,9362722,10
44046,186180127,11329280,10
44047,186180127,15239678,10


In [11]:
train_data.shape

(44048, 3)

In [12]:
test_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,9423,118884,9
1,9423,50083,9
2,9423,105236,7
3,9423,91763,8
4,9423,110475,7


In [13]:
test_data.tail()

Unnamed: 0,UserID,MovieID,Rating
10924,186028499,1392214,10
10925,186180127,172495,10
10926,186180127,103064,10
10927,186180127,1255953,10
10928,186180127,21749,10


In [14]:
test_data.shape

(10929, 3)

In [15]:
movies = train_data.MovieID.drop_duplicates()
movies = pd.DataFrame(movies, columns=['MovieID'])
movies = movies.sort_values('MovieID', ascending=True)
movies = movies[['MovieID']]
movies = movies.reset_index()

In [16]:
movies

Unnamed: 0,index,MovieID
0,28251,417
1,32974,439
2,28252,1527
3,35244,4972
4,32036,8975
...,...,...
9591,35700,32330860
9592,18463,32368801
9593,43890,32375562
9594,27643,32832648


In [17]:
users = train_data.UserID.drop_duplicates()
users = pd.DataFrame(users, columns=['UserID'])
users = users.sort_values('UserID', ascending=True)
users = users[['UserID']]
users = users.reset_index()

In [18]:
users

Unnamed: 0,index,UserID
0,0,9423
1,200,108786
2,400,177737
3,600,183617
4,800,205065
...,...,...
327,43892,180784797
328,43895,181295995
329,43953,185494454
330,44021,186028499


### **Build Utility Matrix**

In [19]:
utility_matrix = build_utility_matrix(train_data)
utility_matrix.shape

(332, 9596)

## **User-User CF**

### **User-User CF with cosine similarity**

In [20]:
UUCF_cosine = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [21]:
cosine_uu_predicted_ratings = UUCF_cosine.predict_ratings()

In [22]:
recommendation = UUCF_cosine.recommend(id=9423, predicted_ratings=cosine_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend movies for user 1 and predicted ratings:')
recommendation

Recommend movies for user 1 and predicted ratings:


[(47478, 8), (50083, 8), (53125, 8), (60196, 8), (62622, 8)]

In [23]:
# UUCF_cosine.print_recommendation(predicted_ratings=cosine_uu_predicted_ratings, movies=movies, users=users)

### **User-User CF with pearson correlation**

In [24]:
UUCF_pearson = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20, cosine=False)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [25]:
pearson_uu_predicted_ratings = UUCF_pearson.predict_ratings()

In [26]:
recommendation = UUCF_pearson.recommend(id=108786, predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend movies for user 1 and predicted ratings:')
recommendation

Recommend movies for user 1 and predicted ratings:


[(33467, 8), (50083, 8), (71562, 8), (78748, 8), (99685, 8)]

In [27]:
# UUCF_pearson.print_recommendation(predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users)

## **Item-Item CF** 

### **Item-Item CF with cosine similarity**

In [28]:
IICF_cosine = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20, uu_cf=False)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [29]:
cosine_ii_predicted_ratings = IICF_cosine.predict_ratings()

In [30]:
recommendation = IICF_cosine.recommend(id=417, predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend users for movie 1 and predicted ratings:')
recommendation

Recommend users for movie 1 and predicted ratings:


[(139318840, 10),
 (139602492, 10),
 (147870463, 10),
 (171343739, 10),
 (186028499, 10)]

In [31]:
# IICF_cosine.print_recommendation(predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users)

### **Item-Item CF with pearson correlation**

In [32]:
IICF_pearson = NeighborhoodCF(utility_matrix=utility_matrix, k_neighbors=20, uu_cf=False, cosine=False)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [33]:
pearson_ii_predicted_ratings = IICF_pearson.predict_ratings()

In [34]:
recommendation = IICF_pearson.recommend(id=417, predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users, top_n=5)
print('Recommend users for movie 1 and predicted ratings:')
recommendation

Recommend users for movie 1 and predicted ratings:


[(139318840, 10),
 (139602492, 10),
 (147870463, 10),
 (171343739, 10),
 (186028499, 10)]

In [35]:
# IICF_pearson.print_recommendation(predicted_ratings=pearson_uu_predicted_ratings, movies=movies, users=users)