## **Movies Recommendation: Matrix Factorized Collaborative Filtering**

### **Import necessary library**

In [1]:
import pandas as pd 
import numpy as np 
import math
from collaborative_filtering import MatrixFactorizationCF, build_utility_matrix


In [2]:
import pandas as pd

part_ids = range(2, 11)

# Read the first file
data = pd.read_csv('resources/data/split_ratings/ratings_part_1.txt', header=None)

# Loop through other files
for id in part_ids:
    file_path = f'resources/data/split_ratings/ratings_part_{id}.txt'
    try:
        tmp = pd.read_csv(file_path, on_bad_lines="skip", header=None)  # Skip problematic rows
        data = pd.concat([data, tmp], ignore_index=True)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

data.columns =['UserID', 'MovieID', 'Rating']

# data = pd.read_csv('resources/data/split_ratings/ratings_part_1.txt', header=None)
# data.columns =['UserID', 'MovieID', 'Rating']

In [3]:
data = data.sort_values(by=['UserID', 'MovieID'])
data = data.drop_duplicates()
data = data.reset_index()
data = data.drop(columns=['index'])

In [5]:
data.head()

Unnamed: 0,UserID,MovieID,Rating
0,66,68646,10
1,66,71562,10
2,66,76759,10
3,66,80684,10
4,66,88763,8


In [6]:
data.tail()

Unnamed: 0,UserID,MovieID,Rating
4681300,190007800,8115900,7
4681301,190007800,10872600,10
4681302,190007800,12412888,9
4681303,190007800,19623240,1
4681304,190007800,21235248,8


In [7]:
# Count the number of interactions for each movie
movie_counts =data['MovieID'].value_counts()

# Filter out movies appear < 10 times 
movies_to_keep = movie_counts[movie_counts >= 10].index
data = data[data['MovieID'].isin(movies_to_keep)]

# Count the number of interactions for each user
user_counts = data['UserID'].value_counts()

# Filter out users appear < 10 times 
users_to_keep = user_counts[user_counts >= 10].index 
data = data[data['UserID'].isin(users_to_keep)]

In [8]:
num_users = data.UserID.unique().shape[0]
num_movies = data.MovieID.unique().shape[0]
num_users, num_movies

(11386, 9816)

### **Split data**

In [10]:
def split_data(df, test_size=0.1, random_state=42):
    np.random.seed(random_state)
    
    # Preprocessing Step: Remove movies with frequency = 1
    movie_counts = df['MovieID'].value_counts()
    movies_to_keep = movie_counts[movie_counts > 1].index
    df = df[df['MovieID'].isin(movies_to_keep)].reset_index(drop=True)
    
    # Initialize sets to keep track of test indices
    test_indices = set()
    
    # Ensure each user has at least one entry in test set
    for user_id, group in df.groupby('UserID'):
        if len(group) >= 1:
            test_count = max(1, int(len(group) * test_size))
            test_sample = np.random.choice(group.index, size=test_count, replace=False)
            test_indices.update(test_sample)
    
    # Ensure each movie has at least one entry in test set
    for movie_id, group in df.groupby('MovieID'):
        # Find indices not already in test set
        remaining_indices = list(set(group.index) - test_indices)
        if remaining_indices:
            test_sample = np.random.choice(remaining_indices, size=1, replace=False)
            test_indices.update(test_sample)
    
    # Create test and train datasets
    test_df = df.loc[list(test_indices)].reset_index(drop=True)
    train_df = df.drop(list(test_indices)).reset_index(drop=True)
    
    # Optional: Verify that both train and test have the same users and movies
    common_users = set(train_df['UserID']).intersection(set(test_df['UserID']))
    common_movies = set(train_df['MovieID']).intersection(set(test_df['MovieID']))
    
    train_df = train_df[train_df['UserID'].isin(common_users) & train_df['MovieID'].isin(common_movies)].reset_index(drop=True)
    test_df = test_df[test_df['UserID'].isin(common_users) & test_df['MovieID'].isin(common_movies)].reset_index(drop=True)
    
    return train_df, test_df

# Example usage:
# Assuming your dataframe is named `df` and has columns 'UserID' and 'MovieID'
# train, test = split_data(df, test_size=0.2, random_state=42)


In [11]:
train_data, test_data = split_data(data)

In [12]:
train_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,66,68646,10
1,66,71562,10
2,66,76759,10
3,66,80684,10
4,66,88763,8


In [13]:
train_data.tail()

Unnamed: 0,UserID,MovieID,Rating
4207355,190007800,8115900,7
4207356,190007800,10872600,10
4207357,190007800,12412888,9
4207358,190007800,19623240,1
4207359,190007800,21235248,8


In [14]:
num_users = train_data.UserID.unique().shape[0]
num_movies = train_data.MovieID.unique().shape[0]
num_users, num_movies

(11386, 9816)

In [15]:
test_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,5897445,8740790,6
1,66,120689,10
2,23655320,6499752,7
3,23655320,6628394,8
4,23655320,6663582,6


In [16]:
test_data.tail()

Unnamed: 0,UserID,MovieID,Rating
473401,44631882,42876,10
473402,87777106,4972582,10
473403,5897445,7653254,8
473404,87777106,6146586,9
473405,87777106,6802308,3


In [17]:
num_users = test_data.UserID.unique().shape[0]
num_movies = test_data.MovieID.unique().shape[0]
num_users, num_movies

(11386, 9816)

In [18]:
movies = train_data.MovieID.drop_duplicates()
movies = pd.DataFrame(movies, columns=['MovieID'])
movies = movies.sort_values('MovieID', ascending=True)
movies = movies[['MovieID']]
movies = movies.reset_index()

In [19]:
users = train_data.UserID.drop_duplicates()
users = pd.DataFrame(users, columns=['UserID'])
users = users.sort_values('UserID', ascending=True)
users = users[['UserID']]
users = users.reset_index()

### **Build Utility Matrix**

In [20]:
utility_matrix = build_utility_matrix(train_data)
utility_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [8., 8., 7., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 6., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## **User-based MF**

In [21]:
UUMF = MatrixFactorizationCF(R=utility_matrix, K=45, learning_rate=0.005, epochs=50, regularization=0.1)
UUMF.train()

Epoch: 1 - RMSE: 1.7373
Epoch: 2 - RMSE: 1.6575
Epoch: 3 - RMSE: 1.6456
Epoch: 4 - RMSE: 1.6339
Epoch: 5 - RMSE: 1.6188
Epoch: 6 - RMSE: 1.6100
Epoch: 7 - RMSE: 1.6032
Epoch: 8 - RMSE: 1.5961
Epoch: 9 - RMSE: 1.5899
Epoch: 10 - RMSE: 1.5849
Epoch: 11 - RMSE: 1.5809
Epoch: 12 - RMSE: 1.5774
Epoch: 13 - RMSE: 1.5744
Epoch: 14 - RMSE: 1.5717
Epoch: 15 - RMSE: 1.5694
Epoch: 16 - RMSE: 1.5672
Epoch: 17 - RMSE: 1.5652
Epoch: 18 - RMSE: 1.5633
Epoch: 19 - RMSE: 1.5616
Epoch: 20 - RMSE: 1.5599
Epoch: 21 - RMSE: 1.5583
Epoch: 22 - RMSE: 1.5568
Epoch: 23 - RMSE: 1.5554
Epoch: 24 - RMSE: 1.5540
Epoch: 25 - RMSE: 1.5527
Epoch: 26 - RMSE: 1.5514
Epoch: 27 - RMSE: 1.5503
Epoch: 28 - RMSE: 1.5491
Epoch: 29 - RMSE: 1.5480
Epoch: 30 - RMSE: 1.5470
Epoch: 31 - RMSE: 1.5460
Epoch: 32 - RMSE: 1.5450
Epoch: 33 - RMSE: 1.5441
Epoch: 34 - RMSE: 1.5432
Epoch: 35 - RMSE: 1.5424
Epoch: 36 - RMSE: 1.5416
Epoch: 37 - RMSE: 1.5408
Epoch: 38 - RMSE: 1.5401
Epoch: 39 - RMSE: 1.5394
Epoch: 40 - RMSE: 1.5387
Epoch: 41

KeyboardInterrupt: 

In [20]:
predicted_R = UUMF.full_prediction()

In [None]:
recommendation = UUMF.recommend(id=9423, predicted_R=predicted_R, movies=movies, users=users, top_n=5)
print('Recommend movies for user 9423 and predicted ratings:')
recommendation

In [22]:
# UUMF.print_recommendation(predicted_R=predicted_R, movies=movies, users=users)

## **Item-based MF**

In [None]:
IIMF = MatrixFactorizationCF(R=utility_matrix, K=45, learning_rate=0.005, epochs=50, regularization=0.1, uu_mf=False)
IIMF.train()

In [24]:
predicted_R = IIMF.full_prediction()

In [None]:
predicted_R[predicted_R > 10]

In [None]:
recommendation = IIMF.recommend(id=27977, predicted_R=predicted_R, movies=movies, users=users, top_n=5)
print('Recommend users for movie 27977 and predicted ratings:')
recommendation

In [27]:
# IIMF.print_recommendation(predicted_R=predicted_R, movies=movies, users=users)