# Use matrix factorization for recommender system
## Dependencies

In [3]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix

## Hyper parametter

In [17]:
K=60 # latent factors
lam=0.02 # regularization
learning_rate=0.001 # learning rate
max_iter=20 # max iterations
print_every=1 # print loss for each iteration
batch_size=1000 # batch size
tolerance=1e-6 # tolerance


## Create class MF

In [12]:
import numpy as np

class MFOptimized:
    def __init__(self, Y, K, lam=0.1, learning_rate=0.01, max_iter=100, print_every=10, batch_size=1000, tolerance=1e-6):
        """
        Initialize the MF model.
        Y: numpy array, shape (n_ratings, 3), each row [user_id, item_id, rating].
        K: Number of latent factors.
        lam: Regularization parameter.
        learning_rate: Learning rate for gradient descent.
        max_iter: Number of training iterations.
        print_every: Print loss every `print_every` iterations.
        batch_size: Size of mini-batches for SGD.
        tolerance: Tolerance for convergence based on change in loss.
        """
        self.Y = Y
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.batch_size = batch_size
        self.tolerance = tolerance  # Tolerance for convergence

        # Initialize user and item dimensions
        self.n_users = int(np.max(Y[:, 0]) + 1)
        self.n_items = int(np.max(Y[:, 1]) + 1)

        # Initialize latent factors and biases
        self.H = np.random.normal(0, 0.1, (self.n_users, K)).astype(np.float32)  # Latent factors for users
        self.Q = np.random.normal(0, 0.1, (self.n_items, K)).astype(np.float32)  # Latent factors for items
        self.o = np.zeros(self.n_users, dtype=np.float32)  # Biases for users
        self.p = np.zeros(self.n_items, dtype=np.float32)  # Biases for items
        self.mu = np.mean(Y[:, 2])  # Global average rating

    def compute_loss(self):
        """
        Compute the loss based on the provided formula.
        """
        n_ratings = self.Y.shape[0]
        indices = np.random.choice(n_ratings, self.batch_size, replace=False)
        batch = self.Y[indices]

        user_ids = batch[:, 0].astype(int)
        item_ids = batch[:, 1].astype(int)
        ratings = batch[:, 2]

        # Calculate predictions
        pred = self.o[user_ids] + self.p[item_ids] + self.mu + np.sum(self.H[user_ids] * self.Q[item_ids], axis=1)
        
        error = pred - ratings
        loss = 0.5 * np.mean(error ** 2)

        # Add regularization terms
        loss += 0.5 * self.lam * (np.sum(self.H ** 2) + np.sum(self.Q ** 2))
        loss += 0.5 * self.lam * np.sum(self.o ** 2)
        loss += 0.5 * self.lam * np.sum(self.p ** 2)
        return loss

    def fit(self):
        """
        Train the model using mini-batch stochastic gradient descent (SGD).
        """
        prev_loss = float('inf')
        
        for it in range(self.max_iter):
            n_ratings = self.Y.shape[0]
            indices = np.arange(n_ratings)
            np.random.shuffle(indices)

            for i in range(0, n_ratings, self.batch_size):
                batch_indices = indices[i:i + self.batch_size]
                batch = self.Y[batch_indices]

                user_ids = batch[:, 0].astype(int)
                item_ids = batch[:, 1].astype(int)
                ratings = batch[:, 2]

                # Compute predictions
                pred = self.o[user_ids] + self.p[item_ids] + self.mu + np.sum(self.H[user_ids] * self.Q[item_ids], axis=1)
                error = pred - ratings

                # Initialize gradients
                grad_H = np.zeros_like(self.H)
                grad_Q = np.zeros_like(self.Q)
                grad_o = np.zeros_like(self.o)
                grad_p = np.zeros_like(self.p)

                # Compute gradients for each user-item pair in the batch
                for u, i, r in zip(user_ids, item_ids, ratings):
                    error_term = error[user_ids == u][0]  # The error term for this specific user-item pair
                    
                    # Update gradients for user and item latent factors
                    grad_H[u] += error_term * self.Q[i]
                    grad_Q[i] += error_term * self.H[u]
                    
                    # Update gradients for user and item biases
                    grad_o[u] += error_term
                    grad_p[i] += error_term

                # Apply regularization to gradients
                grad_H += self.lam * self.H
                grad_Q += self.lam * self.Q
                grad_o += self.lam * self.o
                grad_p += self.lam * self.p

                # Update parameters using np.add.at to handle duplicates in mini-batches
                np.add.at(self.H, user_ids, -self.learning_rate * grad_H[user_ids])
                np.add.at(self.Q, item_ids, -self.learning_rate * grad_Q[item_ids])
                np.add.at(self.o, user_ids, -self.learning_rate * grad_o[user_ids])
                np.add.at(self.p, item_ids, -self.learning_rate * grad_p[item_ids])

            # Compute current loss and check for convergence
            loss = self.compute_loss()

            # Check if the change in loss is smaller than the tolerance
            if abs(prev_loss - loss) < self.tolerance:
                print(f"Convergence reached at iteration {it + 1}")
                break

            prev_loss = loss

            # Print loss every 'print_every' iterations
            if (it + 1) % self.print_every == 0:
                print(f"Iteration {it + 1}/{self.max_iter}, Loss: {loss:.4f}")

    def predict(self, u, i):
        """
        Predict the rating for a specific user-item pair.
        """
        u, i = int(u), int(i)
        pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])
        return np.clip(pred, 0, 5)

    def evaluate_rmse(self, test_data):
        """
        Compute RMSE on the test set.
        """
        n_tests = test_data.shape[0]
        squared_error = 0
        for n in range(n_tests):
            pred = self.predict(test_data[n, 0], test_data[n, 1])
            squared_error += (pred - test_data[n, 2]) ** 2
        rmse = np.sqrt(squared_error / n_tests)
        return rmse


## Test the results
RMSE

In [None]:
# Load data
data_path = '../data/ml-latest-small/ratings.csv'
data = pd.read_csv(data_path)
data = data.rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})
data = data[['user_id', 'movie_id', 'rating']]

# Split data into train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
rate_train = train_data.to_numpy()
rate_test = test_data.to_numpy()

# Adjust indices to be 0-based
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

# Train the model
mf = MFOptimized(rate_train, K, lam, learning_rate, max_iter, print_every, batch_size, tolerance)
mf.fit()

# Evaluate the model
rmse = mf.evaluate_rmse(rate_test)
print(f"\nOptimized MF, RMSE: {rmse:.4f}")

Iteration 1/20, Loss: 1166.2212
Iteration 2/20, Loss: 1166.2885
Iteration 3/20, Loss: 1166.4358
Iteration 4/20, Loss: 1166.4878
Iteration 5/20, Loss: 1166.5428
Iteration 6/20, Loss: 1166.6224
Iteration 7/20, Loss: 1166.7377
Iteration 8/20, Loss: 1166.6724
Iteration 9/20, Loss: 1166.7292
Iteration 10/20, Loss: 1166.7745
Iteration 11/20, Loss: 1166.7675
Iteration 12/20, Loss: 1166.8196
Iteration 13/20, Loss: 1166.8366
Iteration 14/20, Loss: 1166.9293
Iteration 15/20, Loss: 1166.9045
Iteration 16/20, Loss: 1166.9760
Iteration 17/20, Loss: 1166.9934
Iteration 18/20, Loss: 1167.1477
Iteration 19/20, Loss: 1167.0711
Iteration 20/20, Loss: 1167.1373

Optimized MF, RMSE: 0.9415


Generate a prediction for a user

In [18]:
def generate_predictions_for_user(model, user_id, n_items):
    predictions = [(user_id + 1, item + 1, model.predict(user_id, item)) for item in range(n_items)]
    return predictions

Make comparision of real ratings and model's predicted ratings for a user

In [19]:
def create_comparison_csv(model, user_id, data, n_items, output_path):
    user_ratings = data[data['user_id'] == user_id + 1][['user_id', 'movie_id', 'rating']]
    predictions = generate_predictions_for_user(model, user_id, n_items)
    predictions_df = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'predicted_rating'])
    comparison_df = pd.merge(user_ratings, predictions_df, on=['user_id', 'movie_id'])
    comparison_df.to_csv(output_path, index=False)
    print(f"Comparison CSV saved to {output_path}")

Test it

In [20]:
user_id = 0
n_items = mf.n_items
predictions = generate_predictions_for_user(mf, user_id, n_items)
predictions_df = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'predicted_rating'])
predictions_output_path = f'./data/output/predicted_ratings_user_{user_id}.csv'
predictions_df.to_csv(predictions_output_path, index=False)
print(f'Predicted ratings for user {user_id + 1} saved to {predictions_output_path}')

comparison_output_path = f'./data/output/rating_comparison_user_{user_id}.csv'
create_comparison_csv(mf, user_id, data, n_items, comparison_output_path)

Predicted ratings for user 1 saved to ./data/output/predicted_ratings_user_0.csv
Comparison CSV saved to ./data/output/rating_comparison_user_0.csv
