# Use matrix factorization for recommender system
## Dependencies

In [9]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix

## Hyper parametter

In [10]:
K=60 # latent factors
lam=0.02 # regularization
learning_rate=0.001 # learning rate
max_iter=20 # max iterations
print_every=1 # print loss for each iteration
batch_size=1000 # batch size
tolerance=1e-6 # tolerance


## Create class MF

In [11]:
import numpy as np

class MFOptimized:
    def __init__(self, Y, K, lam=0.1, learning_rate=0.01, max_iter=100, print_every=10, tolerance=1e-6):
        """
        Initialize the MF model.
        Y: numpy array, shape (n_ratings, 3), each row [user_id, item_id, rating].
        K: Number of latent factors.
        lam: Regularization parameter.
        learning_rate: Learning rate for gradient descent.
        max_iter: Number of training iterations.
        print_every: Print loss every print_every iterations.
        tolerance: Tolerance for convergence based on change in loss.
        """
        self.Y = Y
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.tolerance = tolerance  # Tolerance for convergence

        # Initialize user and item dimensions
        self.n_users = int(np.max(Y[:, 0]) + 1)
        self.n_items = int(np.max(Y[:, 1]) + 1)

        # Initialize latent factors and biases
        self.H = np.random.normal(0, 0.1, (self.n_users, K)).astype(np.float32)  # Latent factors for users
        self.Q = np.random.normal(0, 0.1, (self.n_items, K)).astype(np.float32)  # Latent factors for items
        self.o = np.zeros(self.n_users, dtype=np.float32)  # Biases for users
        self.p = np.zeros(self.n_items, dtype=np.float32)  # Biases for items
        self.mu = np.mean(Y[:, 2])  # Global average rating

    def compute_loss(self):
        """
        Compute the loss based on the provided formula.
        """
        n_ratings = self.Y.shape[0]
        error_sum = 0
        regularization_sum = 0

        for n in range(n_ratings):
            u, i, r = int(self.Y[n, 0]), int(self.Y[n, 1]), self.Y[n, 2]
            pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])
            error = r - pred
            error_sum += error ** 2
            regularization_sum += np.sum(self.H[u] ** 2) + np.sum(self.Q[i] ** 2) + self.o[u] ** 2 + self.p[i] ** 2

        # Compute total loss
        loss = 0.5 * error_sum / n_ratings + 0.5 * self.lam * regularization_sum
        return loss

    def fit(self):
        """
        Train the model using stochastic gradient descent (SGD).
        """
        prev_loss = float('inf')

        for it in range(self.max_iter):
            np.random.shuffle(self.Y)  # Shuffle the data at the start of each epoch

            for n in range(self.Y.shape[0]):
                u, i, r = int(self.Y[n, 0]), int(self.Y[n, 1]), self.Y[n, 2]
                
                # Calculate prediction
                pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])

                # Calculate error
                error = r - pred

                # Update latent factors and biases
                self.H[u] += self.learning_rate * (error * self.Q[i] - self.lam * self.H[u])
                self.Q[i] += self.learning_rate * (error * self.H[u] - self.lam * self.Q[i])

                self.o[u] += self.learning_rate * (error - self.lam * self.o[u])
                self.p[i] += self.learning_rate * (error - self.lam * self.p[i])

            # Compute current loss and check for convergence
            loss = self.compute_loss()

            # Check if the change in loss is smaller than the tolerance
            if abs(prev_loss - loss) < self.tolerance:
                print(f"Convergence reached at iteration {it + 1}")
                break

            prev_loss = loss

            # Print loss every 'print_every' iterations
            if (it + 1) % self.print_every == 0:
                print(f"Iteration {it + 1}/{self.max_iter}, Loss: {loss:.4f}")

    def predict(self, u, i):
        """
        Predict the rating for a specific user-item pair.
        """
        u, i = int(u), int(i)
        pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])
        return np.clip(pred, 0, 5)

    def evaluate_rmse(self, test_data):
        """
        Compute RMSE on the test set.
        """
        n_tests = test_data.shape[0]
        squared_error = 0
        for n in range(n_tests):
            pred = self.predict(test_data[n, 0], test_data[n, 1])
            squared_error += (pred - test_data[n, 2]) ** 2
        rmse = np.sqrt(squared_error / n_tests)
        return rmse

## Test the results
RMSE

In [12]:
# Load data
data_path = '../data/ml-latest-small/ratings.csv'
data = pd.read_csv(data_path)
data = data.rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})
data = data[['user_id', 'movie_id', 'rating']]

# Split data into train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
rate_train = train_data.to_numpy()
rate_test = test_data.to_numpy()

# Adjust indices to be 0-based
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

# Train the model
mf = MFOptimized(rate_train, K, lam, learning_rate, max_iter, print_every, batch_size, tolerance)
mf.fit()

# Evaluate the model
rmse = mf.evaluate_rmse(rate_test)
print(f"\nOptimized MF, RMSE: {rmse:.4f}")

Iteration 1/20, Loss: 1166.3543
Iteration 2/20, Loss: 1166.4425
Iteration 3/20, Loss: 1166.5765
Iteration 4/20, Loss: 1166.5800
Iteration 5/20, Loss: 1166.7403
Iteration 6/20, Loss: 1166.8071
Iteration 7/20, Loss: 1166.8683
Iteration 8/20, Loss: 1166.8941
Iteration 9/20, Loss: 1166.8294
Iteration 10/20, Loss: 1166.9319
Iteration 11/20, Loss: 1166.9864
Iteration 12/20, Loss: 1167.0069
Iteration 13/20, Loss: 1167.0579
Iteration 14/20, Loss: 1167.0511
Iteration 15/20, Loss: 1167.1286
Iteration 16/20, Loss: 1167.1216
Iteration 17/20, Loss: 1167.1643
Iteration 18/20, Loss: 1167.2084
Iteration 19/20, Loss: 1167.2029
Iteration 20/20, Loss: 1167.2086

Optimized MF, RMSE: 0.9337


Generate a prediction for a user

In [13]:
def generate_predictions_for_user(model, user_id, n_items):
    predictions = [(user_id + 1, item + 1, model.predict(user_id, item)) for item in range(n_items)]
    return predictions

Make comparision of real ratings and model's predicted ratings for a user

In [14]:
def create_comparison_csv(model, user_id, data, n_items, output_path):
    user_ratings = data[data['user_id'] == user_id + 1][['user_id', 'movie_id', 'rating']]
    predictions = generate_predictions_for_user(model, user_id, n_items)
    predictions_df = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'predicted_rating'])
    predictions_df['predicted_rating'] = predictions_df['predicted_rating'].round(2)
    comparison_df = pd.merge(user_ratings, predictions_df, on=['user_id', 'movie_id'])
    comparison_df.to_csv(output_path, index=False)
    print(f"Comparison CSV saved to {output_path}")


Test it

In [17]:
user_id = 100

n_items = mf.n_items
predictions = generate_predictions_for_user(mf, user_id, n_items)
predictions_df = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'predicted_rating'])
predictions_output_path = f'./data/output/predicted_ratings_user_{user_id}.csv'
predictions_df.to_csv(predictions_output_path, index=False)
print(f'Predicted ratings for user {user_id + 1} saved to {predictions_output_path}')

comparison_output_path = f'./data/output/rating_comparison_user_{user_id}.csv'
create_comparison_csv(mf, user_id, data, n_items, comparison_output_path)

Predicted ratings for user 101 saved to ./data/output/predicted_ratings_user_100.csv
Comparison CSV saved to ./data/output/rating_comparison_user_100.csv
