# Use matrix factorization for recommender system
## Dependencies

In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split

## Create class MF

In [9]:
import numpy as np
from scipy.sparse import csr_matrix

class MFOptimized(object):
    def __init__(self, Y, K, lam=0.1, learning_rate=0.01, max_iter=100, print_every=10, batch_size=1000):
        self.Y = Y  # rating data: user_id, item_id, rating
        self.K = K  # latent dimension
        self.lam = lam  # regularization parameter
        self.learning_rate = learning_rate  # learning rate
        self.max_iter = max_iter  # number of iterations
        self.print_every = print_every  # print loss every few iterations
        self.batch_size = batch_size  # batch size for processing
        
        # Initialize dimensions
        self.n_users = int(np.max(Y[:, 0]) + 1)
        self.n_items = int(np.max(Y[:, 1]) + 1)
        
        # Initialize parameters (float32 for memory efficiency)
        self.X = np.random.randn(self.n_items, K).astype(np.float32)  # item latent factors
        self.W = np.random.randn(K, self.n_users).astype(np.float32)  # user latent factors
        self.b = np.zeros(self.n_items, dtype=np.float32)  # item bias
        self.d = np.zeros(self.n_users, dtype=np.float32)  # user bias
    
    def loss(self):
        """
        Compute the loss function with regularization
        """
        n_ratings = self.Y.shape[0]
        indices = np.random.choice(n_ratings, self.batch_size, replace=False)
        batch = self.Y[indices]
        
        user_ids = batch[:, 0].astype(int)
        item_ids = batch[:, 1].astype(int)
        ratings = batch[:, 2]
        
        pred = self.X[item_ids].dot(self.W[:, user_ids]) + self.b[item_ids][:, None] + self.d[user_ids]
        error = pred - ratings[:, None]
        loss = 0.5 * np.mean(error ** 2)
        
        # Add regularization
        loss += 0.5 * self.lam * (np.sum(self.X ** 2) + np.sum(self.W ** 2))
        return loss
    
    def fit(self):
        """
        Train the MF model using mini-batch SGD
        """
        for it in range(self.max_iter):
            n_ratings = self.Y.shape[0]
            indices = np.arange(n_ratings)
            np.random.shuffle(indices)
            
            for i in range(0, n_ratings, self.batch_size):
                batch_indices = indices[i:i + self.batch_size]
                batch = self.Y[batch_indices]
                
                user_ids = batch[:, 0].astype(int)
                item_ids = batch[:, 1].astype(int)
                ratings = batch[:, 2]
                
                # Compute predictions
                pred = self.X[item_ids].dot(self.W[:, user_ids]) + self.b[item_ids][:, None] + self.d[user_ids]
                error = pred - ratings[:, None]
                
                # Gradients
                grad_X = (error.dot(self.W[:, user_ids].T) + self.lam * self.X[item_ids]) / self.batch_size
                grad_W = (self.X[item_ids].T.dot(error) + self.lam * self.W[:, user_ids]) / self.batch_size
                grad_b = np.sum(error, axis=1) / self.batch_size
                grad_d = np.sum(error, axis=0) / self.batch_size
                
                # Updates
                np.add.at(self.X, item_ids, -self.learning_rate * grad_X)
                np.add.at(self.W.T, user_ids, -self.learning_rate * grad_W.T)
                np.add.at(self.b, item_ids, -self.learning_rate * grad_b)
                np.add.at(self.d, user_ids, -self.learning_rate * grad_d)
            
            if (it + 1) % self.print_every == 0:
                print(f"Iteration {it + 1}/{self.max_iter}, Loss: {self.loss():.4f}")
    
    def pred(self, u, i):
        """
        Predict the rating of user u for item i
        """
        u, i = int(u), int(i)
        pred = self.X[i].dot(self.W[:, u]) + self.b[i] + self.d[u]
        return np.clip(pred, 0, 5)
    
    def evaluate_RMSE(self, rate_test):
        """
        Evaluate RMSE on the test set
        """
        n_tests = rate_test.shape[0]
        SE = 0
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2]) ** 2
        RMSE = np.sqrt(SE / n_tests)
        return RMSE


## Test the results

In [10]:
# Load data
data_path = '../data/ml-latest-small/ratings.csv'
data = pd.read_csv(data_path)
data = data.rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})
data = data[['user_id', 'movie_id', 'rating']]

# Split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
rate_train = train_data.to_numpy()
rate_test = test_data.to_numpy()

# Adjust indices
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

# Train model
mf = MFOptimized(rate_train, K=50, lam=0.01, learning_rate=0.01, max_iter=30, print_every=5)
mf.fit()

# Evaluate model
rmse = mf.evaluate_RMSE(rate_test)
print(f"\nOptimized MF, RMSE: {rmse:.4f}")


Iteration 5/30, Loss: 48272.0219
Iteration 10/30, Loss: 48254.8453
Iteration 15/30, Loss: 48250.8975
Iteration 20/30, Loss: 48249.6982
Iteration 25/30, Loss: 48249.2427
Iteration 30/30, Loss: 48249.0606

Optimized MF, RMSE: 0.9666


# Export results

In [None]:
import pandas as pd

# Dự đoán rating cho tất cả các cặp (user, movie)
def generate_predictions(model, n_users, n_items):
    predictions = []
    for user in range(n_users):
        for item in range(n_items):
            pred_rating = model.pred(user, item)
            predictions.append((user + 1, item + 1, pred_rating))  # Chuyển index về từ 1
    return predictions

# Tạo dự đoán
n_users = mf.n_users
n_items = mf.n_items
predictions = generate_predictions(mf, n_users, n_items)

# Chuyển dự đoán thành DataFrame
predictions_df = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'predicted_rating'])

# Xuất ra file CSV
output_path = "predicted_ratings.csv"
predictions_df.to_csv(output_path, index=False)
print(f"Predicted ratings saved to {output_path}")


: 