# Use matrix factorization for recommender system
## Dependencies

In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix
import os, csv
from sklearn.metrics import precision_score, recall_score, f1_score

## Hyperparameters

In [3]:
K = 60                 # number of latent factors per user / item
lam = 0.02             # L2 regularization strength
learning_rate = 0.001  # SGD step size
max_iter = 200         # upper bound on the number of training iterations
print_every = 1        # report the loss every `print_every` iterations
tolerance = 1e-6       # stop once the loss changes by less than this amount
beta = 0.4             # lower bound of the [beta, 1] normalization range

## Create class MF

In [20]:

class MFOptimized:
    """
    Biased matrix-factorization recommender trained with SGD.

    A rating is modelled as ``mu + o[u] + p[i] + dot(H[u], Q[i])`` where ``mu`` is the
    mean of the observed ratings, ``o`` / ``p`` are the user / item biases and
    ``H`` / ``Q`` are the user / item latent-factor matrices. Entries of the rating
    matrix that are not strictly positive are treated as missing.
    """

    def __init__(self, Y: pd.DataFrame, K, lam=0.1, learning_rate=0.01, max_iter=100, print_every=10, tolerance=1e-6):
        """
        Initialize the MF model.
        Args:
            Y (pandas.DataFrame or numpy.ndarray): Rating matrix of shape (n_users, n_items).
                Missing ratings should be represented as 0. When a DataFrame is given, its
                index / columns are the original user / movie ids (see ``map_ids_to_indices``).
            K (int): Number of latent factors.
            lam (float): Regularization parameter.
            learning_rate (float): Learning rate for gradient descent.
            max_iter (int): Number of training iterations.
            print_every (int): Print loss every print_every iterations.
            tolerance (float): Tolerance for convergence based on change in loss.
        """
        self.Y = Y
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.tolerance = tolerance

        # Dimensions of the rating matrix
        self.n_users, self.n_items = Y.shape

        # Initialize latent factors and biases
        self.H = np.random.normal(0, 0.1, (self.n_users, K)).astype(np.float32)  # Latent factors for users
        self.Q = np.random.normal(0, 0.1, (self.n_items, K)).astype(np.float32)  # Latent factors for items
        self.o = np.zeros(self.n_users, dtype=np.float32)  # Biases for users
        self.p = np.zeros(self.n_items, dtype=np.float32)  # Biases for items

        # Global average rating (observed entries only). The mean is taken on a plain
        # array: np.mean on a masked DataFrame yields a per-column Series on pandas < 2.0.
        ratings = np.asarray(Y)
        observed = ratings[ratings > 0]
        self.mu = np.float64(observed.mean()) if observed.size else np.float64(0.0)

    def _ratings(self):
        """Return the rating matrix as a NumPy array, whether ``self.Y`` is a DataFrame or an array."""
        return np.asarray(self.Y)

    def _observed_entries(self):
        """
        Locate the observed ratings.

        Returns:
            tuple: ``(ratings, rows, cols)`` where ``ratings`` is the full matrix as an array and
            ``rows`` / ``cols`` index the entries ``> 0`` in row-major order (the same order a
            nested ``for u: for i:`` scan would visit them).
        """
        ratings = self._ratings()
        rows, cols = np.nonzero(ratings > 0)
        return ratings, rows, cols

    def map_ids_to_indices(self):
        """
        Map original user and movie IDs to their corresponding indices in the pivoted matrix.

        When ``self.Y`` is a DataFrame, its index / columns provide the ids and ``self.Y`` is
        replaced by the underlying NumPy array. When ``self.Y`` is already an array, existing
        mappings are kept; if none exist yet, identity mappings (id == index) are created.

        Sets:
            user_id_to_index: Dictionary mapping userId to row index.
            movie_id_to_index: Dictionary mapping movieId to column index.
            index_to_user_id: Dictionary mapping row index to userId.
            index_to_movie_id: Dictionary mapping column index to movieId.
        """
        if isinstance(self.Y, pd.DataFrame):
            user_ids, movie_ids = list(self.Y.index), list(self.Y.columns)
            self.Y = self.Y.to_numpy()
        elif hasattr(self, "user_id_to_index"):
            # Already mapped (e.g. this method was called twice): nothing to do.
            return
        else:
            user_ids, movie_ids = list(range(self.n_users)), list(range(self.n_items))

        # Create mappings for userId and movieId
        self.user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
        self.movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

        # Reverse mappings
        self.index_to_user_id = {idx: user_id for user_id, idx in self.user_id_to_index.items()}
        self.index_to_movie_id = {idx: movie_id for movie_id, idx in self.movie_id_to_index.items()}

    def compute_loss(self):
        """
        Compute the regularized squared-error loss over the observed ratings.

        ``loss = 0.5 * sum(err^2) + 0.5 * lam * sum(|H[u]|^2 + |Q[i]|^2 + o[u]^2 + p[i]^2)``
        where both sums run over the observed (u, i) pairs.

        Returns:
            float: The loss (0.0 when there are no observed ratings).
        """
        ratings, rows, cols = self._observed_entries()
        if rows.size == 0:
            return 0.0

        user_vecs = self.H[rows]
        item_vecs = self.Q[cols]
        user_bias = self.o[rows]
        item_bias = self.p[cols]

        pred = user_bias + item_bias + self.mu + np.sum(user_vecs * item_vecs, axis=1)
        errors = ratings[rows, cols] - pred

        # Accumulate in float64: the parameters are float32 and the sums span many terms.
        error_sum = np.sum(np.square(errors), dtype=np.float64)
        regularization_sum = (
            np.sum(np.square(user_vecs), dtype=np.float64)
            + np.sum(np.square(item_vecs), dtype=np.float64)
            + np.sum(np.square(user_bias), dtype=np.float64)
            + np.sum(np.square(item_bias), dtype=np.float64)
        )

        return float(0.5 * error_sum + 0.5 * self.lam * regularization_sum)

    def fit(self):
        """
        Train the model using stochastic gradient descent (SGD).

        Each iteration visits every observed rating once (row-major order), then evaluates
        the loss; training stops early when the loss changes by less than ``tolerance``.
        """
        ratings, rows, cols = self._observed_entries()
        # Only observed cells are ever updated, so locate them once instead of
        # re-scanning the full n_users x n_items grid on every iteration.
        observed = list(zip(rows.tolist(), cols.tolist()))
        prev_loss = float('inf')

        for it in range(self.max_iter):
            for u, i in observed:
                r = ratings[u, i]

                # Calculate prediction
                pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])

                # Calculate error
                error = r - pred

                # Update latent factors. Both gradients are evaluated at the *current*
                # parameters, so keep the old H[u] for the Q[i] step.
                h_u = self.H[u].copy()
                self.H[u] += self.learning_rate * (error * self.Q[i] - self.lam * h_u)
                self.Q[i] += self.learning_rate * (error * h_u - self.lam * self.Q[i])

                # Update biases
                self.o[u] += self.learning_rate * (error - self.lam * self.o[u])
                self.p[i] += self.learning_rate * (error - self.lam * self.p[i])

            # Compute current loss and check for convergence
            loss = self.compute_loss()

            # Check if the change in loss is smaller than the tolerance
            if abs(prev_loss - loss) < self.tolerance:
                print(f"Convergence reached at iteration {it + 1}")
                break

            prev_loss = loss

            # Print loss every 'print_every' iterations
            if (it + 1) % self.print_every == 0:
                print(f"Iteration {it + 1}/{self.max_iter}, Loss: {loss:.4f}")

    def predict(self, u, i):
        """
        Predict the rating for a specific user-item pair.

        Args:
            u: Row index of the user (converted with ``int``).
            i: Column index of the item (converted with ``int``).

        Returns:
            The model score clipped to the range [0, 5].
        """
        u, i = int(u), int(i)
        pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])
        return np.clip(pred, 0, 5)

    def export_latent_matrices_and_biases(self, output_dir="data/output"):
        """
        Export the latent matrices (H, Q), biases (o, p) and global mean (mu) to files.
        Args:
            output_dir (str): Directory where the files will be saved (created if missing).
        """
        os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

        arrays = (
            ("User latent factors", "user_latent_factors.csv", self.H),
            ("Item latent factors", "item_latent_factors.csv", self.Q),
            ("User biases", "user_biases.csv", self.o),
            ("Item biases", "item_biases.csv", self.p),
        )
        for label, filename, values in arrays:
            path = os.path.join(output_dir, filename)
            np.savetxt(path, values, delimiter=",")
            print(f"{label} saved to {path}")

        # Save global mean (mu) as plain text
        mean_path = os.path.join(output_dir, "global_mean.txt")
        with open(mean_path, "w") as f:
            f.write(repr(float(self.mu)))
        print(f"Global mean saved to {mean_path}")

    def load_latent_matrices_and_biases(self, input_dir="data/output"):
        """
        Load the latent matrices (H, Q), biases (o, p) and global mean (mu) from files.

        Loading is all-or-nothing: if any file is missing or unreadable, an error message
        is printed and the model's current parameters are left untouched.

        Args:
            input_dir (str): Directory where the files are saved.
        """
        files = (
            ("H", "User latent factors", "user_latent_factors.csv"),
            ("Q", "Item latent factors", "item_latent_factors.csv"),
            ("o", "User biases", "user_biases.csv"),
            ("p", "Item biases", "item_biases.csv"),
        )
        loaded = {}
        try:
            for attr, label, filename in files:
                path = os.path.join(input_dir, filename)
                loaded[attr] = np.loadtxt(path, delimiter=",")
                print(f"{label} loaded from {path}")

            # Load global mean (mu)
            mean_path = os.path.join(input_dir, "global_mean.txt")
            with open(mean_path, "r") as f:
                loaded["mu"] = float(f.read())
            print(f"Global mean loaded from {mean_path}")

        except Exception as e:
            # Best-effort by design: report and keep the current parameters.
            print(f"An error occurred while loading latent matrices and biases: {e}")
            return

        # Commit only after everything was read, so a failure cannot leave a mix of
        # old and new parameters behind.
        for attr, value in loaded.items():
            setattr(self, attr, value)

    def evaluate(self, threshold=3):
        """
        Evaluate the model using Precision, Recall, and F1 score on the observed ratings.

        Observed and predicted ratings are binarized (``>= threshold`` is positive) and the
        three scores are printed.

        Args:
            threshold (int): The rating threshold to consider a "positive" prediction.
        """
        ratings, rows, cols = self._observed_entries()
        y_true = []
        y_pred = []

        # Only evaluate on observed ratings
        for u, i in zip(rows.tolist(), cols.tolist()):
            # Convert ratings to binary (1 if positive, 0 if negative)
            y_true.append(1 if ratings[u, i] >= threshold else 0)
            y_pred.append(1 if self.predict(u, i) >= threshold else 0)

        if not y_true:
            print("No observed ratings to evaluate.")
            return

        # Compute Precision, Recall, and F1 Score
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    def make_predicted_matrix(self):
        """
        Compute the predicted rating for every user-item pair.

        Stores the result, clipped to [0, 5], in ``self.predicted_ratings`` as a float64
        array with one row per user and one column per item.
        """
        scores = self.o[:, None] + self.p[None, :] + self.mu + self.H @ self.Q.T
        # Clip to a valid rating range (0 to 5)
        self.predicted_ratings = np.clip(scores, 0, 5).astype(np.float64)

    def export_ratings(self, output_filename="predicted_ratings.csv"):
        """
        Export the predicted and observed ratings to a CSV file, using original userId and movieId.

        One row is written per observed rating. The id mappings are built on demand if
        ``map_ids_to_indices`` has not been called yet, and the target directory is created
        if it does not exist.

        Args:
            output_filename (str): Name of the output CSV file.
        """
        if not hasattr(self, "index_to_user_id"):
            self.map_ids_to_indices()

        ratings, rows, cols = self._observed_entries()
        records = []

        for u, i in zip(rows.tolist(), cols.tolist()):
            observed_rating = ratings[u, i]
            predicted_rating = self.predict(u, i)

            # Map the indices back to original userId and movieId
            original_user_id = self.index_to_user_id[u]
            original_movie_id = self.index_to_movie_id[i]

            records.append([original_user_id, original_movie_id, predicted_rating, observed_rating])

        # DataFrame.to_csv does not create missing parent directories.
        parent_dir = os.path.dirname(output_filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Create DataFrame and save to CSV
        df = pd.DataFrame(records, columns=["userId", "movieId", "predicted_ratings", "observed_ratings"])
        df.to_csv(output_filename, index=False)
        print(f"Predicted ratings saved to {output_filename}")

## Preprocessing step

## Offline

Compute the $c^{[\beta,1]}_{u,i}$ matrix, i.e. $c_{u,i}$ normalized per item to the range $[\beta, 1]$

In [5]:
def compute_normalized_c_ui(Y, predicted_ratings, user_latent_factors, item_latent_factors, beta=0.4):
    """
    Compute and normalize c_{u,i} for the entire matrix.

    Raw values:
        * observed rating (``Y[u, i] > 0``):  ``1 - |Y[u, i] - predicted_ratings[u, i]|``
        * missing rating:  ``1 - |predicted_ratings[u, i] - 2 * dot(user_latent_factors[u], item_latent_factors[i])|``

    Each item column is then min-max scaled into ``[beta, 1]``; a column whose values
    are all equal is set to ``beta``.

    Args:
        Y (array-like): Rating matrix (n_users x n_items), with 0 for missing ratings.
        predicted_ratings (array-like): Predicted ratings matrix (n_users x n_items).
        user_latent_factors (array-like): User latent factors matrix (n_users x K).
        item_latent_factors (array-like): Item latent factors matrix (n_items x K).
        beta (float): Minimum bound for normalization (default: 0.4).

    Returns:
        numpy.ndarray: Normalized c_{u,i} matrix (n_users x n_items), dtype float32.
    """
    Y = np.asarray(Y)
    predicted_ratings = np.asarray(predicted_ratings)
    user_latent_factors = np.asarray(user_latent_factors)
    item_latent_factors = np.asarray(item_latent_factors)
    n_users, n_items = Y.shape

    # Treat a 1-D factor vector as a single latent dimension so that the pairwise
    # product below is defined for it as well.
    if user_latent_factors.ndim == 1:
        user_latent_factors = user_latent_factors[:, None]
    if item_latent_factors.ndim == 1:
        item_latent_factors = item_latent_factors[:, None]

    # 2 * <user factors, item factors> for every (user, item) pair at once.
    latent_prediction = 2 * (user_latent_factors[:n_users] @ item_latent_factors[:n_items].T)

    # Compute raw c_{u,i}
    deviation = np.where(Y > 0, Y - predicted_ratings, predicted_ratings - latent_prediction)
    c_ui = (1 - np.abs(deviation)).astype(np.float32)

    if c_ui.size == 0:
        return c_ui

    # Normalize c_{u,i} for each item (column-wise min-max into [beta, 1])
    col_min = c_ui.min(axis=0)
    col_max = c_ui.max(axis=0)
    varying = col_max > col_min
    # Placeholder divisor of 1 for constant columns; their result is overwritten with beta.
    spread = np.where(varying, col_max - col_min, 1).astype(np.float32)

    scaled = beta + (1 - beta) * (c_ui - col_min) / spread
    return np.where(varying, scaled, beta).astype(np.float32)


Aggregating profiles using AOFRAM & W

In [12]:
import numpy as np
import random

def create_virtual_profile(Y, predicted_ratings, c_ui_matrix, group_size=3, beta=0.4, group=None):
    """
    Create a virtual profile by aggregating the profiles of a group of users.

    For every item that at least one group member has actually rated, the profile value
    is the weighted mean of the members' scores, where a member's score is the real
    rating if present and the predicted rating otherwise, and the weight is
    ``k_u * c_{u,i}``. ``k_u`` is the user's number of observed ratings min-max scaled
    into ``[beta, 1]`` over all users (``beta`` for everyone when all counts are equal).
    Items nobody in the group has rated stay 0.

    Args:
        Y: 2D array (users x items), the rating matrix with real ratings (0 for missing values).
        predicted_ratings: 2D array (users x items), predicted ratings for all users and items.
        c_ui_matrix: Precomputed normalized c_{u,i} matrix (users x items).
        group_size: Number of users drawn at random when ``group`` is not given.
        beta: Lower bound of the range the rating counts are normalized to.
        group: Optional explicit list of user row indices. When given, no random draw
            happens and ``group_size`` is ignored.

    Returns:
        group: list of the selected user row indices.
        virtual_profile: 1D float array (items) with the aggregated ratings.

    Raises:
        ValueError: If ``group`` is None and ``group_size`` exceeds the number of users
            (raised by ``random.sample``).
    """
    Y = np.asarray(Y)
    predicted_ratings = np.asarray(predicted_ratings)
    c_ui_matrix = np.asarray(c_ui_matrix)
    n_users, n_items = Y.shape

    # Select a random group of users of the specified size unless one was supplied
    if group is None:
        group = random.sample(range(n_users), group_size)
    else:
        group = list(group)

    print(f"Selected group (user indices): {group}")

    # k_u: number of observed ratings per user, normalized to [beta, 1]
    observed_ratings_count = np.sum(Y > 0, axis=1)
    min_val, max_val = np.min(observed_ratings_count), np.max(observed_ratings_count)
    if max_val > min_val:
        normalized_k_u = beta + (1 - beta) * (observed_ratings_count - min_val) / (max_val - min_val)
    else:
        # All users have the same count: avoid 0/0 (which would make every weight NaN
        # and silently zero the whole profile) and use the lower bound for everyone.
        normalized_k_u = np.full(n_users, beta, dtype=float)

    members = np.asarray(group, dtype=int)
    group_ratings = Y[members]
    observed = group_ratings > 0

    # Real rating where available, predicted rating otherwise
    scores = np.where(observed, group_ratings, predicted_ratings[members])
    weights = normalized_k_u[members, None] * c_ui_matrix[members]

    numerator = (weights * scores).sum(axis=0)
    denominator = weights.sum(axis=0)

    # Only items with at least one real rating in the group (and a usable weight sum)
    # get a value; everything else stays 0.
    usable = observed.any(axis=0) & (denominator > 0)
    virtual_profile = np.zeros(n_items)
    np.divide(numerator, denominator, out=virtual_profile, where=usable)

    return group, virtual_profile


## Test the results
Load the model

In [23]:
# Read the ratings CSV (one row per rating) and reshape it into a dense
# user x movie matrix; cells without a rating become 0.
data_path = '../data/ml-latest-small/ratings.csv'
data = (
    pd.read_csv(data_path)
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Build the model and its id <-> index lookups (this also turns mf.Y into a NumPy array).
mf = MFOptimized(data, K, lam, learning_rate, max_iter, print_every, tolerance)
mf.map_ids_to_indices()

# Either train from scratch ...
# mf.fit()
# ... or reuse the parameters saved by an earlier run.
mf.load_latent_matrices_and_biases()

# Precision / recall / F1 on the observed ratings.
mf.evaluate()

# Optional exports after training:
# mf.export_ratings("./data/output/predicted_ratings.csv")
# mf.export_latent_matrices_and_biases("data/output")

User latent factors loaded from data/output/user_latent_factors.csv
Item latent factors loaded from data/output/item_latent_factors.csv
User biases loaded from data/output/user_biases.csv
Item biases loaded from data/output/item_biases.csv
Global mean loaded from data/output/global_mean.txt
Precision: 0.9718
Recall: 0.9157
F1 Score: 0.9429


Make predicted matrix

In [26]:
# Fill mf.predicted_ratings with the clipped model score of every (user, item) pair.
# Optional: mf.export_ratings("data/output/predicted_ratings.csv")
mf.make_predicted_matrix()

Make c_ui matrix

In [None]:
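# NOTE: compute_normalized_c_ui expects the latent-factor matrices (mf.H, mf.Q) as its
# last two arguments, not the bias vectors (mf.o, mf.p).
# c_ui_matrix = compute_normalized_c_ui(mf.Y, mf.predicted_ratings, mf.H, mf.Q)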

[[0.94420826 0.47402394 0.9671103  ... 0.4999237  0.47488588 0.45292556]
 [0.5301836  0.58072484 0.6274838  ... 0.5872026  0.5531991  0.5541628 ]
 [0.5261104  0.6782334  0.81148946 ... 0.7657037  0.63769907 0.6792286 ]
 ...
 [0.98954535 0.97414136 0.9795065  ... 0.6119779  0.59915584 0.6239989 ]
 [0.95671225 0.62834036 0.6779742  ... 0.5938318  0.6141912  0.57082635]
 [0.9342431  0.5637084  0.6238071  ... 0.58654726 0.5603328  0.54916537]]
Selected group (user indices): [166, 405, 511]
[[4.         0.         4.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [3.         0.         0.         ... 0.         0.         0.        ]
 [5.         0.         0.         ... 0.         0.         0.        ]
 [3.63722258 3.20148182 0.         ... 0.         0.         0.        ]]


Save, or load the c_ui

In [33]:
output_dir = "data/output"
c_ui_path = os.path.join(output_dir, "c_ui_matrix.csv")

# To (re)create the cached matrix after computing c_ui_matrix, uncomment:
# np.savetxt(c_ui_path, c_ui_matrix, delimiter=",")
# print(f"Saved to {c_ui_path}")

# Load the cached matrix. The message says "Loaded" because nothing is written here.
c_ui_matrix = np.loadtxt(c_ui_path, delimiter=",")
print(f"Loaded from {c_ui_path}")

Saved to data/output/c_ui_matrix.csv


## Online

Make a virtual profile

In [34]:
# Draw two random users and aggregate their ratings into a single virtual profile,
# then show the chosen row indices followed by the profile vector.
group, virtual_profile = create_virtual_profile(
    mf.Y,
    mf.predicted_ratings,
    c_ui_matrix,
    group_size=2,
)
print(group, virtual_profile, sep="\n")

Selected group (user indices): [211, 154]
[211, 154]
[3.28149398 0.         0.         ... 0.         0.         0.        ]


Export the profile, or load it

In [37]:
output_dir = "data/output/online/group_2"
# np.savetxt does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): this hard-coded group overrides the randomly drawn one from the previous
# cell; presumably it belongs to the "load a saved profile" path below - confirm.
group = [211, 154]

profile_path = os.path.join(output_dir, "virtual_profile.csv")
np.savetxt(profile_path, virtual_profile, delimiter=",")
print(f"Saved to {profile_path}")

# To reuse a previously saved profile instead, uncomment:
# virtual_profile = np.loadtxt(profile_path, delimiter=",")
# print(f"Loaded from {profile_path}")

Saved to data/output/online/group_2/virtual_profile.csv


Add it to the rating matrix, then give predictions for that

In [38]:
# Append the virtual profile as one extra user row.
# Careful: running this cell again stacks another row and bumps n_users once more.
mf.Y = np.vstack((mf.Y, virtual_profile))
mf.n_users += 1

# Throw away the loaded parameters and start from the same kind of initial state
# that MFOptimized.__init__ builds: small random factors, zero biases, and the mean
# of the observed (non-zero) ratings.
user_factor_shape = (mf.n_users, K)
item_factor_shape = (mf.n_items, K)
mf.H = np.random.normal(0, 0.1, user_factor_shape).astype(np.float32)
mf.Q = np.random.normal(0, 0.1, item_factor_shape).astype(np.float32)
mf.o = np.zeros(mf.n_users, dtype=np.float32)
mf.p = np.zeros(mf.n_items, dtype=np.float32)
mf.mu = np.mean(mf.Y[mf.Y > 0])

# Retrain on the enlarged matrix.
mf.fit()

Iteration 1/200, Loss: 49702.9911
Iteration 2/200, Loss: 47155.7659
Iteration 3/200, Loss: 45573.2438
Iteration 4/200, Loss: 44450.3956
Iteration 5/200, Loss: 43590.6756
Iteration 6/200, Loss: 42897.4370
Iteration 7/200, Loss: 42317.1321
Iteration 8/200, Loss: 41817.6374
Iteration 9/200, Loss: 41378.5063
Iteration 10/200, Loss: 40986.0465
Iteration 11/200, Loss: 40630.7216
Iteration 12/200, Loss: 40305.6170
Iteration 13/200, Loss: 40005.5593
Iteration 14/200, Loss: 39726.5835
Iteration 15/200, Loss: 39465.5874
Iteration 16/200, Loss: 39220.0700
Iteration 17/200, Loss: 38988.0072
Iteration 18/200, Loss: 38767.7292
Iteration 19/200, Loss: 38557.8262
Iteration 20/200, Loss: 38357.1210
Iteration 21/200, Loss: 38164.6184
Iteration 22/200, Loss: 37979.4073
Iteration 23/200, Loss: 37800.7451
Iteration 24/200, Loss: 37627.9577
Iteration 25/200, Loss: 37460.4645
Iteration 26/200, Loss: 37297.7256
Iteration 27/200, Loss: 37139.2865
Iteration 28/200, Loss: 36984.7302
Iteration 29/200, Loss: 36833

Export the results

In [39]:
# Score the retrained model on the observed ratings.
mf.evaluate()

# Register the virtual user, which sits in the last row of mf.Y, under the id `n_users`.
# NOTE(review): assumes no real userId already equals mf.n_users - confirm for other datasets.
virtual_row = mf.n_users - 1
virtual_user_id = mf.n_users
mf.user_id_to_index[virtual_user_id] = virtual_row
mf.index_to_user_id[virtual_row] = virtual_user_id

# Persist the predictions and the trained parameters next to the virtual profile.
mf.export_ratings(f"{output_dir}/predicted_ratings.csv")
mf.export_latent_matrices_and_biases(f"{output_dir}")

Precision: 0.9717
Recall: 0.9161
F1 Score: 0.9431
Predicted ratings saved to data/output/online/group_2/predicted_ratings.csv
User latent factors saved to data/output/online/group_2/user_latent_factors.csv
Item latent factors saved to data/output/online/group_2/item_latent_factors.csv
User biases saved to data/output/online/group_2/user_biases.csv
Item biases saved to data/output/online/group_2/item_biases.csv
Global mean saved to data/output/online/group_2/global_mean.txt
