# Use matrix factorization for recommender system
## Dependencies

In [71]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix
import os, csv
from sklearn.metrics import precision_score, recall_score, f1_score

## Hyperparameters

In [53]:
# Model and aggregation hyperparameters (tune here, nowhere else)
K = 60                  # number of latent factors
lam = 0.02              # regularization strength
learning_rate = 0.001   # SGD step size
max_iter = 5            # maximum number of training iterations
print_every = 1         # report the loss every `print_every` iterations
tolerance = 1e-6        # stop once the loss changes by less than this amount
beta = 0.4              # lower bound of the [beta, 1] normalization range

## Create class MF

In [81]:
import numpy as np
import os

class MFOptimized:
    """
    Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as ``mu + o[u] + p[i] + dot(H[u], Q[i])`` where ``mu`` is the mean
    of the observed ratings, ``o`` / ``p`` are the user / item biases and ``H`` / ``Q`` are
    the user / item latent factors. Entries of ``Y`` that are not strictly positive are
    treated as missing.
    """

    def __init__(self, Y, K, lam=0.1, learning_rate=0.01, max_iter=100, print_every=10, tolerance=1e-6,
                 random_state=None):
        """
        Initialize the MF model.
        Args:
            Y (array-like): 2D rating matrix of shape (n_users, n_items).
                Missing ratings should be represented as 0.
            K (int): Number of latent factors.
            lam (float): Regularization parameter.
            learning_rate (float): Learning rate for gradient descent.
            max_iter (int): Number of training iterations.
            print_every (int): Print loss every print_every iterations (0 disables printing).
            tolerance (float): Tolerance for convergence based on change in loss.
            random_state (int or None): Seed for the factor initialization. When None the
                global numpy random state is used, exactly as before this parameter existed.
        Raises:
            ValueError: If Y is not two-dimensional.
        """
        Y = np.asarray(Y)  # no copy for ndarrays; also accepts nested lists / DataFrames
        if Y.ndim != 2:
            raise ValueError(f"Y must be a 2D rating matrix, got {Y.ndim} dimension(s)")

        self.Y = Y
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.tolerance = tolerance

        # Dimensions of the rating matrix
        self.n_users, self.n_items = Y.shape

        # Initialize latent factors and biases
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.H = rng.normal(0, 0.1, (self.n_users, K)).astype(np.float32)  # Latent factors for users
        self.Q = rng.normal(0, 0.1, (self.n_items, K)).astype(np.float32)  # Latent factors for items
        print(self.H.shape)
        print(self.Q.shape)
        self.o = np.zeros(self.n_users, dtype=np.float32)  # Biases for users
        self.p = np.zeros(self.n_items, dtype=np.float32)  # Biases for items

        # Global average rating (non-zero entries only); 0.0 instead of NaN when nothing is observed
        observed = Y[Y > 0]
        self.mu = np.mean(observed) if observed.size else 0.0

    def _observed_indices(self):
        """Return (users, items) index arrays of the observed ratings, in row-major order."""
        return np.nonzero(self.Y > 0)

    def _predict_pairs(self, users, items):
        """Return the unclipped float64 predictions for the given (user, item) index arrays."""
        interactions = np.einsum("ij,ij->i", self.H[users], self.Q[items])
        # Start from a float64 array so the sum is float64 regardless of numpy's promotion rules.
        return interactions.astype(np.float64) + self.o[users] + self.p[items] + self.mu

    def compute_loss(self):
        """
        Compute the regularized squared-error loss over the observed ratings.

        Every observed rating (u, i) contributes its squared error plus the penalty
        ``||H[u]||^2 + ||Q[i]||^2 + o[u]^2 + p[i]^2``, so a user's / item's penalty is
        counted once per rating it takes part in.

        Returns:
            float: ``0.5 * sum(errors^2) + 0.5 * lam * sum(penalties)``.
        """
        users, items = self._observed_indices()
        errors = self.Y[users, items] - self._predict_pairs(users, items)
        error_sum = np.sum(errors ** 2)

        # Weight each user's / item's squared norm by the number of ratings it appears in.
        user_counts = np.bincount(users, minlength=self.n_users)
        item_counts = np.bincount(items, minlength=self.n_items)
        user_sq = np.sum(self.H.astype(np.float64) ** 2, axis=1) + self.o.astype(np.float64) ** 2
        item_sq = np.sum(self.Q.astype(np.float64) ** 2, axis=1) + self.p.astype(np.float64) ** 2
        regularization_sum = np.dot(user_counts, user_sq) + np.dot(item_counts, item_sq)

        return float(0.5 * error_sum + 0.5 * self.lam * regularization_sum)

    def fit(self):
        """
        Train the model using stochastic gradient descent (SGD).

        Observed ratings are visited in row-major order once per iteration. Training stops
        early when the loss changes by less than ``tolerance`` between two iterations.
        """
        users, items = self._observed_indices()
        observations = list(zip(users.tolist(), items.tolist(), self.Y[users, items].tolist()))
        lr, lam = self.learning_rate, self.lam
        prev_loss = float('inf')

        for it in range(self.max_iter):
            for u, i, r in observations:
                # Snapshot both factor vectors so the two updates below use the same
                # (pre-update) values; updating Q[i] with the already-updated H[u] would
                # not be a gradient step of the loss.
                h_u = self.H[u].copy()
                q_i = self.Q[i].copy()

                pred = self.o[u] + self.p[i] + self.mu + np.dot(h_u, q_i)
                error = r - pred

                # Update latent factors and biases
                self.H[u] += lr * (error * q_i - lam * h_u)
                self.Q[i] += lr * (error * h_u - lam * q_i)
                self.o[u] += lr * (error - lam * self.o[u])
                self.p[i] += lr * (error - lam * self.p[i])

            # Compute current loss and check for convergence
            loss = self.compute_loss()

            # Check if the change in loss is smaller than the tolerance
            if abs(prev_loss - loss) < self.tolerance:
                print(f"Convergence reached at iteration {it + 1}")
                break

            prev_loss = loss

            # Print loss every 'print_every' iterations (guard against print_every == 0)
            if self.print_every and (it + 1) % self.print_every == 0:
                print(f"Iteration {it + 1}/{self.max_iter}, Loss: {loss:.4f}")

    def predict(self, u, i):
        """
        Predict the rating for a specific user-item pair.
        Args:
            u: Row index of the user in Y.
            i: Column index of the item in Y.
        Returns:
            The predicted rating clipped to [0, 5].
        """
        u, i = int(u), int(i)
        pred = self.o[u] + self.p[i] + self.mu + np.dot(self.H[u], self.Q[i])
        return np.clip(pred, 0, 5)

    def export_latent_matrices_and_biases(self, output_dir="data/output"):
        """
        Export the latent matrices (H, Q) and biases (o, p) to CSV files, and the global
        mean (mu) to a text file.
        Args:
            output_dir (str): Directory where the files will be saved.
        """
        os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

        exports = (
            ("user_latent_factors.csv", self.H, "User latent factors"),
            ("item_latent_factors.csv", self.Q, "Item latent factors"),
            ("user_biases.csv", self.o, "User biases"),
            ("item_biases.csv", self.p, "Item biases"),
        )
        for filename, values, label in exports:
            path = os.path.join(output_dir, filename)
            np.savetxt(path, values, delimiter=",")
            print(f"{label} saved to {path}")

        # Save global mean (mu)
        mean_path = os.path.join(output_dir, "global_mean.txt")
        with open(mean_path, "w") as f:
            f.write(str(self.mu))
        print(f"Global mean saved to {mean_path}")

    def evaluate(self, threshold=3):
        """
        Evaluate the model on the observed ratings of Y using Precision, Recall, and F1 score.

        Observed and predicted ratings are binarized: a rating counts as positive when it is
        greater than or equal to ``threshold``.
        Args:
            threshold (int): The rating threshold to consider a "positive" prediction.
        Returns:
            tuple: (precision, recall, f1), which are also printed.
        """
        users, items = self._observed_indices()
        predicted = np.clip(self._predict_pairs(users, items), 0, 5)

        # Convert ratings to binary (1 if positive, 0 if negative)
        y_true = (self.Y[users, items] >= threshold).astype(int)
        y_pred = (predicted >= threshold).astype(int)

        # Compute Precision, Recall, and F1 Score
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        return precision, recall, f1

    def export_ratings(self, output_filename="predicted_ratings.csv"):
        """
        Export the predicted and observed ratings to a CSV file.

        Only observed ratings are exported. The ``userId`` / ``movieId`` columns hold the
        row / column indices of Y, not identifiers from the original dataset.
        Args:
            output_filename (str): Name of the output CSV file.
        """
        users, items = self._observed_indices()
        df = pd.DataFrame({
            "userId": users,
            "movieId": items,
            "predicted_ratings": np.clip(self._predict_pairs(users, items), 0, 5),
            "observed_ratings": self.Y[users, items],
        })

        # Create the target directory first so the export does not fail on a fresh checkout.
        parent_dir = os.path.dirname(output_filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        df.to_csv(output_filename, index=False)
        print(f"Predicted ratings saved to {output_filename}")

## Preprocessing step
Compute $k_u$: the number of observed ratings for each user

In [5]:
def compute_user_ratings(Y, n_users):
    """
    Count how many ratings each user has provided.

    Args:
        Y: numpy array, shape (n_users, n_items); entry (u, i) is user u's rating of item i,
            and only strictly positive entries count as ratings.
        n_users: Number of leading rows of Y (users) to count.
    Returns:
        k_u: integer numpy array of length n_users holding the rating count of each user.
    """
    k_u = np.zeros(n_users, dtype=int)
    # A rating is "observed" when it is strictly positive; count those per row.
    k_u[:] = [np.count_nonzero(Y[user, :] > 0) for user in range(n_users)]
    return k_u


Generate groups: randomly create groups of 2/3/4 members

In [6]:
def generate_unique_groups_from_matrix(Y, group_size, n_groups=1000):
    """
    Generate unique random groups of active users from a rating matrix.

    A user is "active" when it has at least one strictly positive rating. Groups are drawn
    uniformly among the groups made of active users only, using the global numpy random
    state (seed it with np.random.seed for reproducible groups).

    Args:
        Y: numpy array, shape (n_users, n_items), the rating matrix.
        group_size: Size of each group to generate.
        n_groups: Number of groups to generate.
    Returns:
        groups: List of unique groups, where each group is a sorted tuple of user indices
            (plain Python ints). The order of the list is not meaningful.
    Raises:
        ValueError: If fewer than n_groups distinct groups of active users exist; without
            this check the sampling loop could never terminate.
    """
    from math import comb

    Y = np.asarray(Y)
    # Determine the active users once instead of re-checking them for every draw.
    active_users = np.flatnonzero((Y > 0).any(axis=1))

    max_groups = comb(int(active_users.size), int(group_size))
    if n_groups > max_groups:
        raise ValueError(
            f"Cannot generate {n_groups} unique groups of size {group_size}: "
            f"only {max_groups} exist among {active_users.size} active users"
        )

    groups = set()  # Use a set to ensure uniqueness
    while len(groups) < n_groups:
        members = np.random.choice(active_users, group_size, replace=False)
        # Plain ints keep the tuples readable when they are printed or written to CSV.
        groups.add(tuple(sorted(int(user) for user in members)))

    # Convert the set to a list for consistency in output format
    return list(groups)


## Offline
Compute $c^{[\beta,1]}_{u,i}$

In [7]:
def compute_cui_beta(u, i, real_ratings, predicted_ratings, mu, user_biases, item_biases, H, Q, beta=0.4,
                     min_c=None, max_c=None):
    """
    Compute the weight c_{u,i} for user u and item i, normalized to [beta, 1].

    The raw weight is ``1 - abs_diff`` clipped to [0, 1]. It is then mapped linearly from
    ``[min_c, max_c]`` to ``[beta, 1]``.

    Args:
        u: User index.
        i: Item index.
        real_ratings: 2D array (user, item) for real ratings (NaN if missing).
        predicted_ratings: 2D array (user, item) for predicted ratings.
        mu: Global average rating.
        user_biases: Array of user biases.
        item_biases: Array of item biases.
        H: User latent factors (n_users x K).
        Q: Item latent factors (n_items x K).
        beta: The lower bound for normalization.
        min_c: Smallest raw weight in the group being normalized. Defaults to 0, the lower
            bound of the clipped raw weight.
        max_c: Largest raw weight in the group being normalized. Defaults to 1, the upper
            bound of the clipped raw weight.

    Returns:
        c_u_i: Normalized weight in [beta, 1]. When ``max_c <= min_c`` there is no range to
            normalize over and ``beta`` is returned.
    """
    r_ui = real_ratings[u, i]

    # Calculate the absolute difference used to derive the raw weight
    if not np.isnan(r_ui):  # The rating is available: real vs predicted rating
        abs_diff = abs(r_ui - predicted_ratings[u, i])
    else:  # The rating is missing
        # NOTE(review): this subtracts the latent interaction from the baseline
        # (mu + biases) instead of adding it; kept as written - confirm against the
        # reference formula.
        abs_diff = abs(mu + user_biases[u] + item_biases[i] - np.dot(H[u], Q[i]))

    # Raw weight, clipped to [0, 1]
    c_u_i = np.clip(1 - abs_diff, 0, 1)

    # Taking min/max of this single scalar would make the range zero and the result NaN,
    # so the range comes from the caller or from the bounds of the clipped weight.
    lower = 0.0 if min_c is None else min_c
    upper = 1.0 if max_c is None else max_c
    if upper <= lower:
        return np.float64(beta)

    c_u_i_normalized = beta + (1 - beta) * (c_u_i - lower) / (upper - lower)
    return np.clip(c_u_i_normalized, beta, 1)  # Ensure it's within [beta, 1]

Compute $k^{[\beta,1]}_{u}$

In [8]:
def compute_k_u_beta(k_u, beta=0.4):
    """
    Normalize the per-user rating counts linearly to the range [beta, 1].

    Args:
        k_u: Array-like of number of ratings per user.
        beta: Lower bound of the normalized range (default 0.4).
    Returns:
        k_u_normalized: Float array of the same shape as k_u. The smallest count maps to
            beta and the largest to 1; when all counts are equal every entry is beta.
            An empty input yields an empty array.
    """
    k_u = np.asarray(k_u, dtype=float)
    if k_u.size == 0:
        return k_u.copy()

    k_min = np.min(k_u)
    k_max = np.max(k_u)
    if k_max > k_min:
        return beta + (1 - beta) * (k_u - k_min) / (k_max - k_min)

    # Build the constant array as float explicitly: np.full_like on an integer count
    # array would truncate beta to 0.
    return np.full(k_u.shape, beta, dtype=float)

Aggregate the group profile using AOFRAM & W

In [13]:

def compute_group_profile(group, item, real_ratings, predicted_ratings, H, Q, k_u_normalized, c_ui_normalized, beta=0.4,
                          *, c_exponent=0.4):
    """
    Compute the aggregated group profile rating for a given item.

    The result is the weighted mean of the members' ratings of the item, using the real
    rating when it exists and the predicted rating otherwise. A member's weight is
    ``k_u_normalized[u] * c_ui_normalized[idx] ** c_exponent``.

    Args:
        group: Sequence of user indices in the group.
        item: Target item index.
        real_ratings: 2D array (user, item) for real ratings (NaN if missing).
        predicted_ratings: 2D array (user, item) for predicted ratings.
        H: User latent factors (n_users x K). Not used; kept for interface compatibility.
        Q: Item latent factors (n_items x K). Not used; kept for interface compatibility.
        k_u_normalized: Array of normalized rating counts, indexed by user index.
        c_ui_normalized: Normalized weights for this item, indexed by position in group.
        beta: Not used; kept for interface compatibility.
        c_exponent: Exponent applied to the c weight (keyword-only, default 0.4).

    Returns:
        r_v_i: Aggregated group profile rating for the item, or None when no member has a
            real rating for it or when the total weight is not positive.
    """
    # Without at least one real rating in the group there is nothing to aggregate.
    if all(np.isnan(real_ratings[u, item]) for u in group):
        return None

    # The accumulators must exist before the loop adds to them.
    numerator = 0.0
    denominator = 0.0

    for idx, u in enumerate(group):
        real_rating = real_ratings[u, item]
        # Prefer the real rating; fall back to the model's prediction when it is missing.
        s_u_i = predicted_ratings[u, item] if np.isnan(real_rating) else real_rating

        weight = k_u_normalized[u] * (c_ui_normalized[idx] ** c_exponent)

        numerator += weight * s_u_i
        denominator += weight

    return numerator / denominator if denominator > 0 else None


def save_group_profiles(groups, real_ratings, predicted_ratings, user_biases, item_biases, mu, H, Q, k_u, output_file="group_profiles.csv", beta=0.4):
    """
    Generate and save group profiles into a CSV file.

    For every group and every item that at least one member has really rated, the
    normalized weights are computed with compute_k_u_beta / compute_cui_beta and the
    aggregated rating with compute_group_profile.

    Args:
        groups: Dictionary mapping a group size to the list of groups of that size.
        real_ratings: 2D array (user, item) for real ratings (NaN if missing).
        predicted_ratings: 2D array (user, item) for predicted ratings.
        user_biases: Array of user biases.
        item_biases: Array of item biases.
        mu: Global average rating.
        H: User latent factors (n_users x K).
        Q: Item latent factors (n_items x K).
        k_u: Array of number of ratings per user.
        output_file: Path where to save the group profiles CSV.
        beta: Lower bound used when normalizing the k_u and c_ui weights (default 0.4).
    """
    # NOTE(review): k_u is normalized over all users because compute_group_profile indexes
    # it by user index; confirm a per-group normalization is not intended.
    k_u_normalized = compute_k_u_beta(k_u, beta)

    group_profiles = []

    # NOTE(review): Group_ID restarts at 0 for every group size, so it is only unique
    # together with Group_Members.
    for size, group_list in groups.items():
        for group_id, group in enumerate(group_list):
            # Plain ints keep the Group_Members column readable in the CSV.
            members = [int(u) for u in group]

            # compute_group_profile returns None for items nobody in the group has rated,
            # so only the remaining items need to be visited.
            rated_by_group = ~np.isnan(real_ratings[members, :]).all(axis=0)
            for item in np.flatnonzero(rated_by_group):
                item = int(item)
                c_ui_normalized = [
                    compute_cui_beta(u, item, real_ratings, predicted_ratings, mu,
                                     user_biases, item_biases, H, Q, beta)
                    for u in members
                ]
                r_v_i = compute_group_profile(group=members, item=item, real_ratings=real_ratings,
                                              predicted_ratings=predicted_ratings, H=H, Q=Q,
                                              k_u_normalized=k_u_normalized,
                                              c_ui_normalized=c_ui_normalized, beta=beta)

                # If we computed a valid profile, add it to the list
                if r_v_i is not None:
                    group_profiles.append([group_id, tuple(members), item, r_v_i])

    # Save the group profiles to a CSV file
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Group_ID", "Group_Members", "Item_ID", "Aggregated_Rating"])
        writer.writerows(group_profiles)

    print(f"Group profiles saved to {output_file}")


## Test the results
RMSE

In [80]:
# Load data
data_path = '../data/ml-latest-small/ratings.csv'
data = pd.read_csv(data_path)

# Pivot the ratings table into a userId x movieId matrix; ratings that do not exist
# become 0, which is the "missing" marker MFOptimized expects.
# NOTE(review): from here on `data` is the pivoted matrix, no longer the raw ratings table.
data = data.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)

# Split data into train and test
# NOTE(review): the split is applied to the rows of the pivoted frame, i.e. to whole users,
# so users in rate_test have no row in rate_train - confirm this is the intended protocol.
train_data, test_data = train_test_split(data, test_size=None, random_state=42)
rate_train = train_data.to_numpy()
rate_test = test_data.to_numpy()


# Build the model. The training, evaluation and export calls below are commented out,
# so `mf` only holds its randomly initialized parameters after this cell.
mf = MFOptimized(rate_train, K, lam, learning_rate, max_iter, print_every, tolerance)
# # mf.load("data/input")
# mf.fit()
# # Evaluate the model
# mf.evaluate()
# mf.export_ratings("./data/output/predicted_ratings.csv")
# # print(f"\nOptimized MF, RMSE: {rmse:.4f}")
# mf.export_latent_matrices_and_biases("data/outut")

(457, 60)
(9724, 60)


(610, 9724)


Generate a prediction for a user

In [22]:
def generate_predictions_for_user(model, user_id, n_items):
    """
    Predict the rating of every item for a single user.

    Args:
        model: Object exposing predict(user_index, item_index).
        user_id: Zero-based user index passed to the model.
        n_items: Number of items to predict, taken as indices 0 .. n_items - 1.
    Returns:
        List of (user_id + 1, item + 1, predicted_rating) tuples, one per item; the
        reported identifiers are the zero-based indices shifted by one.
    """
    reported_user = user_id + 1
    predictions = []
    for item_index in range(n_items):
        rating = model.predict(user_id, item_index)
        predictions.append((reported_user, item_index + 1, rating))
    return predictions

Compare the real ratings with the model's predicted ratings for a user

In [23]:
def create_comparison_csv(model, user_id, data, n_items, output_path):
    """
    Write a CSV comparing one user's real ratings with the model's predictions.

    Args:
        model: Trained model passed on to generate_predictions_for_user.
        user_id: Zero-based user index; rows of `data` are matched on user_id + 1.
        data: DataFrame with 'user_id', 'movie_id' and 'rating' columns.
        n_items: Number of items to generate predictions for.
        output_path: Destination of the CSV file.
    """
    rating_columns = ['user_id', 'movie_id', 'rating']
    belongs_to_user = data['user_id'] == user_id + 1
    user_ratings = data[belongs_to_user][rating_columns]

    predictions_df = pd.DataFrame(
        generate_predictions_for_user(model, user_id, n_items),
        columns=['user_id', 'movie_id', 'predicted_rating'],
    )
    predictions_df['predicted_rating'] = predictions_df['predicted_rating'].round(2)

    # Inner join: keep only the items the user has actually rated.
    comparison_df = user_ratings.merge(predictions_df, on=['user_id', 'movie_id'])
    comparison_df.to_csv(output_path, index=False)
    print(f"Comparison CSV saved to {output_path}")


Test it

In [24]:
user_id = 100  # zero-based row index into the model's rating matrix

# The CSV files below are written before export_latent_matrices_and_biases() gets a
# chance to create the directory, so make sure it exists up front.
output_dir = './data/output'
os.makedirs(output_dir, exist_ok=True)

n_items = mf.n_items
predictions = generate_predictions_for_user(mf, user_id, n_items)
predictions_df = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'predicted_rating'])
predictions_output_path = f'{output_dir}/predicted_ratings_user_{user_id}.csv'
predictions_df.to_csv(predictions_output_path, index=False)
print(f'Predicted ratings for user {user_id + 1} saved to {predictions_output_path}')

# NOTE(review): create_comparison_csv selects the 'user_id', 'movie_id' and 'rating' columns
# of the frame it receives, while `data` was replaced by a userId x movieId pivot in the
# loading cell - confirm the frame passed here has the expected long format.
comparison_output_path = f'{output_dir}/rating_comparison_user_{user_id}.csv'
create_comparison_csv(mf, user_id, data, n_items, comparison_output_path)
mf.export_latent_matrices_and_biases()

IndexError: index 193609 is out of bounds for axis 0 with size 193609