# Pre-Process data for CF

+ Read and Merge Data: Load the ratings.dat, users.dat, and movies.dat files, and merge them into a single DataFrame.
+ Filter Users with ≥ 5 Ratings: Filter out users who have fewer than 5 ratings.
+ Calculate Pearson Correlation and Find Valid Neighbors: Use the pearson_correlation function to calculate the Pearson Correlation Coefficient matrix and then identify users who have valid neighbors based on a threshold.


In [17]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import os
import sys
# Add the path to the constants file to the system path
sys.path.append('../../../')
from path_utils import *

# Resolve the source folder; get_rec_sys_directory() is expected to come from
# the star-import of path_utils.
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# The data folder is addressed relative to the source folder.
DATA_DIR = os.path.join(rec_sys_dir, '../data')
print(f"Data directory: {DATA_DIR}")

# Build the three input paths and the merged-output path in one go.
movies_path, ratings_path, users_path, data_path = (
    os.path.join(DATA_DIR, relative_name)
    for relative_name in (
        'ml-1m/movies.dat',
        'ml-1m/ratings.dat',
        'ml-1m/users.dat',
        'ml-1m/merged_data.dat',
    )
)

# Report every path on its own line, in the order movies, ratings, users, merged.
print(
    *(f'Data path: {p}' for p in (movies_path, ratings_path, users_path, data_path)),
    sep='\n',
)

Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/notebook
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/notebook/../data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/notebook/../data/ml-1m/movies.dat
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/notebook/../data/ml-1m/ratings.dat
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/notebook/../data/ml-1m/users.dat
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/notebook/../data/ml-1m/merged_data.dat


In [21]:

# Step 1: Read and Merge Data
def load_and_merge_data(movies_path, ratings_path, users_path):
    """Read the three '::'-delimited files and join them into one DataFrame.

    Args:
        movies_path: Path of the movies file (columns MovieID, Title, Genres).
        ratings_path: Path of the ratings file
            (columns UserID, MovieID, Rating, Timestamp).
        users_path: Path of the users file
            (columns UserID, Gender, Age, Occupation, Zip-code).

    Returns:
        pandas.DataFrame: The ratings joined with the users on 'UserID' and
        then with the movies on 'MovieID' (pandas' default join type).
    """
    # Parsing options shared by all three header-less files. The
    # multi-character separator is why the 'python' engine is requested.
    read_options = dict(
        delimiter='::',
        engine='python',
        header=None,
        encoding='ISO-8859-1',
    )

    movies = pd.read_csv(
        movies_path,
        names=['MovieID', 'Title', 'Genres'],
        **read_options,
    )
    ratings = pd.read_csv(
        ratings_path,
        names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
        **read_options,
    )
    users = pd.read_csv(
        users_path,
        names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
        **read_options,
    )

    # Attach user attributes to each rating first, then the movie attributes.
    ratings_with_users = pd.merge(ratings, users, on='UserID')
    return pd.merge(ratings_with_users, movies, on='MovieID')

# Step 2: Filter Users with ≥ 5 Ratings
def filter_users(data, min_ratings=5):
    """Keep only the rows of users who have at least `min_ratings` ratings.

    Args:
        data (pandas.DataFrame): Rating rows, one row per rating; must
            contain a 'UserID' column.
        min_ratings (int): Minimum number of rows a user needs in `data` to
            be kept. Defaults to 5, the threshold that used to be hard-coded.

    Returns:
        pandas.DataFrame: The rows of `data` whose 'UserID' occurs at least
        `min_ratings` times, with the original index and row order.
    """
    # Number of rows (ratings) per user.
    ratings_per_user = data['UserID'].value_counts()

    # Users meeting the threshold; the index holds the UserID values.
    active_users = ratings_per_user.index[ratings_per_user >= min_ratings]

    return data[data['UserID'].isin(active_users)]

# Step 3: Calculate Pearson Correlation

# source RMIT courses
def pearson_correlation(interaction_matrix):
    """
    Compute the Pearson Correlation Coefficient matrix for the user-item interaction matrix.

    For every pair of users the coefficient is computed over the items that
    both users rated (co-rated items), using each user's mean over those
    co-rated items. A value of 0 in the matrix is treated as "not rated".

    Args:
    interaction_matrix (csr_matrix or array-like): Rows represent users and columns represent items.
                                     The values are the ratings given by users to items.
                                     Anything exposing ``toarray()`` is densified with it;
                                     other inputs are passed through ``np.asarray``.

    Returns:
    numpy.ndarray: A symmetric 2D array of shape (n_users, n_users) with the Pearson
                   Correlation Coefficient of each pair of users. Pairs with no co-rated
                   items stay at 0. Because EPSILON is added to the denominator, a pair
                   whose co-rated ratings have zero variance also yields 0, and a user's
                   self-similarity is marginally below 1.
    """
    # Convert sparse input to dense format for processing
    if hasattr(interaction_matrix, "toarray"):
        dense_matrix = interaction_matrix.toarray()
    else:
        dense_matrix = np.asarray(interaction_matrix)

    # Get the number of users
    n_users = dense_matrix.shape[0]

    # Initialize the Pearson Correlation matrix
    pearson_corr_matrix = np.zeros((n_users, n_users))

    # Small constant to avoid division by zero
    EPSILON = 1e-9

    # Rated-item masks for all users, computed once instead of per pair
    rated_mask = dense_matrix > 0

    for i in range(n_users):
        user_i_vec = dense_matrix[i]
        mask_i = rated_mask[i]

        # The measure is symmetric, so only pairs with j >= i are computed
        # and the result is mirrored into [j, i].
        for j in range(i, n_users):
            # Items rated by both users
            corated = mask_i & rated_mask[j]

            # Skip if no items are co-rated
            if not corated.any():
                continue

            ratings_i = user_i_vec[corated]
            ratings_j = dense_matrix[j][corated]

            # Deviations from each user's mean over the co-rated items
            user_i_sub_mean = ratings_i - np.mean(ratings_i)
            user_j_sub_mean = ratings_j - np.mean(ratings_j)

            # Norms of the deviation vectors
            r_ui_sum_sqrt = np.sqrt(np.sum(np.square(user_i_sub_mean)))
            r_uj_sum_sqrt = np.sqrt(np.sum(np.square(user_j_sub_mean)))

            # Calculate Pearson correlation
            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

            # Store the similarity for both orderings of the pair
            pearson_corr_matrix[i, j] = sim
            pearson_corr_matrix[j, i] = sim

    return pearson_corr_matrix


# Step 4: Find Valid Neighbors
def get_valid_neighbors(pcc_matrix, threshold=0.6):
    """Find, for every user, the other users whose similarity exceeds `threshold`.

    Args:
        pcc_matrix (numpy.ndarray): 2D user-by-user similarity matrix; row i
            holds the similarities of user i to every user.
        threshold (float): A user j is a neighbor of i only when
            ``pcc_matrix[i, j] > threshold`` (strictly greater).

    Returns:
        dict: Maps each row position i to a 1D integer array of neighbor
        positions in ascending order. Every row gets an entry, and the array
        is empty when no other user qualifies.
    """
    valid_neighbors = {}
    for i, row in enumerate(pcc_matrix):
        candidates = np.where(np.asarray(row) > threshold)[0]
        # A user's self-similarity on the diagonal would otherwise make every
        # user their own neighbor, so position i is removed.
        valid_neighbors[i] = candidates[candidates != i]
    return valid_neighbors



# Load and merge data
data = load_and_merge_data(movies_path, ratings_path, users_path)

# Filter users with at least 5 ratings
filtered_data = filter_users(data)

# Create User-Item Interaction Matrix: one row per UserID, one column per
# MovieID; missing ratings become 0, which pearson_correlation treats as
# "not rated".
interaction_matrix = pd.pivot_table(filtered_data, index='UserID', columns='MovieID', values='Rating').fillna(0)
csr_interaction_matrix = csr_matrix(interaction_matrix.values)

# Calculate Pearson Correlation Coefficient Matrix
pcc_matrix = pearson_correlation(csr_interaction_matrix)

# Find Valid Neighbors. Keys are row positions of interaction_matrix, not
# UserID values.
valid_neighbors = get_valid_neighbors(pcc_matrix)

# valid_neighbors holds an entry for every user, including users whose
# neighbor array is empty, so len(valid_neighbors) would just repeat the user
# count. Count only the users that actually have at least one neighbor.
users_with_neighbors = sum(1 for neighbors in valid_neighbors.values() if len(neighbors) > 0)

# Print some summary
print(f"Total users after filtering: {len(filtered_data['UserID'].unique())}")
print(f"Total users with valid neighbors: {users_with_neighbors}")


In [None]:
# Persist the merged (unfiltered) DataFrame to data_path, leaving out the
# index column.
# NOTE(review): to_csv is called without a separator argument, so the file
# will not use the '::' delimiter of the input .dat files — confirm intended.
data.to_csv(path_or_buf=data_path, index=False)
print(f"Saved merged data in {data_path}")