In [1]:
import random
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openai
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *

# OpenAI API Key
openai.api_key = OPENAI_API_KEY

# source code folder path
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, 'data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'amazon-beauty/large_merged_data.csv')
print(f'Data path: {data_path}')

# output
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/large_CF_output_path_2_ratings_per_user.csv')
print(f'Data path: {data_path}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/rerun_large_CF_output_path_2_ratings_per_user.csv')
print(f'Data path: {data_path}')

Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv


In [2]:
# Read and Merge Data
def load_and_merge_data(movies_path, ratings_path, users_path):
    # Load each file
    movies = pd.read_csv(movies_path, delimiter='::', engine= 'python', header=None, names=['asin', 'title', 'Genres'], encoding='ISO-8859-1')
    ratings = pd.read_csv(ratings_path, delimiter='::', engine= 'python', header=None, names=['reviewerID', 'asin', 'rating', 'Timestamp'], encoding='ISO-8859-1')
    users = pd.read_csv(users_path,delimiter='::', engine= 'python', header=None, names=['reviewerID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')
    # Merge datasets
    merged_data = pd.merge(pd.merge(ratings, users, on='reviewerID'), movies, on='asin')
    return merged_data


# Filter Users with ≥ 5 ratings
def filter_users(data):
    user_rating_counts = data['reviewerID'].value_counts()
    valid_users = user_rating_counts[user_rating_counts >= 5].index.tolist()
    return data[data['reviewerID'].isin(valid_users)]


# Calculate Pearson Correlation Coefficient
# source RMIT courses
def pearson_correlation(interaction_matrix):
    """
    Compute the Pearson Correlation Coefficient matrix for the user-item interaction matrix.

    Args:
    interaction_matrix (csr_matrix): A sparse matrix where rows represent users and columns represent items.
                                     The values in the matrix are the ratings given by users to items.

    Returns:
    numpy.ndarray: A 2D array representing the Pearson Correlation Coefficients between each pair of users.
    """
    # Convert sparse matrix to dense format for processing
    dense_matrix = interaction_matrix.toarray()
    
    # Get the number of users
    n_users = dense_matrix.shape[0]

    # Initialize the Pearson Correlation matrix
    pearson_corr_matrix = np.zeros((n_users, n_users))

    # Small constant to avoid division by zero
    EPSILON = 1e-9

    # Iterate over each pair of users
    for i in range(n_users):
        for j in range(n_users):
            # Get the rating vectors for the current pair of users
            user_i_vec = dense_matrix[i, :]
            user_j_vec = dense_matrix[j, :]

            # Masks for rated items
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # Find indices of corrated items
            corrated_index = np.intersect1d(np.where(mask_i)[0], np.where(mask_j)[0])

            # Skip if no items are corrated
            if len(corrated_index) == 0:
                continue

            # Compute the mean rating for each user over corrated items
            mean_user_i = np.mean(user_i_vec[corrated_index])
            mean_user_j = np.mean(user_j_vec[corrated_index])

            # Compute the deviations from the mean
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            # Calculate the components for Pearson correlation
            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            # Calculate Pearson correlation
            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

            # Store the similarity in the matrix
            pearson_corr_matrix[i, j] = sim

    return pearson_corr_matrix


# Find Valid Neighbors
def get_valid_neighbors(pcc_matrix, threshold=0.6):
    valid_neighbors = {}
    for i, row in enumerate(pcc_matrix):
        valid_neighbors[i] = np.where(row > threshold)[0]
    return valid_neighbors

Adding a timestamp to the function, particularly in the line main_user_ratings = main_user_data.nlargest(num_main_user_ratings, 'Timestamp'), serves a specific purpose in the context of collaborative filtering for recommendation systems. Here's the breakdown:

Freshness of Data: By selecting ratings based on the most recent timestamp, the function prioritizes the latest user preferences. People's tastes can change over time, so using the most recent data ensures that the recommendations reflect current interests.

Temporal Relevance: In many scenarios, the relevance of a rating can be time-sensitive, especially in fast-moving domains like movies, music, or fashion. A timestamp helps to capture this aspect, ensuring that the recommendations are not only based on the highest ratings but also on the most timely and relevant ones.

Predictive Accuracy: When predicting future ratings or preferences, recent behavior is often a better indicator than older data. By focusing on the latest interactions, the model is likely to be more accurate in predicting current or future preferences.

User Behavior Analysis: Analyzing user behavior over time can provide valuable insights. By observing how a user's ratings evolve, the model can detect trends and shifts in preferences, which can be crucial for improving recommendation quality.

In summary, the inclusion of the timestamp in selecting user ratings is a strategic choice to enhance the relevance, timeliness, and accuracy of the recommendations generated by the collaborative filtering algorithm.

In [39]:
import random

def predict_ratings_with_collaborative_filtering_and_save(data, pcc_matrix, 
                                                          user_column_name='reviewerID', 
                                                          movie_column_name='title', 
                                                          movie_id_column='asin',
                                                          rating_column_name='rating', 
                                                          num_ratings_per_user=1, 
                                                          num_similar_users=4,
                                                          num_main_user_ratings=1,
                                                          save_path='cf_predictions.csv', 
                                                          seed=RANDOM_STATE):
    results = []
    unique_users = data[user_column_name].unique()
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}

    random.seed(seed)

    for user_id in unique_users:
        user_idx = user_id_to_index[user_id]

        print(f"Processing user {user_id} (Index: {user_idx})")

        # Retrieve the main user's historical ratings
        main_user_data = data[data[user_column_name] == user_id]
        if len(main_user_data) > num_main_user_ratings:
            main_user_ratings = main_user_data.sample(n=num_main_user_ratings, random_state=seed)
        else:
            main_user_ratings = main_user_data

        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for _, row in main_user_ratings.iterrows()
        ])

        # Find the top similar users based on Pearson Correlation Coefficient
        similar_users_idx = np.argsort(-pcc_matrix[user_idx])[:num_similar_users + 1]
        similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

        print(f"Top similar users for {user_id}: {[unique_users[idx] for idx in similar_users_idx]}")

        # Collect historical ratings from similar users
        similar_users_ratings = ""
        for idx in similar_users_idx:
            similar_user_id = unique_users[idx]
            similar_user_data = data[data[user_column_name] == similar_user_id]
            if len(similar_user_data) > num_ratings_per_user:
                historical_ratings = similar_user_data.sample(n=num_ratings_per_user, random_state=seed)
            else:
                historical_ratings = similar_user_data

            for _, row in historical_ratings.iterrows():
                rating_info = f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
                similar_users_ratings += rating_info + "\n"

        # Select a random movie from the user's data for prediction
        random_movie_row = main_user_data.sample(n=1, random_state=seed).iloc[0]
        random_movie_title = random_movie_row[movie_column_name]
        random_movie_id = random_movie_row[movie_id_column]
        actual_rating = random_movie_row[rating_column_name]

        # Construct prompt for API call
        combined_text = f"Title: {random_movie_title}"
        prompt = f"Main User Ratings:\n{main_user_ratings_str}\n\nSimilar Users' Ratings:\n{similar_users_ratings}\n\nPredict rating for '{combined_text}':"

        print(f"Generated prompt for user {user_id}:\n{prompt}")

        predicted_rating = predict_rating_combined_ChatCompletion(
            combined_text, 
            approach="CF", 
            similar_users_ratings=similar_users_ratings,
            rating_history=main_user_ratings_str
        )

        # Store prediction results
        results.append([user_id, random_movie_id, random_movie_title, actual_rating, predicted_rating])

        print(f"User {user_id}: Predicted rating for '{random_movie_title}' is {predicted_rating}.")

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")

    return results_df


In [40]:
data = pd.read_csv(data_path)
data.head(3)

Unnamed: 0,rating,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price
0,1.0,False,2015-08-25,A2RYSCZOPEXOCQ,9790787006,The Cat Next Door,"I use a lot of perfume, I go through a new bot...",This is not going to be my favorite scent.,2015-08-25,,...,,Jenna Jameson,[],298.0,"['B00357FTX8', 'B01NBID7FJ', 'B0017JT658']","{'Shipping Weight:': '12.8 ounces (', 'ASIN: '...",All Beauty,,,13.85
1,5.0,False,2001-06-08,A141OPVE376YFI,B000050B65,Paul G.,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",2001-06-08,81.0,...,,Norelco,[],2.0,"['B01B1O9DOM', 'B00JITDVD2', 'B01KXV16DK', 'B0...",{},All Beauty,,,
2,5.0,False,2001-06-08,A141OPVE376YFI,B000050B65,Paul G.,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",2001-06-08,81.0,...,,Norelco,[],2.0,"['B01B1O9DOM', 'B00JITDVD2', 'B01KXV16DK', 'B0...",{},All Beauty,,,


In [6]:
# Create User-Item Interaction Matrix
interaction_matrix = pd.pivot_table(data, index='reviewerID', columns='asin', values='rating').fillna(0)
csr_interaction_matrix = csr_matrix(interaction_matrix.values)

# Calculate Pearson Correlation Coefficient Matrix
pcc_matrix = pearson_correlation(csr_interaction_matrix)

pcc_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [7]:
csr_interaction_matrix

<1608x1879 sparse matrix of type '<class 'numpy.float64'>'
	with 7469 stored elements in Compressed Sparse Row format>

In [8]:
interaction_matrix

asin,9790787006,B000050B63,B000050B65,B000050B6B,B000050B6H,B000050FDT,B000050FDY,B000052YAN,B000052YD8,B0000530HU,...,B01H4Y9MSU,B01H640HTG,B01H71ND58,B01H71ND76,B01HATTFWW,B01HB4BS1C,B01HBWYB5Y,B01HBYF0CK,B01HD23OJG,B01HIPOQ2M
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A105A034ZG9EHO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10JB7YPWZGRF4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10M2MLE2R0L6K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10OYW0QYN13GL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10P0NAKKRYKTZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZMAOC6QC0WEP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZPI1JA9XKV8P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZQZIAWSFBHLW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZRD4IZU6TBFV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Convert 'unixReviewTime' to integers
data['unixReviewTime'] = pd.to_numeric(data['unixReviewTime'], errors='coerce')

In [41]:
%%time

cf_predictions = predict_ratings_with_collaborative_filtering_and_save(data, 
                                                                       pcc_matrix, 
                                                                       save_path=CF_OUTPUT_PATH,
                                                                       num_ratings_per_user=1,
                                                                       num_main_user_ratings=4,
                                                                       num_similar_users=4,
                                                                       )


Processing user A2RYSCZOPEXOCQ (Index: 0)
Top similar users for A2RYSCZOPEXOCQ: ['A4DEEDXZK8L78', 'A3JNP9PGF2DMIO', 'A1DFZPQPCHBYTY', 'A177B2VPWX4P55']
Generated prompt for user A2RYSCZOPEXOCQ:
Main User Ratings:
* Title: Kordon Oasis (Novalek) Bell Bottle 8oz, Rating: 5.0 stars
* Title: Caboodles Heart Throb Long Tapered Tote, Black Diamond, 1.12 Pound, Rating: 5.0 stars
* Title: Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray, Rating: 1.0 stars
* Title: Kordon Oasis (Novalek) Bell Bottle 8oz, Rating: 5.0 stars

Similar Users' Ratings:
* Title: Bath &amp; Body Works Ile De Tahiti Moana Coconut Vanille Moana Body Wash with Tamanoi 8.5 oz, Rating: 5.0 stars
* Title: Bath &amp; Body Works Ile De Tahiti Moana Coconut Vanille Moana Body Wash with Tamanoi 8.5 oz, Rating: 5.0 stars
* Title: Bath &amp; Body Works Ile De Tahiti Moana Coconut Vanille Moana Body Wash with Tamanoi 8.5 oz, Rating: 5.0 stars
* Title: Bonne Bell Smackers Bath and Body Starburst Collection, Ra

In [42]:

data = pd.read_csv(CF_OUTPUT_PATH)

# Display the original data types
print("Original Data Types:")
print(data.dtypes)
print("\n")

# Attempt to convert ratings to float and add a flag for conversion failure
data['is_rating_float'] = pd.to_numeric(data['predicted_rating'], errors='coerce').notna()

# Filter rows where ratings are not float
non_float_ratings = data[data['is_rating_float'] == False]

# total number of rows with non-float ratings
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Display rows with non-float ratings
print("Rows with non-float ratings:")
non_float_ratings.head(3)


Original Data Types:
user_id              object
item_id              object
title                object
actual_rating       float64
predicted_rating     object
dtype: object


Total number of rows with non-float ratings: 1608
Rows with non-float ratings:


Unnamed: 0,user_id,item_id,title,actual_rating,predicted_rating,is_rating_float
0,A2RYSCZOPEXOCQ,B0002564EE,Kordon Oasis (Novalek) Bell Bottle 8oz,5.0,"(None, ""Error communicating with OpenAI: HTTPS...",False
1,A141OPVE376YFI,B000050FDY,Braun Clean &amp; Renew Refill Cartridges CCR ...,5.0,"(None, ""Error communicating with OpenAI: HTTPS...",False
2,A1TVTDKNMSQ7XU,B000068PBO,Norelco 5810XL Reflex Action Shaving System,5.0,"(None, ""Error communicating with OpenAI: HTTPS...",False


In [43]:
def identify_and_rerun_failed_cf_predictions(data, 
                                             pcc_matrix, 
                                             user_column_name='user_id', 
                                             movie_column_name='title', 
                                             movie_id_column='item_id',
                                             rating_column_name='actual_rating', 
                                             num_ratings_per_user=1, 
                                             num_similar_users=4, 
                                             save_path='cf_predictions.csv', 
                                             rerun_save_path='cf_rerun_predictions.csv', 
                                             seed=RANDOM_STATE):
    """
    Identify failed predictions in CF data and rerun them.

    Args:
    - data: DataFrame containing the original CF predictions.
    - pcc_matrix: Pearson Correlation Coefficient matrix.
    - Other arguments for controlling various aspects of the prediction function.

    Returns:
    - Updated DataFrame with rerun predictions.
    """
    # Ensure the original data has the necessary columns
    if rating_column_name not in data.columns:
        raise KeyError(f"Column '{rating_column_name}' not found in the data.")

    # Identify rows with failed predictions
    failed_rows = data[pd.to_numeric(data['predicted_rating'], errors='coerce').isna()]

    if len(failed_rows) > 0:
        print(f"Re-running predictions for {len(failed_rows)} failed cases.")

        # Filter failed rows to include only users that are in the PCC matrix
        unique_users = set(data[user_column_name].unique())
        valid_users = set(range(pcc_matrix.shape[0]))  # Assuming user indices match PCC matrix rows
        valid_failed_rows = failed_rows[failed_rows[user_column_name].apply(lambda x: x in unique_users and x in valid_users)]

        if len(valid_failed_rows) > 0:
            # Call prediction function on valid failed data
            rerun_data = predict_ratings_with_collaborative_filtering_and_save(
                valid_failed_rows, pcc_matrix,
                user_column_name=user_column_name,
                movie_column_name=movie_column_name,
                movie_id_column=movie_id_column,
                rating_column_name=rating_column_name,
                num_ratings_per_user=num_ratings_per_user,
                num_similar_users=num_similar_users,
                save_path=rerun_save_path,
                seed=seed
            )

            # Update original data with new predictions
            data.update(rerun_data.set_index('user_id')['predicted_rating'])
        else:
            print("No valid failed cases to re-run.")

    # Save the updated data
    data.to_csv(save_path, index=False)
    print(f"Updated predictions saved to {save_path}")

    return data


In [44]:
%%time 

# Load the original CF predictions
cf_data = pd.read_csv(CF_OUTPUT_PATH)


# Identify and rerun failed predictions
updated_cf_data = identify_and_rerun_failed_cf_predictions(
    cf_data, pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    rating_column_name='actual_rating', 
    rerun_save_path=CF_RERUN_PATH
)



Re-running predictions for 1608 failed cases.
No valid failed cases to re-run.
Updated predictions saved to /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/output/large_CF_output_path_2_ratings_per_user.csv
CPU times: user 26.6 ms, sys: 2.61 ms, total: 29.2 ms
Wall time: 27.8 ms


In [45]:
data = pd.read_csv(CF_RERUN_PATH)

data

FileNotFoundError: [Errno 2] No such file or directory: '/Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/output/rerun_large_CF_output_path_2_ratings_per_user.csv'

In [26]:
# Evaluate CF Model
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

TypeError: unsupported operand type(s) for -: 'float' and 'str'