In [24]:
import random
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openai
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *

# OpenAI API Key
openai.api_key = OPENAI_API_KEY

# source code folder path
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, 'data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'amazon-beauty/large_merged_data.csv')
print(f'Data path: {data_path}')

# output

CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/rerun_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_RERUN_PATH}')


# Constants for column names
USER_COLUMN_NAME = 'reviewerID'
TITLE_COLUMN_NAME = 'title'
ITEM_ID_COLUMN = 'asin'
RATING_COLUMN_NAME = 'rating'
TIME_STAMP_COLUMN_NAME = 'unixReviewTime'

# num_ratings_per_user
NUM_RATINGS_PER_USER = 1
# num_main_user_ratings
NUM_MAIN_USER_RATINGS = 4
# num_similar_users
NUM_SIMILAR_USERS = 4

SYSTEM_CONTENT = AMAZON_CONTENT_SYSTEM


Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/large_merged_data.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/output/large_CF_fewshot_output_path_ratings_per_user.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/output/rerun_large_CF_fewshot_output_path_ratings_per_user.csv


In [25]:
CF_OUTPUT_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_timestamp/timestamp_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_OUTPUT_TIMESTAMP_PATH}')

CF_RERUN_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_timestamp/rerun_timestamp_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_RERUN_TIMESTAMP_PATH}')

Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/output/split_timestamp/timestamp_large_CF_fewshot_output_path_ratings_per_user.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/output/split_timestamp/rerun_timestamp_large_CF_fewshot_output_path_ratings_per_user.csv


In [26]:
data = pd.read_csv(data_path)
data.head(3)

Unnamed: 0,rating,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price
0,1.0,False,2015-08-25,A2RYSCZOPEXOCQ,9790787006,The Cat Next Door,"I use a lot of perfume, I go through a new bot...",This is not going to be my favorite scent.,2015-08-25,,...,,Jenna Jameson,[],298.0,"['B00357FTX8', 'B01NBID7FJ', 'B0017JT658']","{'Shipping Weight:': '12.8 ounces (', 'ASIN: '...",All Beauty,,,13.85
1,5.0,False,2001-06-08,A141OPVE376YFI,B000050B65,Paul G.,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",2001-06-08,81.0,...,,Norelco,[],2.0,"['B01B1O9DOM', 'B00JITDVD2', 'B01KXV16DK', 'B0...",{},All Beauty,,,
2,5.0,True,2008-07-25,A1TVTDKNMSQ7XU,B000050B6B,Grandpa Pipes,I've had many Norelco razors in my 50 years of...,Just like new.....,2008-07-25,,...,,Philips Norelco,[],148.0,"['B001IA0PCY', 'B00196W5S4', 'B004URZADG', 'B0...",{'\n Product Dimensions: \n ': '5.1 x 0....,All Beauty,,,64.5


In [27]:
# Create User-Item Interaction Matrix
interaction_matrix = pd.pivot_table(data, index=USER_COLUMN_NAME, columns=ITEM_ID_COLUMN, values=RATING_COLUMN_NAME).fillna(0)
csr_interaction_matrix = csr_matrix(interaction_matrix.values)

interaction_matrix

asin,9790787006,B000050B63,B000050B65,B000050B6B,B000050B6H,B000050FDT,B000050FDY,B000052YAN,B000052YD8,B0000530HU,...,B01H4Y9MSU,B01H640HTG,B01H71ND58,B01H71ND76,B01HATTFWW,B01HB4BS1C,B01HBWYB5Y,B01HBYF0CK,B01HD23OJG,B01HIPOQ2M
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A105A034ZG9EHO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10JB7YPWZGRF4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10M2MLE2R0L6K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10OYW0QYN13GL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10P0NAKKRYKTZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZMAOC6QC0WEP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZPI1JA9XKV8P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZQZIAWSFBHLW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZRD4IZU6TBFV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
csr_interaction_matrix

<1608x1879 sparse matrix of type '<class 'numpy.float64'>'
	with 7469 stored elements in Compressed Sparse Row format>

In [29]:
# %%
# Compute the user-user Pearson Correlation Coefficient Matrix
user_pcc_matrix = pearson_correlation(csr_interaction_matrix)
print(f'User PCC Matrix:\n{user_pcc_matrix}\n')



User PCC Matrix:
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]



In [30]:
# Compute the item-item Pearson Correlation Coefficient Matrix
# Assuming the function 'item_pearson_correlation' takes a dense matrix as input.
# If it still takes a csr_matrix, then convert it inside the function.
dense_interaction_matrix = csr_interaction_matrix.toarray()

item_pcc_matrix = item_pearson_correlation(dense_interaction_matrix.T)
print(f'Item PCC Matrix:\n{item_pcc_matrix}\n')


Starting item-item Pearson Correlation computation...
Item-item Pearson Correlation computation completed.
Item PCC Matrix:
[[2.80000000e-01 6.66666701e-02 0.00000000e+00 ... 0.00000000e+00
  5.81818214e-02 0.00000000e+00]
 [6.66666701e-02 1.00000016e-09 0.00000000e+00 ... 0.00000000e+00
  8.00000128e-10 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.40000000e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 6.40000104e-10
  0.00000000e+00 0.00000000e+00]
 [5.81818214e-02 8.00000128e-10 0.00000000e+00 ... 0.00000000e+00
  1.00000016e-09 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 2.00000000e-01]]



# Split Random

In [39]:
%%time

results_df = predict_ratings_with_CF_item_PCC_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='random',
    save_path=CF_OUTPUT_PATH, 
    seed=RANDOM_STATE
)



Processing user A2RYSCZOPEXOCQ (Index: 0)
Constructed Prompt for CF approach:

The prompt:
**********


Here is user rating history:
* Title: Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray, Rating: 1.0 stars
* Title: OZNaturals Anti Aging Retinol Serum -The Most Effective Anti Wrinkle Serum Contains Professional Strength Retinol+ Astaxanthin+ Vitamin E - Get The Dramatic Youthful Results You&rsquo;ve Been Looking For, Rating: 4.0 stars
* Title: Kordon Oasis (Novalek) Bell Bottle 8oz, Rating: 5.0 stars

Here is the rating history from users who are similar to this user:
* Title: Philips Norelco HQ110 Shaving Head Cleaning Spray, Rating: 5.0 stars
* Title: Citre Shine Moisture Burst Shampoo - 16 fl oz, Rating: 4.0 stars
* Title: Yardley By Yardley Of London Unisexs Lay It On Thick Hand &amp; Foot Cream 5.3 Oz, Rating: 5.0 stars
* Title: &quot;BAD ASS&quot; Masculine Pheromone Cologne with the &quot;ADRENALINE&quot; Fragrance From SpellboundRX - The Intelligent Ph

In [4]:
# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

RMSE: 0.7055 (95% CI: (0.6435, 0.7674)) ± 0.0006
MAE: 0.3354 (95% CI: (0.3042, 0.3673)) ± 0.0003

First few actual vs predicted ratings:
Actual: 5.0, Predicted: 4.0000
Actual: 4.0, Predicted: 4.5000
Actual: 5.0, Predicted: 5.0000
Actual: 5.0, Predicted: 4.5000
Actual: 5.0, Predicted: 4.5000


(0.7054917828828716, 0.3353780906148867)

# Split by Timestamp

In [13]:
%%time

results_df = predict_ratings_with_CF_item_PCC_and_save_sequential(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='sequential',
    save_path=CF_OUTPUT_TIMESTAMP_PATH, 
    seed=RANDOM_STATE
)



Processing user A2RYSCZOPEXOCQ (Index: 0)
Constructed Prompt for CF approach:

The prompt:
**********


Here is user rating history:
* Title: Kordon Oasis (Novalek) Bell Bottle 8oz, Rating: 5.0 stars
* Title: Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray, Rating: 1.0 stars
* Title: OZNaturals Anti Aging Retinol Serum -The Most Effective Anti Wrinkle Serum Contains Professional Strength Retinol+ Astaxanthin+ Vitamin E - Get The Dramatic Youthful Results You&rsquo;ve Been Looking For, Rating: 4.0 stars

Here is the rating history from users who are similar to this user:
* Title: Philips Norelco HQ110 Shaving Head Cleaning Spray, Rating: 5.0 stars
* Title: Citre Shine Moisture Burst Shampoo - 16 fl oz, Rating: 4.0 stars
* Title: Yardley By Yardley Of London Unisexs Lay It On Thick Hand &amp; Foot Cream 5.3 Oz, Rating: 5.0 stars
* Title: &quot;BAD ASS&quot; Masculine Pheromone Cologne with the &quot;ADRENALINE&quot; Fragrance From SpellboundRX - The Intelligent Ph

In [15]:

# Read the data
saved_data = pd.read_csv(CF_OUTPUT_TIMESTAMP_PATH)

# Display the original data types
print("Original Data Types:")
print(saved_data.dtypes)
print("\n")

# Attempt to convert ratings to float and add a flag for conversion failure
saved_data['is_rating_float'] = pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()

# Filter rows where ratings are not float
non_float_ratings = saved_data[saved_data['is_rating_float'] == False]

# total number of rows with non-float ratings
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# rerun indices for non-float ratings
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Display rows with non-float ratings
print("Rows with non-float ratings:")
non_float_ratings.head(3)


Original Data Types:
user_id              object
item_id              object
title                object
actual_rating       float64
predicted_rating     object
dtype: object


Total number of rows with non-float ratings: 1
Rerun indices: [1381]
Rows with non-float ratings:


Unnamed: 0,user_id,item_id,title,actual_rating,predicted_rating,is_rating_float
1381,A3EXIZWHSNVNME,B01E7UKR38,essie Gel Couture Nail Polish,5.0,"(None, ""Request timed out: HTTPSConnectionPool...",False


In [23]:

def rerun_failed_CF_item_PCC_predictions_sequential(data, user_pcc_matrix, item_pcc_matrix,
                                         save_path, user_column_name, movie_column_name,
                                         movie_id_column, rating_column_name,
                                         num_ratings_per_user, num_main_user_ratings, num_similar_users,
                                         new_path, rerun_indices, seed=RANDOM_STATE,
                                         system_content=AMAZON_CONTENT_SYSTEM):
    # Load the original predictions
    original_data = pd.read_csv(save_path)
    original_data.columns = ['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating']

    # Re-seed for reproducibility
    random.seed(seed)

    # Map unique users and items to their indices for quick access
    unique_users = data[user_column_name].unique()
    unique_items = data[movie_id_column].unique()
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(unique_items)}

    for index in rerun_indices:
        user_id = original_data.at[index, 'user_id']
        item_id = original_data.at[index, 'item_id']
        user_idx = user_id_to_index.get(user_id)
        item_idx = item_id_to_index.get(item_id)

        if user_idx is None or item_idx is None:
            print(f"User ID: {user_id} or Item ID: {item_id} not found in index. Skipping.")
            continue

        print(f"Rerunning prediction for User ID: {user_id}, Item ID: {item_id} (Index: {index})")

        # Retrieve user's and item's data
        user_data = data[data[user_column_name] == user_id]
        item_data = data[data[movie_id_column] == item_id]

        if item_data.empty:
            print(f"Item data for ID: {item_id} not found. Skipping.")
            continue

        # Sample user's historical ratings
        if len(user_data) < num_main_user_ratings:
            main_user_ratings = user_data
        else:
            main_user_ratings = user_data.sample(n=num_main_user_ratings, random_state=seed)

        # Construct the context from the user's ratings
        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for _, row in main_user_ratings.iterrows()
        ])

        # Identify similar users and items
        similar_users_idx = np.argsort(-user_pcc_matrix[user_idx])[:num_similar_users + 1]
        similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

        similar_items_idx = np.argsort(-item_pcc_matrix[item_idx])[:num_similar_users + 1]
        similar_items_idx = similar_items_idx[similar_items_idx != item_idx][:num_similar_users]

        # Compile ratings from similar users and items
        similar_users_ratings = ""
        for idx in similar_users_idx:
            similar_user_id = unique_users[idx]
            similar_user_data = data[data[user_column_name] == similar_user_id]
            for _, row in similar_user_data.iterrows():
                similar_users_ratings += f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars\n"

        # Predict the rating
        combined_text = f"Title: {item_data.iloc[0][movie_column_name]}"
        prompt = f"Main User Ratings:\n{main_user_ratings_str}\n\nSimilar Users' Ratings:\n{similar_users_ratings}\n\nPredict rating for '{combined_text}':"
        predicted_rating = predict_rating_combined_ChatCompletion(
            combined_text, approach="CF", similar_users_ratings=similar_users_ratings,
            rating_history=main_user_ratings_str, system_content=system_content
        )

        # Update the original data with the new prediction
        original_data.at[index, 'predicted_rating'] = predicted_rating
        print(f"Updated prediction for User ID: {user_id}, Item ID: {item_id}: {predicted_rating}")

    # Save the updated predictions to a new file
    original_data.to_csv(new_path, index=False)
    print(f"Updated predictions saved to {new_path}")

In [27]:
%%time

rerun_failed_CF_item_PCC_predictions_sequential(data, 
                                    user_pcc_matrix=user_pcc_matrix,  
                                    item_pcc_matrix=item_pcc_matrix,
                                    save_path=CF_OUTPUT_TIMESTAMP_PATH,
                                    user_column_name=USER_COLUMN_NAME,
                                    movie_column_name=TITLE_COLUMN_NAME,
                                    movie_id_column=ITEM_ID_COLUMN,
                                    rating_column_name=RATING_COLUMN_NAME, 
                                    num_ratings_per_user=NUM_RATINGS_PER_USER, 
                                    num_similar_users=NUM_SIMILAR_USERS,
                                    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
                                    new_path=CF_RERUN_TIMESTAMP_PATH,
                                    rerun_indices=rerun_indices
                                    )


Rerunning prediction for User ID: A3EXIZWHSNVNME, Item ID: B01E7UKR38 (Index: 1381)
Constructed Prompt for CF approach:

The prompt:
**********


Here is user rating history:
* Title: essie Nail Color Summer 2015 Collection., Rating: 4.0 stars
* Title: essie Gel Couture Nail Polish, Rating: 5.0 stars
* Title: Urban Spa Moisturizing Booties to Keep your Feet Smooth, Hydrated and Moisturized, Rating: 5.0 stars
* Title: Pre de Provence Artisanal French Soap Bar Enriched with Shea Butter, Quad-Milled For A Smooth &amp; Rich Lather (150 grams) - Raspberry, Rating: 5.0 stars

Here is the rating history from users who are similar to this user:
* Title: Yardley By Yardley Of London Unisexs Lay It On Thick Hand &amp; Foot Cream 5.3 Oz, Rating: 5.0 stars
* Title: Fruits &amp; Passion Blue Refreshing Shower Gel - 6.7 fl. oz., Rating: 5.0 stars
* Title: Bath &amp; Body Works Ile De Tahiti Moana Coconut Vanille Moana Body Wash with Tamanoi 8.5 oz, Rating: 5.0 stars
* Title: AXE Body Spray for Men, 

In [28]:
# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_TIMESTAMP_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

RMSE: 0.7736 (95% CI: (0.7145, 0.8338)) ± 0.0006
MAE: 0.3858 (95% CI: (0.3543, 0.4184)) ± 0.0003

First few actual vs predicted ratings:
Actual: 5.0, Predicted: 4.0000
Actual: 4.0, Predicted: 4.0000
Actual: 5.0, Predicted: 5.0000
Actual: 5.0, Predicted: 4.5000
Actual: 5.0, Predicted: 4.5000


(0.773567350330891, 0.38576948507899356)

# CF using using Matrix Factorization


+ Preparing the user-item interaction matrix.
+ Performing matrix factorization to obtain latent factors for users and items.
+ Calculating similarities between users or items using the latent factors.
+ Selecting similar users or items based on these similarities.
+ Using the information from similar users or items to predict ratings for a given user-item pair.
+ Feeding these predictions into the OpenAI ChatCompletion API as part of a collaborative filtering approach.

In [31]:
from scipy.sparse.linalg import svds
import numpy as np

# Matrix Factorization with SVD
def matrix_factorization_SVD(interaction_matrix, k=50):
    """
    Perform matrix factorization using Singular Value Decomposition (SVD) to obtain latent factors for users and items.
    
    Args:
        interaction_matrix (numpy.ndarray): The user-item interaction matrix.
        k (int): The number of latent factors to compute.
    
    Returns:
        user_factors (numpy.ndarray): The matrix of user latent factors.
        item_factors (numpy.ndarray): The matrix of item latent factors.
    """
    # Perform SVD on the interaction matrix
    u, s, vt = svds(interaction_matrix, k=k)
    sigma = np.diag(s)
    user_factors = np.dot(u, sigma)
    item_factors = vt.T
    return user_factors, item_factors

# Similarity Scores for Users
def calculate_MF_similarity_user(user_factors):
    """
    Calculate the cosine similarity between users based on their latent factors from MF.
    
    Args:
        user_factors (numpy.ndarray): The matrix of user latent factors.
    
    Returns:
        user_similarity_matrix (numpy.ndarray): A matrix of user-user similarity scores.
    """
    # Normalize user factors to unit vectors
    norms = np.linalg.norm(user_factors, axis=1, keepdims=True)
    normalized_user_factors = user_factors / norms
    # Calculate cosine similarity
    user_similarity_matrix = np.dot(normalized_user_factors, normalized_user_factors.T)
    return user_similarity_matrix


# Similarity Scores for Users
def calculate_MF_similarity_item(item_factors):
    """
    Calculate the cosine similarity between items based on their latent factors from MF.
    
    Args:
        item_factors (numpy.ndarray): The matrix of item latent factors.
    
    Returns:
        item_similarity_matrix (numpy.ndarray): A matrix of item-item similarity scores.
    """
    # Normalize item factors to unit vectors
    norms = np.linalg.norm(item_factors, axis=1, keepdims=True)
    normalized_item_factors = item_factors / norms
    # Calculate cosine similarity
    item_similarity_matrix = np.dot(normalized_item_factors, normalized_item_factors.T)
    return item_similarity_matrix


In [32]:
import numpy as np
from scipy.sparse.linalg import svds



# # Generating a mock interaction matrix for testing
# np.random.seed(42)
# mock_interaction_matrix = np.random.rand(100, 50)  # 100 users, 50 items

# Test matrix_factorization_SVD
user_factors, item_factors = matrix_factorization_SVD(csr_interaction_matrix, k=20)

# Test calculate_MF_similarity_user
user_similarity_matrix = calculate_MF_similarity_user(user_factors)
item_similarity_matrix = calculate_MF_similarity_item(item_factors)

# Display shapes to verify outputs
# user_factors.shape, item_factors.shape, user_similarity_matrix.shape
(user_factors.shape, item_factors.shape, user_similarity_matrix.shape, item_similarity_matrix.shape)




((1608, 20), (1879, 20), (1608, 1608), (1879, 1879))

In [33]:
%%time

results_df = predict_ratings_with_CF_item_PCC_and_save_sequential(
    data=data, 
    user_pcc_matrix=user_similarity_matrix, 
    item_pcc_matrix=item_similarity_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='sequential',
    save_path=CF_OUTPUT_TIMESTAMP_PATH, 
    seed=RANDOM_STATE
)



Processing user A2RYSCZOPEXOCQ (Index: 0)
Constructed Prompt for CF approach:

The prompt:
**********


Here is user rating history:
* Title: Kordon Oasis (Novalek) Bell Bottle 8oz, Rating: 5.0 stars
* Title: Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray, Rating: 1.0 stars
* Title: OZNaturals Anti Aging Retinol Serum -The Most Effective Anti Wrinkle Serum Contains Professional Strength Retinol+ Astaxanthin+ Vitamin E - Get The Dramatic Youthful Results You&rsquo;ve Been Looking For, Rating: 4.0 stars

Here is the rating history from users who are similar to this user:
* Title: Citre Shine Moisture Burst Shampoo - 16 fl oz, Rating: 4.0 stars
* Title: Citre Shine Moisture Burst Shampoo - 16 fl oz, Rating: 5.0 stars
* Title: Jean Nate Silkening Body Powder, 6 Ounce, Rating: 5.0 stars
* Title: Citre Shine Moisture Burst Shampoo - 16 fl oz, Rating: 5.0 stars


Based on above rating history and similar users' rating history, please predict user's rating for the prod