In [6]:
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openai
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *

# OpenAI API Key
openai.api_key = OPENAI_API_KEY

In [7]:
# source code folder path
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, 'data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'amazon-beauty/large_merged_data.csv')
print(f'Data path: {data_path}')



# few shot save path
CF_FEW_SHOT_1_OBS_SAVE_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/CF_large_1_test_predictions_few_shot.csv')
print(f'Few shot save path: {CF_FEW_SHOT_1_OBS_SAVE_PATH}')

Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv
Few shot save path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/output/CF_large_1_test_predictions_few_shot.csv


In [8]:
# Read the data
data = pd.read_csv(data_path)

# get statistic and first few data of NUM_SAMPLES rows
data.info()
data.head(NUM_EXAMPLES)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9767 entries, 0 to 9766
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rating          9767 non-null   float64
 1   verified        9767 non-null   bool   
 2   reviewTime      9767 non-null   object 
 3   reviewerID      9767 non-null   object 
 4   asin            9767 non-null   object 
 5   reviewerName    9767 non-null   object 
 6   reviewText      9759 non-null   object 
 7   summary         9759 non-null   object 
 8   unixReviewTime  9767 non-null   object 
 9   vote            1487 non-null   object 
 10  style           6768 non-null   object 
 11  category        9767 non-null   object 
 12  tech1           2 non-null      object 
 13  description     9767 non-null   object 
 14  fit             0 non-null      float64
 15  title           9767 non-null   object 
 16  also_buy        9767 non-null   object 
 17  tech2           0 non-null      f

Unnamed: 0,rating,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price
0,1.0,False,2015-08-25,A2RYSCZOPEXOCQ,9790787006,The Cat Next Door,"I use a lot of perfume, I go through a new bot...",This is not going to be my favorite scent.,2015-08-25,,...,,Jenna Jameson,[],298.0,"['B00357FTX8', 'B01NBID7FJ', 'B0017JT658']","{'Shipping Weight:': '12.8 ounces (', 'ASIN: '...",All Beauty,,,13.85
1,5.0,False,2001-06-08,A141OPVE376YFI,B000050B65,Paul G.,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",2001-06-08,81.0,...,,Norelco,[],2.0,"['B01B1O9DOM', 'B00JITDVD2', 'B01KXV16DK', 'B0...",{},All Beauty,,,
2,5.0,False,2001-06-08,A141OPVE376YFI,B000050B65,Paul G.,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",2001-06-08,81.0,...,,Norelco,[],2.0,"['B01B1O9DOM', 'B00JITDVD2', 'B01KXV16DK', 'B0...",{},All Beauty,,,
3,5.0,True,2008-07-25,A1TVTDKNMSQ7XU,B000050B6B,Grandpa Pipes,I've had many Norelco razors in my 50 years of...,Just like new.....,2008-07-25,,...,,Philips Norelco,[],148.0,"['B001IA0PCY', 'B00196W5S4', 'B004URZADG', 'B0...",{'\n Product Dimensions: \n ': '5.1 x 0....,All Beauty,,,64.5
4,5.0,True,2008-07-25,A1TVTDKNMSQ7XU,B000050B6B,Grandpa Pipes,I've had many Norelco razors in my 50 years of...,Just like new.....,2008-07-25,,...,,Philips Norelco,[],148.0,"['B001IA0PCY', 'B00196W5S4', 'B004URZADG', 'B0...",{'\n Product Dimensions: \n ': '5.1 x 0....,All Beauty,,,64.5


In [9]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from utils import *
from constants import *
from tenacity import retry, stop_after_attempt, wait_random_exponential
import random


# Create Interaction Matrix
def create_interaction_matrix(df, user_col, item_col, rating_col, threshold=1):
    interactions = df.groupby([user_col, item_col])[rating_col].sum().unstack().reset_index().fillna(0).set_index(user_col)
    interactions = interactions.applymap(lambda x: 1 if x >= threshold else 0)
    user_mapper = dict(zip(np.unique(df[user_col]), list(range(df[user_col].nunique()))))
    item_mapper = dict(zip(np.unique(df[item_col]), list(range(df[item_col].nunique()))))
    user_inv_mapper = dict(zip(list(range(df[user_col].nunique())), np.unique(df[user_col])))
    item_inv_mapper = dict(zip(list(range(df[item_col].nunique())), np.unique(df[item_col])))
    return csr_matrix(interactions.values), user_mapper, item_mapper, user_inv_mapper, item_inv_mapper



def fit_knn_model(interaction_matrix, n_neighbors=4):
    """
    Fit the k-Nearest Neighbors model.

    Args:
        interaction_matrix (csr_matrix): User-item interaction matrix.
        n_neighbors (int): Number of neighbors to consider.

    Returns:
        NearestNeighbors: Trained kNN model.
    """
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    model_knn.fit(interaction_matrix)
    return model_knn

def recommend_items(user_id, interaction_matrix, user_mapper, item_inv_mapper, model_knn, n_recommendations=4):
    """
    Recommend items for a given user based on kNN model.

    Args:
        user_id (str): User ID for whom to make recommendations.
        interaction_matrix (csr_matrix): User-item interaction matrix.
        user_mapper (dict): Dictionary mapping user ID to user index.
        item_inv_mapper (dict): Dictionary mapping item index to item ID.
        model_knn (NearestNeighbors): Trained kNN model.
        n_recommendations (int): Number of recommendations to make.

    Returns:
        list: List of recommended item IDs.
    """
    user_idx = user_mapper[user_id]
    distances, indices = model_knn.kneighbors(interaction_matrix[user_idx], n_neighbors=n_recommendations+1)
    
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    recommendations = [item_inv_mapper[idx] for idx, dist in raw_recommends if idx != user_idx]

    return recommendations



# source RMIT courses
def pearson_correlation(interaction_matrix):
    """
    Compute the Pearson Correlation Coefficient matrix for the user-item interaction matrix.

    Args:
    interaction_matrix (numpy.ndarray): A 2D array where rows represent users and columns represent items.
                                        The values in the matrix are the ratings given by users to items.

    Returns:
    numpy.ndarray: A 2D array representing the Pearson Correlation Coefficients between each pair of users.
    """

    # Get the number of users
    n_users = interaction_matrix.shape[0]

    # Initialize the Pearson Correlation matrix with zeros
    pearson_corr_matrix = np.zeros((n_users, n_users))

    # Small constant to avoid division by zero
    EPSILON = 1e-9

    # Iterate over each pair of users
    for i in range(n_users):
        for j in range(n_users):
            # Get the rating vectors for the current pair of users
            user_i_vec = interaction_matrix[i, :]
            user_j_vec = interaction_matrix[j, :]

            # Create masks for ratings greater than 0 (indicating rated items)
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # Find indices of corrated items (items rated by both users)
            corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))

            # Skip if no items are corrated
            if len(corrated_index) == 0:
                continue

            # Compute the mean rating for each user over corrated items
            mean_user_i = np.mean(user_i_vec[corrated_index])
            mean_user_j = np.mean(user_j_vec[corrated_index])

            # Compute the deviations from the mean for each user
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            # Calculate the components for Pearson correlation
            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            # Calculate Pearson correlation and handle division by zero
            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

            # Store the similarity in the matrix
            pearson_corr_matrix[i, j] = sim

    return pearson_corr_matrix




def cosine_similarity_manual(interaction_matrix):
    """
    Compute the Cosine Similarity matrix for the user-item interaction matrix.

    Args:
    interaction_matrix (numpy.ndarray): A 2D array where rows represent users and columns represent items.
                                        The values in the matrix are the ratings given by users to items.

    Returns:
    numpy.ndarray: A 2D array representing the Cosine Similarities between each pair of users.
    """

    # Get the number of users
    n_users = interaction_matrix.shape[0]

    # Initialize the Cosine Similarity matrix with zeros
    cosine_sim_matrix = np.zeros((n_users, n_users))

    # Iterate over each pair of users
    for i in range(n_users):
        for j in range(n_users):
            # Get the rating vectors for the current pair of users
            user_i_vec = interaction_matrix[i, :]
            user_j_vec = interaction_matrix[j, :]

            # Compute the dot product between the two vectors
            dot_product = np.dot(user_i_vec, user_j_vec)

            # Compute the magnitude (norm) of each vector
            norm_i = np.linalg.norm(user_i_vec)
            norm_j = np.linalg.norm(user_j_vec)

            # Calculate cosine similarity (handling division by zero)
            if norm_i == 0 or norm_j == 0:
                # If a vector has magnitude 0, the similarity is set to 0
                similarity = 0
            else:
                similarity = dot_product / (norm_i * norm_j)

            # Store the similarity in the matrix
            cosine_sim_matrix[i, j] = similarity

    return cosine_sim_matrix

# Function to get all similar users' ratings
def get_all_similar_users_ratings(data, user_mapper, user_inv_mapper, model_knn, interaction_matrix, title_column_name='title', user_column_name='reviewerID'):
    all_similar_users_ratings = {}
    for user_id in data[user_column_name].unique():
        similar_users_ratings = []
        user_idx = user_mapper.get(user_id)
        if user_idx is None:
            continue
        distances, indices = model_knn.kneighbors(interaction_matrix[user_idx], n_neighbors=20)
        for idx in indices.flatten():
            if idx == user_idx:
                continue
            similar_user_id = user_inv_mapper[idx]
            similar_user_data = data[data[user_column_name] == similar_user_id]
            sampled_ratings = similar_user_data.sample(n=min(5, len(similar_user_data)), random_state=42)
            for _, row in sampled_ratings.iterrows():
                similar_users_ratings.append(f"{row[title_column_name]} ({row['rating']} stars)")
        all_similar_users_ratings[user_id] = similar_users_ratings
    return all_similar_users_ratings

# Fallback strategy
def fallback_prediction_strategy(item_id, data):
    return data[data['asin'] == item_id]['rating'].mean()

# Function to format similar users' ratings
def format_similar_users_ratings(similar_users_ratings):
    if not isinstance(similar_users_ratings, dict):
        return ''
    formatted_ratings = []
    for user_id, ratings in similar_users_ratings.items():
        formatted_ratings.append(f"\n\n{user_id}:")
        for rating in ratings:
            formatted_ratings.append(f"+ {rating}")
    return '\n'.join(formatted_ratings)


# Function to check data sparsity
def check_data_sparsity(df, user_col, item_col):
    total_ratings = len(df)
    num_users = df[user_col].nunique()
    num_items = df[item_col].nunique()
    sparsity = 1 - (total_ratings / (num_users * num_items))
    print(f"Total Ratings: {total_ratings}, Number of Users: {num_users}, Number of Items: {num_items}, Sparsity: {sparsity}")



def predict_ratings_with_collaborative_filtering_and_save(data, interaction_matrix, user_mapper, item_mapper, user_inv_mapper, 
                                                          model_knn, columns_for_training, columns_for_prediction,
                                                          title_column_name='title', 
                                                          user_column_name='reviewerID',
                                                          asin_column_name='asin',
                                                          obs_per_user=None, 
                                                          pause_every_n_users=5, 
                                                          sleep_time=5,
                                                          save_path='collaborative_filtering_predictions.csv'):
    all_similar_users_ratings = get_all_similar_users_ratings(data, user_mapper, user_inv_mapper, model_knn, interaction_matrix, title_column_name)
    results = []

    for idx, user_id in enumerate(data[user_column_name].unique()):
        user_data = data[data[user_column_name] == user_id].sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"\n-------------------\nUser {user_id} ({idx + 1}/{len(data[user_column_name].unique())}):")

        for test_idx, test_row in user_data.iterrows():
            similar_users_ratings_str = format_similar_users_ratings(all_similar_users_ratings.get(user_id))

            prediction_data = {col: test_row[col] for col in columns_for_prediction if col != 'rating'}
            combined_text = generate_combined_text_for_prediction(columns_for_prediction, *prediction_data.values())

            predicted_rating = None
            if similar_users_ratings_str:
                try:
                    predicted_rating, error_message = predict_rating_combined_ChatCompletion(
                        combined_text,
                        rating_history=similar_users_ratings_str,
                        approach="CF"
                    )
                except Exception as e:
                    print(f"\nError during prediction: {e}")
            
            # Fallback strategy
            if predicted_rating is None:
                print(f"Fallback prediction for user {user_id} on item {test_row[asin_column_name]}")
                predicted_rating = fallback_prediction_strategy(test_row[asin_column_name], data)

            item_id = test_row[asin_column_name]
            actual_rating = test_row['rating']
            title = test_row[title_column_name]
            results.append([user_id, item_id, title, actual_rating, predicted_rating])

        if (idx + 1) % pause_every_n_users == 0:
            print(f"\nProcessed {idx + 1} users, pausing for {sleep_time} seconds...")
            time.sleep(sleep_time)

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print("Predictions completed and saved.")


In [10]:
check_data_sparsity(data, 'reviewerID', 'asin')

Total Ratings: 9767, Number of Users: 1608, Number of Items: 1879, Sparsity: 0.9967674268360168


# Collaborative filtering model to predict ratings - User based CF

In [11]:
user_col = 'reviewerID'   # Column name for users
item_col = 'asin'         # Column name for items
rating_col = 'rating'     # Column name for ratings

# Create User-Item Interaction Matrix
interaction_matrix, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_interaction_matrix(data, user_col, item_col, rating_col)

# Fit the kNN Model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)
model_knn.fit(interaction_matrix)



In [12]:
# Function to get top N popular items as a fallback
def get_top_n_popular_items(data, n=10):
    popular_items = data.groupby('asin')['rating'].count().sort_values(ascending=False).head(n).index.tolist()
    return popular_items


# Fallback strategy
top_n_popular_items = get_top_n_popular_items(data)
top_n_popular_items


['B0012Y0ZG2',
 'B00006L9LC',
 'B000URXP6E',
 'B001OHV1H4',
 'B0009RF9DW',
 'B000FI4S1E',
 'B00W259T7G',
 'B0010ZBORW',
 'B00021DJ32',
 'B0013NB7DW']

In [13]:
%%time 

predict_ratings_with_collaborative_filtering_and_save(
    data=data,
    interaction_matrix=interaction_matrix,
    user_mapper=user_mapper,
    item_mapper=item_mapper,
    user_inv_mapper=user_inv_mapper,
    model_knn=model_knn,
    columns_for_training=['title'],
    columns_for_prediction=['title'],
    title_column_name='title',  
    user_column_name='reviewerID',
    asin_column_name='asin',    
    obs_per_user=1,             # Number of observations per user for testing
    save_path=CF_FEW_SHOT_1_OBS_SAVE_PATH
)


-------------------
User A2RYSCZOPEXOCQ (1/1608):
Fallback prediction for user A2RYSCZOPEXOCQ on item B00NT0AR7E
Fallback prediction for user A2RYSCZOPEXOCQ on item B00AL4XD7M
Fallback prediction for user A2RYSCZOPEXOCQ on item B0002564EE
Fallback prediction for user A2RYSCZOPEXOCQ on item 9790787006
Fallback prediction for user A2RYSCZOPEXOCQ on item B0002564EE

-------------------
User A141OPVE376YFI (2/1608):
Fallback prediction for user A141OPVE376YFI on item B000050B65
Fallback prediction for user A141OPVE376YFI on item B000050B65
Fallback prediction for user A141OPVE376YFI on item B0002MQ9GK
Fallback prediction for user A141OPVE376YFI on item B000050FDY
Fallback prediction for user A141OPVE376YFI on item B0002MQ9GK
Fallback prediction for user A141OPVE376YFI on item B000050FDY
Fallback prediction for user A141OPVE376YFI on item B000EG8HLE

-------------------
User A1TVTDKNMSQ7XU (3/1608):
Fallback prediction for user A1TVTDKNMSQ7XU on item B000050B6B
Fallback prediction for user

KeyboardInterrupt: 

In [None]:
# Evaluate Few-shot Model
evaluate_model_predictions_rmse_mae(
    data_path=CF_FEW_SHOT_1_OBS_SAVE_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)