In [8]:
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.metrics.pairwise import cosine_similarity

# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *

# OpenAI API Key
openai.api_key = OPENAI_API_KEY

In [9]:
# source code folder path
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, 'data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'amazon-beauty/merged_data.csv')
print(f'Data path: {data_path}')

# zero shot save path
ZERO_SHOT_SAVE_PATH = os.path.join(DATA_DIR, 'amazon-beauty/CF_predictions_zero_shot.csv')
print(f'Zero shot save path: {ZERO_SHOT_SAVE_PATH}')

# few shot save path
FEW_SHOT_1_OBS_SAVE_PATH = os.path.join(DATA_DIR, 'amazon-beauty/CF_1_test_predictions_few_shot.csv')
print(f'Few shot save path: {FEW_SHOT_1_OBS_SAVE_PATH}')

Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/merged_data.csv
Zero shot save path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/CF_predictions_zero_shot.csv
Few shot save path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/CF_1_test_predictions_few_shot.csv


In [10]:
# Read the data
data = pd.read_csv(data_path)

# get statistic and first few data of NUM_SAMPLES rows
data.info()
data.head(NUM_EXAMPLES)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rating          34 non-null     float64
 1   verified        34 non-null     bool   
 2   reviewTime      34 non-null     object 
 3   reviewerID      34 non-null     object 
 4   asin            34 non-null     object 
 5   reviewerName    34 non-null     object 
 6   reviewText      34 non-null     object 
 7   summary         34 non-null     object 
 8   unixReviewTime  34 non-null     object 
 9   vote            3 non-null      float64
 10  style           17 non-null     object 
 11  category        34 non-null     object 
 12  tech1           0 non-null      float64
 13  description     34 non-null     object 
 14  fit             0 non-null      float64
 15  title           34 non-null     object 
 16  also_buy        34 non-null     object 
 17  tech2           0 non-null      float

Unnamed: 0,rating,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price
0,5.0,True,2015-09-17,ANV9L0JU6BNL,B000052YAN,Dennis,best floss i've used. does not break as easily...,best floss i've used,2015-09-17,,...,,Reach,[],120.0,"['B01I9TJRN4', 'B003XDVERE', 'B0722XHMGZ', 'B0...",{'\n Product Dimensions: \n ': '1 x 1 x ...,All Beauty,,,5.17
1,5.0,True,2015-09-17,ANV9L0JU6BNL,B000052YAN,Dennis,best floss i've used. does not break as easily...,best floss i've used,2015-09-17,,...,,Reach,[],120.0,"['B01I9TJRN4', 'B003XDVERE', 'B0722XHMGZ', 'B0...",{'\n Product Dimensions: \n ': '1 x 1 x ...,All Beauty,,,5.17
2,2.0,True,2018-03-27,A2TU781PWGS09X,B00006L9LC,Amazon Customer,Doesnt smell,Two Stars,2018-03-27,,...,,Citre Shine,[],1.0,[],"{'ASIN: ': 'B00006L9LC', 'UPC:': '795827187965...",All Beauty,,,23.0
3,2.0,True,2018-03-27,A2TU781PWGS09X,B00006L9LC,Amazon Customer,Doesnt smell,Two Stars,2018-03-27,,...,,Citre Shine,[],1.0,[],"{'ASIN: ': 'B00006L9LC', 'UPC:': '795827187965...",All Beauty,,,23.0
4,5.0,True,2017-02-02,A3A8F2URN7MEPR,B00006L9LC,Sheila T.,My favorite powder!,Five Stars,2017-02-02,,...,,Citre Shine,[],1.0,[],"{'ASIN: ': 'B00006L9LC', 'UPC:': '795827187965...",All Beauty,,,23.0


In [17]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Function to create a user-item interaction matrix
def create_interaction_matrix(df, user_col, item_col, rating_col, threshold=0):
    """
    Create the user-item interactions matrix.
    
    :param df: DataFrame containing user-item interactions.
    :param user_col: Name of the user column.
    :param item_col: Name of the item column.
    :param rating_col: Name of the rating column.
    :param threshold: Minimum rating to consider.
    :return: A sparse user-item interaction matrix and mapper dictionaries.
    """
    interactions = df.groupby([user_col, item_col])[rating_col] \
                     .sum().unstack().reset_index(). \
                     fillna(0).set_index(user_col)
    interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    
    user_mapper = dict(zip(np.unique(df[user_col]), list(range(df[user_col].nunique()))))
    item_mapper = dict(zip(np.unique(df[item_col]), list(range(df[item_col].nunique()))))

    user_inv_mapper = dict(zip(list(range(df[user_col].nunique())), np.unique(df[user_col])))
    item_inv_mapper = dict(zip(list(range(df[item_col].nunique())), np.unique(df[item_col])))

    user_index = [user_mapper[i] for i in interactions.index]
    item_index = [item_mapper[i] for i in interactions.columns]

    X = csr_matrix(interactions.values)

    return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

# Function to fit the kNN model
def fit_knn_model(interaction_matrix, n_neighbors=4):
    """
    Fit the k-Nearest Neighbors model.
    
    :param interaction_matrix: User-item interaction matrix.
    :param n_neighbors: Number of neighbors to consider.
    :return: Trained kNN model.
    """
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    model_knn.fit(interaction_matrix)
    return model_knn

# Function to find similar users and recommend items
def recommend_items(user_id, interaction_matrix, user_mapper, item_inv_mapper, model_knn, n_recommendations=4):
    """
    Recommend items for a given user.
    
    :param user_id: User ID for whom to make recommendations.
    :param interaction_matrix: User-item interaction matrix.
    :param user_mapper: Dictionary mapping user ID to user index.
    :param item_inv_mapper: Dictionary mapping item index to item ID.
    :param model_knn: Trained kNN model.
    :param n_recommendations: Number of recommendations to make.
    :return: List of recommended item IDs.
    """
    user_idx = user_mapper[user_id]
    distances, indices = model_knn.kneighbors(interaction_matrix[user_idx], n_neighbors=n_recommendations+1)
    
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    recommendations = []
    for i, (idx, dist) in enumerate(raw_recommends):
        if idx != user_idx:  # Skip the user itself
            recommendations.append(item_inv_mapper[idx])

    return recommendations


In [20]:
%%time

# Example DataFrame columns
user_col = 'reviewerID'  # Replace with your user column name
item_col = 'asin'    # Replace with your item column name
rating_col = 'rating' # Replace with your rating column name

# Step 1: Create User-Item Interaction Matrix
interaction_matrix, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_interaction_matrix(data, user_col, item_col, rating_col)

# Step 2: Fit the kNN Model
model_knn = fit_knn_model(interaction_matrix)

# Step 3: Make Recommendations for a specific user
user_id = 'ANV9L0JU6BNL'  # Replace with a user ID from your dataset
recommendations = recommend_items(user_id, interaction_matrix, user_mapper, item_inv_mapper, model_knn, n_recommendations=4)

print("Recommended Items:", recommendations)


Recommended Items: ['B0012Y0ZG2', 'B000URXP6E', 'B00006L9LC', 'B000052YAN']
CPU times: user 12.2 ms, sys: 6.09 ms, total: 18.3 ms
Wall time: 15.7 ms


# Collaborative filtering model to predict ratings instead of providing item recommendations

+ Calculate Similarity Scores: Instead of recommending items based on the nearest neighbors, use the similarity scores between users to predict the rating a user might give to an item.

+ Predict Ratings: For a given user and item, find users similar to the given user who have rated that item. Use their ratings, weighted by similarity, to predict the rating for the target user.

+ Integrate OpenAI API: Utilize embeddings from the OpenAI API to calculate similarity scores between users based on their reviews or other textual data.

In [22]:
# Calculate User Similarities
def get_user_embedding(user_id, df, model=EMBEDDING_MODEL):
    """
    Get the embedding for a given user based on their reviews or other textual data.

    :param user_id: ID of the user.
    :param df: DataFrame containing user data.
    :param model: Embedding model name.
    :return: Embedding vector for the user.
    """
    user_reviews = df[df['reviewerID'] == user_id]['reviewText'].str.cat(sep=' ')
    user_embedding = get_embedding(user_reviews, model)
    return user_embedding

# Predict Ratings
def predict_rating(user_id, item_id, df, interaction_matrix, user_mapper, item_mapper, user_inv_mapper, model_knn, model=EMBEDDING_MODEL):
    """
    Predict the rating a user would give to an item.

    :param user_id: ID of the user.
    :param item_id: ID of the item.
    :param df: DataFrame containing user-item interaction data.
    :param interaction_matrix: User-item interaction matrix.
    :param user_mapper: Dictionary mapping user ID to user index.
    :param item_mapper: Dictionary mapping item ID to item index.
    :param user_inv_mapper: Dictionary mapping user index to user ID.
    :param model_knn: Trained kNN model.
    :param model: Embedding model name.
    :return: Predicted rating.
    """
    user_idx = user_mapper[user_id]
    _, indices = model_knn.kneighbors(interaction_matrix[user_idx], n_neighbors=10)

    # Get user embedding
    user_embedding = get_user_embedding(user_id, df, model)

    total_similarity = 0
    weighted_ratings = 0

    for idx in indices.flatten():
        similar_user_id = user_inv_mapper[idx]
        if similar_user_id != user_id and item_id in df[df['reviewerID'] == similar_user_id]['asin'].values:
            similar_user_rating = df[(df['reviewerID'] == similar_user_id) & (df['asin'] == item_id)]['rating'].values[0]
            similar_user_embedding = get_user_embedding(similar_user_id, df, model)
            
            similarity = cosine_similarity([user_embedding], [similar_user_embedding])[0][0]
            total_similarity += similarity
            weighted_ratings += similarity * similar_user_rating

    if total_similarity > 0:
        return weighted_ratings / total_similarity
    else:
        return np.mean(df[df['asin'] == item_id]['rating'])  # Fallback to average rating for the item


In [24]:
user_id = 'ANV9L0JU6BNL'  
item_id = 'B000052YAN'

predicted_rating = predict_rating(user_id, item_id, data, interaction_matrix, user_mapper, item_mapper, user_inv_mapper, model_knn)
print(f"Predicted rating for user {user_id} and item {item_id}: {predicted_rating}")


ValueError: Expected n_neighbors <= n_samples,  but n_samples = 5, n_neighbors = 10