In [16]:
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openai
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *

# OpenAI API Key
openai.api_key = OPENAI_API_KEY

In [17]:
# source code folder path
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, 'data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'movie-ml-latest-small/merged_data.csv')
print(f'Data path: {data_path}')



# few shot save path
CF_FEW_SHOT_1_OBS_SAVE_PATH = os.path.join(DATA_DIR, 'movie-ml-latest-small/CF_large_1_test_predictions_few_shot.csv')
print(f'Few shot save path: {CF_FEW_SHOT_1_OBS_SAVE_PATH}')

Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/movie-ml-latest-small/merged_data.csv
Few shot save path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/movie-ml-latest-small/CF_large_1_test_predictions_few_shot.csv


In [18]:
# Read the data
data = pd.read_csv(data_path)

# get statistic and first few data of NUM_SAMPLES rows
data.info()
data.head(NUM_EXAMPLES)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  3405 non-null   int64  
 1   imdbId   3405 non-null   int64  
 2   tmdbId   3405 non-null   float64
 3   title    3405 non-null   object 
 4   genres   3405 non-null   object 
 5   userId   3405 non-null   int64  
 6   rating   3405 non-null   float64
 7   tag      3405 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 212.9+ KB


Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId,rating,tag
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,113497,8844.0,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,113497,8844.0,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game


In [19]:
# user-item interaction matrix
def create_interaction_matrix(df, user_col, item_col, rating_col, threshold=0):
    """
    Create the user-item interactions matrix.
    
    :param df: DataFrame containing user-item interactions.
    :param user_col: Name of the user column.
    :param item_col: Name of the item column.
    :param rating_col: Name of the rating column.
    :param threshold: Minimum rating to consider.
    :return: A sparse user-item interaction matrix and mapper dictionaries.
    """
    interactions = df.groupby([user_col, item_col])[rating_col] \
                     .sum().unstack().reset_index(). \
                     fillna(0).set_index(user_col)
    interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    
    user_mapper = dict(zip(np.unique(df[user_col]), list(range(df[user_col].nunique()))))
    item_mapper = dict(zip(np.unique(df[item_col]), list(range(df[item_col].nunique()))))

    user_inv_mapper = dict(zip(list(range(df[user_col].nunique())), np.unique(df[user_col])))
    item_inv_mapper = dict(zip(list(range(df[item_col].nunique())), np.unique(df[item_col])))

    user_index = [user_mapper[i] for i in interactions.index]
    item_index = [item_mapper[i] for i in interactions.columns]

    X = csr_matrix(interactions.values)

    return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

# Function to fit the kNN model
def fit_knn_model(interaction_matrix, n_neighbors=4):
    """
    Fit the k-Nearest Neighbors model.
    
    :param interaction_matrix: User-item interaction matrix.
    :param n_neighbors: Number of neighbors to consider.
    :return: Trained kNN model.
    """
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    model_knn.fit(interaction_matrix)
    return model_knn

# Find similar users and recommend items
def recommend_items(user_id, interaction_matrix, user_mapper, item_inv_mapper, model_knn, n_recommendations=4):
    """
    Recommend items for a given user.
    
    :param user_id: User ID for whom to make recommendations.
    :param interaction_matrix: User-item interaction matrix.
    :param user_mapper: Dictionary mapping user ID to user index.
    :param item_inv_mapper: Dictionary mapping item index to item ID.
    :param model_knn: Trained kNN model.
    :param n_recommendations: Number of recommendations to make.
    :return: List of recommended item IDs.
    """
    user_idx = user_mapper[user_id]
    distances, indices = model_knn.kneighbors(interaction_matrix[user_idx], n_neighbors=n_recommendations+1)
    
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    recommendations = []
    for i, (idx, dist) in enumerate(raw_recommends):
        if idx != user_idx:  # Skip the user itself
            recommendations.append(item_inv_mapper[idx])

    return recommendations


In [20]:
# %%time

# # Example DataFrame columns
# user_col = 'reviewerID'  # Replace with your user column name
# item_col = 'asin'    # Replace with your item column name
# rating_col = 'rating' # Replace with your rating column name

# # Step 1: Create User-Item Interaction Matrix
# interaction_matrix, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_interaction_matrix(data, user_col, item_col, rating_col)

# # Step 2: Fit the kNN Model
# model_knn = fit_knn_model(interaction_matrix)

# # Step 3: Make Recommendations for a specific user
# user_id = 'ANV9L0JU6BNL'  # Replace with a user ID from your dataset
# recommendations = recommend_items(user_id, interaction_matrix, user_mapper, item_inv_mapper, model_knn, n_recommendations=4)

# print("Recommended Items:", recommendations)


In [21]:
# # Convert a small part of the interaction matrix to a dense format for printing
# dense_matrix_sample = pd.DataFrame(interaction_matrix[:10].toarray(), index=user_mapper.keys())
# print("User-Item Interaction Matrix (Sample):")
# dense_matrix_sample

In [22]:
# # Create a dense matrix for similarity computation (use with caution for large datasets)
# dense_interaction_matrix = interaction_matrix.toarray()

# # Compute cosine similarity matrix
# user_similarity_matrix = cosine_similarity_manual(dense_interaction_matrix)

# # Convert to DataFrame for easier plotting
# user_similarity_df = pd.DataFrame(user_similarity_matrix, index=user_mapper.keys(), columns=user_mapper.keys())


In [23]:
# # Customizing the cubehelix palette 
# custom_palette = sns.cubehelix_palette(start=2.8, rot=.1, as_cmap=True)

# # Plotting the heatmap
# plt.figure(figsize=(10, 8))
# heatmap = sns.heatmap(user_similarity_df, cmap=custom_palette)
# plt.title("User Similarity Heatmap", fontweight='bold')
# plt.xlabel("User ID")
# plt.ylabel("User ID")

# # Adding a title to the color bar
# cbar = heatmap.collections[0].colorbar
# cbar.set_label('Similarity Score', rotation=270, labelpad=15)

# # Save the plot to the images folder
# plt.savefig("../images/user_similarity_heatmap.png", bbox_inches='tight')

# # Show the plot
# plt.show()


# Collaborative filtering model to predict ratings - User based CF

In [24]:


def get_all_similar_users_ratings(data, user_mapper, user_inv_mapper, model_knn, interaction_matrix, title_column_name='title'):
    all_similar_users_ratings = {}
    for user_id in data['reviewerID'].unique():
        similar_users_ratings = []
        user_idx = user_mapper.get(user_id)
        if user_idx is None:
            print(f"No index found for user_id: {user_id}")
            continue

        n_samples = interaction_matrix.shape[0]
        n_neighbors = min(10, n_samples)
        distances, indices = model_knn.kneighbors(interaction_matrix[user_idx], n_neighbors=n_neighbors)

        for idx in indices.flatten():
            if idx == user_idx:
                continue
            
            similar_user_id = user_inv_mapper[idx]
            similar_user_data = data[data['reviewerID'] == similar_user_id]

            for _, row in similar_user_data.iterrows():
                if pd.isna(row[title_column_name]) or pd.isna(row['rating']):
                    continue

                similar_users_ratings.append(f"{row[title_column_name]} ({row['rating']} stars)")

        all_similar_users_ratings[user_id] = similar_users_ratings

    return all_similar_users_ratings




def predict_ratings_with_collaborative_filtering_and_save(data, interaction_matrix, user_mapper, item_mapper, user_inv_mapper, model_knn,
                                                          columns_for_training, columns_for_prediction, 
                                                          title_column_name='title', asin_column_name='asin',
                                                          obs_per_user=None, pause_every_n_users=5, sleep_time=5,
                                                          save_path='similar_users_predictions.csv'):
    
    all_similar_users_ratings = get_all_similar_users_ratings(data, user_mapper, user_inv_mapper, model_knn, interaction_matrix, title_column_name)
    predicted_ratings = []
    actual_ratings = []

    for idx, user_id in enumerate(data['reviewerID'].unique()):
        user_data = data[data['reviewerID'] == user_id].sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"\n-------------------\nUser {user_id} ({idx + 1}/{len(data['reviewerID'].unique())}):")
        if len(user_data) < 5:
            print(f"Insufficient data points for user {user_id}, skipping...")
            continue

        for test_idx, test_row in user_data.iterrows():
            train_data = user_data[user_data[asin_column_name] != test_row[asin_column_name]]
            n_train_items = min(4, len(train_data))
            if n_train_items == 0:
                print(f"No training data for user {user_id}, item {test_row[asin_column_name]}, skipping...")
                continue

            similar_users_ratings_str = '\n'.join(all_similar_users_ratings.get(user_id, []))
            print(f"\nSimilar users' ratings:\n{similar_users_ratings_str}")
            if not similar_users_ratings_str:
                print(f"No similar users' ratings found for user {user_id}, skipping prediction.")
                continue

            prediction_data = {col: test_row[col] for col in columns_for_prediction if col != 'rating'}
            print(f"\nPrediction data: {prediction_data}")
            combined_text = generate_combined_text_for_prediction(columns_for_prediction, *prediction_data.values())
            print(f"\nCombined text: {combined_text}")

            try:
                predicted_rating = predict_rating_combined_ChatCompletion(
                    combined_text, 
                    rating_history=similar_users_ratings_str, 
                    approach="CF"
                )
                if predicted_rating is None:
                    print("\nPrediction failed, skipping...")
                    continue
            except Exception as e:
                print(f"\nError during prediction: {e}, skipping...")
                continue

            product_title = test_row.get(title_column_name, "Unknown Title")
            product_details = f"{product_title} (Code: {test_row[asin_column_name]})" if asin_column_name in test_row else product_title

            print(f"Predicted Rating - {predicted_rating} stars for '{product_details}'")
            predicted_ratings.append(predicted_rating)
            actual_ratings.append(test_row['rating'])

            if obs_per_user and len(predicted_ratings) >= obs_per_user:
                break

        if (idx + 1) % pause_every_n_users == 0:
            print(f"\nProcessed {idx + 1} users, pausing for {sleep_time} seconds...")
            time.sleep(sleep_time)

    predicted_ratings_df = pd.DataFrame({'predicted_rating': predicted_ratings, 'actual_rating': actual_ratings})
    predicted_ratings_df.to_csv(save_path, index=False)

    print("Predictions completed and saved.")


In [25]:
user_col = 'userId'   # Column name for users
item_col = 'movieId'         # Column name for items
rating_col = 'rating'     # Column name for ratings

# Create User-Item Interaction Matrix
interaction_matrix, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_interaction_matrix(data, user_col, item_col, rating_col)

# Fit the kNN Model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)
model_knn.fit(interaction_matrix)


# Columns used for training and prediction
columns_for_training = ['title']
columns_for_prediction = ['title']


In [29]:
%%time 

predict_ratings_with_collaborative_filtering_and_save(
    data=data,
    interaction_matrix=interaction_matrix,
    user_mapper=user_mapper,
    item_mapper=item_mapper,
    user_inv_mapper=user_inv_mapper,
    model_knn=model_knn,
    columns_for_training=columns_for_training,
    columns_for_prediction=columns_for_prediction,
    title_column_name='title',  
    # user_column_name='userId',
    asin_column_name='movieId',    
    obs_per_user=1,             # Number of observations per user for testing
    save_path=CF_FEW_SHOT_1_OBS_SAVE_PATH
)

KeyError: 'reviewerID'

In [None]:
# Evaluate Few-shot Model
evaluate_model_predictions_rmse_mae(
    data_path=CF_FEW_SHOT_1_OBS_SAVE_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)