In [10]:
import random
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Add the path to the constants file to the system path
sys.path.append('../../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *
from MF_utils import *

# OpenAI API Key
openai.api_key = OPENAI_API_KEY

# source code folder path
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, '../data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'ml-1m/merged_data.dat')
print(f'Data path: {data_path}')

# output

CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_RERUN_PATH}')


# Constants for column names
USER_COLUMN_NAME = 'UserID'
TITLE_COLUMN_NAME = 'Title'
ITEM_ID_COLUMN = 'MovieID'
RATING_COLUMN_NAME = 'Rating'
TIME_STAMP_COLUMN_NAME = 'Timestamp'

SYSTEM_CONTENT = MOVIELENS_CONTENT_SYSTEM



Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/merged_data.dat
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/large_CF_fewshot_output_path_ratings_per_user.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/rerun_large_CF_fewshot_output_path_ratings_per_user.csv


In [11]:
CF_OUTPUT_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'ml-1m/output/split_timestamp/timestamp_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_OUTPUT_TIMESTAMP_PATH}')

CF_RERUN_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'ml-1m/output/split_timestamp/rerun_timestamp_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_RERUN_TIMESTAMP_PATH}')

Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/split_timestamp/timestamp_large_CF_fewshot_output_path_ratings_per_user.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/split_timestamp/rerun_timestamp_large_CF_fewshot_output_path_ratings_per_user.csv


In [12]:
data = pd.read_csv(data_path)
data.head(3)


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama


In [13]:
num_users = data[USER_COLUMN_NAME].nunique()
num_items = data[ITEM_ID_COLUMN].nunique()

print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")

Number of users: 6040
Number of items: 3706


In [14]:
# Create User-Item Interaction Matrix
interaction_matrix = pd.pivot_table(data, index=USER_COLUMN_NAME, columns=ITEM_ID_COLUMN, values=RATING_COLUMN_NAME).fillna(0)
csr_interaction_matrix = csr_matrix(interaction_matrix.values)

interaction_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
csr_interaction_matrix

<6040x3706 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

# PCC

In [16]:
%%time
# Compute the user-user Pearson Correlation Coefficient Matrix
user_pcc_matrix = pearson_correlation(csr_interaction_matrix)
print(f'User PCC Matrix:\n{user_pcc_matrix}\n')

# Compute the item-item Pearson Correlation Coefficient Matrix
# Assuming the function 'item_pearson_correlation' takes a dense matrix as input.
# If it still takes a csr_matrix, then convert it inside the function.
dense_interaction_matrix = csr_interaction_matrix.toarray()

item_pcc_matrix = item_pearson_correlation(dense_interaction_matrix.T)
print(f'Item PCC Matrix:\n{item_pcc_matrix}\n')


User PCC Matrix:
[[ 1.          0.41666667 -0.33218192 ...  0.          0.05685735
  -0.04351941]
 [ 0.41666667  1.          0.23683386 ... -0.5         0.57207755
  -0.0271435 ]
 [-0.33218192  0.23683386  1.         ...  0.5         0.30927686
  -0.39528471]
 ...
 [ 0.         -0.5         0.5        ...  1.          0.27116307
  -0.39712226]
 [ 0.05685735  0.57207755  0.30927686 ...  0.27116307  1.
   0.24230884]
 [-0.04351941 -0.0271435  -0.39528471 ... -0.39712226  0.24230884
   1.        ]]

Starting item-item Pearson Correlation computation...
Item-item Pearson Correlation computation completed.
Item PCC Matrix:
[[ 1.          0.11379278 -0.06213208 ...  0.          0.03924414
   0.04917266]
 [ 0.11379278  1.          0.09073491 ... -0.05486492  0.16924376
  -0.04166374]
 [-0.06213208  0.09073491  1.         ...  0.09738972  0.09831492
  -0.3249007 ]
 ...
 [ 0.         -0.05486492  0.09738972 ...  0.8         0.12352349
  -0.16379402]
 [ 0.03924414  0.16924376  0.09831492 ...  0.

In [31]:


def predict_ratings_with_CF_item_and_save(data, user_pcc_matrix, item_pcc_matrix,
                                              user_column_name='reviewerID', 
                                              movie_column_name='title', 
                                              movie_id_column='asin',
                                              rating_column_name=RATING_COLUMN_NAME, 
                                              timestamp_column_name='Timestamp', 
                                              num_ratings_per_user=NUM_RATINGS_PER_USER, 
                                              num_similar_users=NUM_SIMILAR_USERS,
                                              num_main_user_ratings=NUM_MAIN_USER_RATINGS,
                                              test_selection_method='random',
                                              save_path='cf_predictions.csv', 
                                              seed=RANDOM_STATE,
                                              system_content=AMAZON_CONTENT_SYSTEM,
                                              test_set_type='popular'):
    results = []

    unique_users = data[user_column_name].unique()
    unique_items = data[movie_id_column].unique()

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(unique_items)}

    random.seed(seed)

    for user_id in unique_users:
        user_idx = user_id_to_index[user_id]
        print(f"Processing user {user_id} (Index: {user_idx})")

        main_user_data = data[data[user_column_name] == user_id]
        
        # Select test set based on the specified method
        if test_selection_method == 'random':
            test_set, remaining_data = select_test_set_for_user(main_user_data, num_tests=num_main_user_ratings, seed=seed)
        elif test_selection_method == 'sequential':
            test_set, remaining_data = sequential_train_test_split(main_user_data, time_column=timestamp_column_name)
        elif test_selection_method == 'popularity':
            # Replace this part in your code:
            test_set, remaining_data = popularity_based_random_split(main_user_data, 
                                                         item_column=movie_id_column, 
                                                         rating_column=rating_column_name,  # Remove review_column
                                                         test_ratio=TEST_RATIO,
                                                         seed=seed,
                                                         test_set_type=test_set_type)


        if test_set.empty:
            print(f"No test data available for user {user_id}.")
            continue

        for random_movie_row in test_set.itertuples():
            random_movie_title = getattr(random_movie_row, movie_column_name)
            random_movie_id = getattr(random_movie_row, movie_id_column)
            random_movie_index = item_id_to_index.get(random_movie_id)
            actual_rating = getattr(random_movie_row, rating_column_name)

            if random_movie_index is None or random_movie_index >= item_pcc_matrix.shape[0]:
                print(f"Item '{random_movie_id}' not found or out of bounds in item_pcc_matrix.")
                continue

            if len(remaining_data) < num_main_user_ratings:
                main_user_ratings = remaining_data
            else:
                main_user_ratings = remaining_data.sample(n=num_main_user_ratings, random_state=seed)
            main_user_ratings_str = '\n'.join([
                f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
                for _, row in main_user_ratings.iterrows()
            ])

            similar_users_idx = np.argsort(-user_pcc_matrix[user_idx])[:num_similar_users + 1]
            similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

            similar_users_ratings = ""
            for idx in similar_users_idx:
                similar_user_id = unique_users[idx]
                similar_user_data = data[data[user_column_name] == similar_user_id]

                # Find top-rated items by this similar user, sorted by item PCC
                similar_items_indices = np.argsort(-item_pcc_matrix[random_movie_index, :])
                # Filter similar_items_indices to stay within bounds of unique_items array
                similar_items_indices = similar_items_indices[similar_items_indices < len(unique_items)]

                # Now, you can safely use these indices to filter similar user data
                top_rated_items = similar_user_data[similar_user_data[movie_id_column].isin(unique_items[similar_items_indices])]


                # Extract top ratings from this user
                top_ratings = top_rated_items.nlargest(num_ratings_per_user, rating_column_name)
                for top_rating_row in top_ratings.itertuples():
                    item_id = getattr(top_rating_row, movie_id_column)
                    rating = getattr(top_rating_row, rating_column_name)
                    item_title = data.loc[data[movie_id_column] == item_id, movie_column_name].iloc[0]  # Get the title of the item
                    similar_users_ratings += f"* Title: {item_title}, Rating: {rating} stars\n"

            combined_text = f"Title: {random_movie_title}"
            prompt = f"Main User Ratings:\n{main_user_ratings_str}\n\nSimilar Users' Ratings:\n{similar_users_ratings}\n\nPredict rating for '{combined_text}':"

            predicted_rating = predict_rating_combined_ChatCompletion(
                combined_text, 
                approach="CF", 
                similar_users_ratings=similar_users_ratings,
                rating_history=main_user_ratings_str,
                system_content=system_content
            )

            results.append([user_id, random_movie_id, random_movie_title, actual_rating, predicted_rating])
            print(f"User {user_id}: Predicted rating for '{random_movie_title}' is {predicted_rating}.")

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")

    return results_df


    



## Split by Popularity popularity_based_random_split

In [32]:
# Sequential Popularity Split
def popularity_based_sequential_split(data, 
                                      item_column='MovieID', 
                                      rating_column='Rating', 
                                      time_column='Timestamp', 
                                      test_ratio=0.2, 
                                      test_set_type='both', 
                                      popularity_percent=0.2):
    """
    Split user data into training and test sets based on item popularity,
    allowing selection of either popular, unpopular, or both types of items in the test set,
    preserving the temporal sequence within the test data.

    Args:
    - data (DataFrame): Dataset containing user data, item identifiers, and timestamps.
    - item_column (str): Name of the column containing the item identifier.
    - review_column (str): Name of the column containing review content to ensure valid entries.
    - rating_column (str): Name of the column containing ratings.
    - time_column (str): Name of the column containing the timestamp.
    - test_ratio (float): Proportion of the dataset to include in the test split.
    - test_set_type (str): Type of test set to return ('popular', 'unpopular', 'both').
    - popularity_percent (float): Proportion of items to consider as popular.

    Returns:
    - DataFrame: Training set.
    - DataFrame: Test set.
    """
    # Preprocessing
    data = data.dropna(subset=[item_column, rating_column, time_column])
    data = data.sort_values(by=time_column)
    print(f"Data after dropping NAs and sorting by {time_column}: {data.shape[0]} rows")

    # Calculate popularity based on the number of ratings
    item_counts = data[item_column].value_counts()
    print(f"Unique items: {len(item_counts)}")

    popularity_score = item_counts.sort_values(ascending=False)
    print(f"Top item counts:\n{popularity_score.head()}")

    # Identify top X% popular items
    top_percent_cutoff = int(len(popularity_score) * popularity_percent)
    popular_items = popularity_score.head(top_percent_cutoff).index
    print(f"Number of popular items: {len(popular_items)}")

    # Sequentially split the entire dataset
    split_index = int(len(data) * (1 - test_ratio))
    train_data, test_data = data[:split_index], data[split_index:]
    print(f"Train data: {train_data.shape[0]} rows, Test data: {test_data.shape[0]} rows")

    # Filter test set based on type
    if test_set_type == 'popular':
        test_set = test_data[test_data[item_column].isin(popular_items)]
        train_data = pd.concat([train_data, test_data[~test_data[item_column].isin(popular_items)]])
    elif test_set_type == 'unpopular':
        test_set = test_data[~test_data[item_column].isin(popular_items)]
        train_data = pd.concat([train_data, test_data[test_data[item_column].isin(popular_items)]])
    else:
        popular_test_set = test_data[test_data[item_column].isin(popular_items)]
        unpopular_test_set = test_data[~test_data[item_column].isin(popular_items)]
        test_set = pd.concat([popular_test_set, unpopular_test_set])

    # Count the number of popular and unpopular items in the test set
    popular_test_set = test_data[test_data[item_column].isin(popular_items)]
    unpopular_test_set = test_data[~test_data[item_column].isin(popular_items)]

    print(f"Popular test set size: {popular_test_set.shape[0]}")
    print(f"Unpopular test set size: {unpopular_test_set.shape[0]}")
    print(f"Test set (after filtering): {test_set.shape[0]} rows")
    
    return train_data, test_set





def popularity_based_random_split(data, 
                                  item_column='MovieID', 
                                  rating_column='Rating', 
                                  test_ratio=TEST_RATIO, 
                                  seed=RANDOM_STATE, 
                                  test_set_type='both',
                                  popularity_percent=0.2):  # Default is 20%
    """
    Randomly split user data into training and test sets based on item popularity,
    allowing selection of either popular, unpopular, or both types of items in the test set.

    Args:
    - data (DataFrame): Dataset containing user data, item identifiers, and timestamps.
    - item_column (str): Name of the column containing the item identifier.
    - rating_column (str): Name of the column containing ratings.
    - test_ratio (float): Proportion of the dataset to include in the test split.
    - seed (int): Seed for the random number generator for reproducibility.
    - test_set_type (str): Type of test set to return ('popular', 'unpopular', 'both').
    - popularity_percent (float): Proportion of items to consider as popular.

    Returns:
    - DataFrame: Training set.
    - DataFrame: Test set.
    """
    # Preprocessing
    data = data.dropna(subset=[item_column, rating_column])
    print(f"Data after dropping NAs: {data.shape[0]} rows")

    # Calculate popularity based on the number of ratings
    item_counts = data[item_column].value_counts()
    print(f"Unique items: {len(item_counts)}")

    popularity_score = item_counts.sort_values(ascending=False)
    print(f"Top item counts:\n{popularity_score.head()}")

    # Identify top X% popular items
    top_percent_cutoff = int(len(popularity_score) * popularity_percent)
    popular_items = popularity_score.head(top_percent_cutoff).index
    print(f"Number of popular items: {len(popular_items)}")

    # Split entire dataset randomly first
    train_data, test_data = train_test_split(data, test_size=test_ratio, random_state=seed)
    print(f"Train data: {train_data.shape[0]} rows, Test data: {test_data.shape[0]} rows")

    # Filter test set based on type
    if test_set_type == 'popular':
        test_set = test_data[test_data[item_column].isin(popular_items)]
        train_data = pd.concat([train_data, test_data[~test_data[item_column].isin(popular_items)]])
    elif test_set_type == 'unpopular':
        test_set = test_data[~test_data[item_column].isin(popular_items)]
        train_data = pd.concat([train_data, test_data[test_data[item_column].isin(popular_items)]])
    else:
        popular_test_set = test_data[test_data[item_column].isin(popular_items)]
        unpopular_test_set = test_data[~test_data[item_column].isin(popular_items)]
        test_set = pd.concat([popular_test_set, unpopular_test_set])

    # Count the number of popular and unpopular items in the test set
    popular_test_set = test_data[test_data[item_column].isin(popular_items)]
    unpopular_test_set = test_data[~test_data[item_column].isin(popular_items)]

    print(f"Popular test set size: {popular_test_set.shape[0]}")
    print(f"Unpopular test set size: {unpopular_test_set.shape[0]}")

    print(f"Test set (after filtering): {test_set.shape[0]} rows")
    return train_data, test_set



In [33]:


CF_OUTPUT_POPULARITY_PATH = os.path.join(DATA_DIR, 'ml-1m/output/split_popular/CF_fewshot_PCC_popularity.csv')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_OUTPUT_POPULARITY_PATH = os.path.join(DATA_DIR, 'ml-1m/output/split_popular/rerun_large_CF_fewshot_PCC_popularity.csv')
print(f'Data path: {CF_RERUN_PATH}')



Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/large_CF_fewshot_output_path_ratings_per_user.csv
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/rerun_large_CF_fewshot_output_path_ratings_per_user.csv


In [34]:
%%time

# Constants for column names
USER_COLUMN_NAME = 'UserID'
TITLE_COLUMN_NAME = 'Title'
ITEM_ID_COLUMN = 'MovieID'
RATING_COLUMN_NAME = 'Rating'
TIME_STAMP_COLUMN_NAME = 'Timestamp'

SYSTEM_CONTENT = MOVIELENS_CONTENT_SYSTEM



CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 20 µs


In [35]:
train_data, test_set = popularity_based_random_split(data, 
                                  item_column=ITEM_ID_COLUMN, 
                                  test_ratio=TEST_RATIO,
                                  seed=RANDOM_STATE,
                                  test_set_type='popular',
                                  )

Data after dropping NAs: 1000209 rows
Unique items: 3706
Top item counts:
2858    3428
260     2991
1196    2990
1210    2883
480     2672
Name: MovieID, dtype: int64
Number of popular items: 741
Train data: 800167 rows, Test data: 200042 rows
Popular test set size: 130499
Unpopular test set size: 69543
Test set (after filtering): 130499 rows


In [36]:
print(f'Test data: {test_set.info()} \n')
test_set

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130499 entries, 413838 to 340403
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   UserID      130499 non-null  int64 
 1   MovieID     130499 non-null  int64 
 2   Rating      130499 non-null  int64 
 3   Timestamp   130499 non-null  int64 
 4   Gender      130499 non-null  object
 5   Age         130499 non-null  int64 
 6   Occupation  130499 non-null  int64 
 7   Zip-code    130499 non-null  object
 8   Title       130499 non-null  object
 9   Genres      130499 non-null  object
dtypes: int64(6), object(4)
memory usage: 11.0+ MB
Test data: None 



Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
413838,423,3623,2,976281626,M,18,4,55455,Mission: Impossible 2 (2000),Action|Thriller
196725,2900,1580,4,971902177,M,18,12,95120,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
612172,5472,1644,2,959900434,M,35,1,27909,I Know What You Did Last Summer (1997),Horror|Mystery|Thriller
226643,1733,2987,4,1037339899,M,18,14,43725,Who Framed Roger Rabbit? (1988),Adventure|Animation|Film-Noir
351092,5753,1573,2,958272495,M,18,17,78758,Face/Off (1997),Action|Sci-Fi|Thriller
...,...,...,...,...,...,...,...,...,...,...
51773,2955,1207,5,971206527,M,35,14,92025,To Kill a Mockingbird (1962),Drama
230110,5568,39,4,959310121,M,18,4,49841,Clueless (1995),Comedy|Romance
439369,5294,2640,3,962762897,M,25,2,60626,Superman (1978),Action|Adventure|Sci-Fi
156612,4411,1090,3,969388689,M,18,4,92122,Platoon (1986),Drama|War


In [37]:
# print train data
print(f'Train data: {train_data.info()} \n')
train_data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 869710 entries, 361778 to 859473
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   UserID      869710 non-null  int64 
 1   MovieID     869710 non-null  int64 
 2   Rating      869710 non-null  int64 
 3   Timestamp   869710 non-null  int64 
 4   Gender      869710 non-null  object
 5   Age         869710 non-null  int64 
 6   Occupation  869710 non-null  int64 
 7   Zip-code    869710 non-null  object
 8   Title       869710 non-null  object
 9   Genres      869710 non-null  object
dtypes: int64(6), object(4)
memory usage: 73.0+ MB
Train data: None 



Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
361778,1894,58,5,974693628,F,35,7,94602,"Postino, Il (The Postman) (1994)",Drama|Romance
233056,714,2770,4,975537089,M,18,4,76013,Bowfinger (1999),Comedy
405069,5273,1358,4,961116187,M,56,20,91030,Sling Blade (1996),Drama|Thriller
419114,5880,1307,5,957546228,M,25,1,08904,When Harry Met Sally... (1989),Comedy|Romance
234096,5754,1759,4,958275902,F,18,1,60640,Four Days in September (1997),Drama
...,...,...,...,...,...,...,...,...,...,...
937273,2978,373,4,970965065,M,35,20,45236,Red Rock West (1992),Thriller
709231,4448,145,3,966959796,M,25,14,06880,Bad Boys (1995),Action
498748,3690,370,3,966309601,M,18,0,63116,Naked Gun 33 1/3: The Final Insult (1994),Comedy
152230,4089,1124,3,965536641,M,25,7,79416,On Golden Pond (1981),Drama


In [46]:
def popularity_based_random_split(data, 
                                  item_column='asin', 
                                  rating_column='rating', 
                                  test_ratio=TEST_RATIO, 
                                  seed=RANDOM_STATE, 
                                  test_set_type='both',
                                  popularity_percent=0.2):
    """
    Randomly split user data into training and test sets based on item popularity,
    allowing selection of either popular, unpopular, or both types of items in the test set.

    Args:
    - data (DataFrame): Dataset containing user data, item identifiers, and timestamps.
    - item_column (str): Name of the column containing the item identifier.
    - rating_column (str): Name of the column containing ratings.
    - test_ratio (float): Proportion of the dataset to include in the test split.
    - seed (int): Seed for the random number generator for reproducibility.
    - test_set_type (str): Type of test set to return ('popular', 'unpopular', 'both').
    - popularity_percent (float): Proportion of items to consider as popular.

    Returns:
    - DataFrame: Training set.
    - DataFrame: Test set.
    """
    # Preprocessing
    data = data.dropna(subset=[item_column, rating_column])
    if data.shape[0] == 0:
        raise ValueError("The dataset is empty after dropping NAs.")
    
    # print(f"Data after dropping NAs: {data.shape[0]} rows")

    # Calculate popularity based on the number of ratings
    item_counts = data[item_column].value_counts()
    # print(f"Unique items: {len(item_counts)}")

    popularity_score = item_counts.sort_values(ascending=False)
    # print(f"Top item counts:\n{popularity_score.head()}")

    # Identify top X% popular items
    top_percent_cutoff = int(len(popularity_score) * popularity_percent)
    popular_items = popularity_score.head(top_percent_cutoff).index
    # print(f"Number of popular items: {len(popular_items)}")

    # Split entire dataset randomly first
    if data.shape[0] > 1:
        train_data, test_data = train_test_split(data, test_size=test_ratio, random_state=seed)
    else:
        train_data = data
        test_data = pd.DataFrame()
    # print(f"Train data: {train_data.shape[0]} rows, Test data: {test_data.shape[0]} rows")

    # Filter test set based on type
    if test_set_type == 'popular':
        test_set = test_data[test_data[item_column].isin(popular_items)]
        train_data = pd.concat([train_data, test_data[~test_data[item_column].isin(popular_items)]])
    elif test_set_type == 'unpopular':
        test_set = test_data[~test_data[item_column].isin(popular_items)]
        train_data = pd.concat([train_data, test_data[test_data[item_column].isin(popular_items)]])
    else:
        popular_test_set = test_data[test_data[item_column].isin(popular_items)]
        unpopular_test_set = test_data[~test_data[item_column].isin(popular_items)]
        test_set = pd.concat([popular_test_set, unpopular_test_set])

    # Count the number of popular and unpopular items in the test set
    popular_test_set = test_data[test_data[item_column].isin(popular_items)]
    unpopular_test_set = test_data[~test_data[item_column].isin(popular_items)]

    # print(f"Popular test set size: {popular_test_set.shape[0]}")
    # print(f"Unpopular test set size: {unpopular_test_set.shape[0]}")
    # print(f"Test set (after filtering): {test_set.shape[0]} rows")
    
    return train_data, test_set


def predict_ratings_with_CF_item_and_save(data, user_pcc_matrix, item_pcc_matrix,
                                          user_column_name='reviewerID', 
                                          movie_column_name='title', 
                                          movie_id_column='asin',
                                          rating_column_name='Rating', 
                                          timestamp_column_name='Timestamp', 
                                          num_ratings_per_user=10, 
                                          num_similar_users=5,
                                          num_main_user_ratings=5,
                                          test_selection_method='random',
                                          save_path='cf_predictions.csv', 
                                          seed=42,
                                          system_content='AMAZON_CONTENT_SYSTEM',
                                          test_set_type='popular'):
    results = []

    unique_users = data[user_column_name].unique()
    unique_items = data[movie_id_column].unique()

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(unique_items)}

    random.seed(seed)

    for user_id in unique_users:
        user_idx = user_id_to_index[user_id]
        print(f"Processing user {user_id} (Index: {user_idx})")

        main_user_data = data[data[user_column_name] == user_id]
        
        if main_user_data.shape[0] < 2:
            print(f"Not enough data for user {user_id} to perform a split.")
            continue

        # Select test set based on the specified method
        if test_selection_method == 'random':
            test_set, remaining_data = select_test_set_for_user(main_user_data, num_tests=num_main_user_ratings, seed=seed)
        elif test_selection_method == 'sequential':
            test_set, remaining_data = sequential_train_test_split(main_user_data, time_column=timestamp_column_name)
        elif test_selection_method == 'popularity':
            try:
                test_set, remaining_data = popularity_based_random_split(main_user_data, 
                                                         item_column=movie_id_column, 
                                                         rating_column=rating_column_name,  
                                                         test_ratio=TEST_RATIO,
                                                         seed=seed,
                                                         test_set_type=test_set_type)
            except ValueError as e:
                print(f"Skipping user {user_id} due to error: {e}")
                continue

        if test_set.empty:
            print(f"No test data available for user {user_id}.")
            continue

        # Ensure there are enough ratings for sampling
        if len(remaining_data) < num_main_user_ratings:
            main_user_ratings = remaining_data
        else:
            main_user_ratings = remaining_data.sample(n=num_main_user_ratings, random_state=seed)

        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for index, row in main_user_ratings.iterrows()
        ])

        if main_user_ratings_str == "":
            print(f"No sufficient data to fetch main user ratings for user {user_id}.")
            continue

        for random_movie_row in test_set.itertuples():
            random_movie_title = getattr(random_movie_row, movie_column_name)
            random_movie_id = getattr(random_movie_row, movie_id_column)
            random_movie_index = item_id_to_index.get(random_movie_id)
            actual_rating = getattr(random_movie_row, rating_column_name)

            if random_movie_index is None or random_movie_index >= item_pcc_matrix.shape[0]:
                print(f"Item '{random_movie_id}' not found or out of bounds in item_pcc_matrix.")
                continue

            similar_users_idx = np.argsort(-user_pcc_matrix[user_idx])[:num_similar_users + 1]
            similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

            similar_users_ratings = ""
            for idx in similar_users_idx:
                similar_user_id = unique_users[idx]
                similar_user_data = data[data[user_column_name] == similar_user_id]

                # Find top-rated items by this similar user, sorted by item PCC
                similar_items_indices = np.argsort(-item_pcc_matrix[random_movie_index, :])
                
                # Ensure that the indices are within the bounds of unique_items
                similar_items_indices = similar_items_indices[similar_items_indices < len(unique_items)]

                # Filter similar user data based on valid indices
                top_rated_items = similar_user_data[similar_user_data[movie_id_column].isin(unique_items[similar_items_indices])]

                # Extract top ratings from this user
                top_ratings = top_rated_items.nlargest(num_ratings_per_user, rating_column_name)
                for top_rating_row in top_ratings.itertuples():
                    item_id = getattr(top_rating_row, movie_id_column)
                    rating = getattr(top_rating_row, rating_column_name)
                    item_title = data.loc[data[movie_id_column] == item_id, movie_column_name].iloc[0]  # Get the title of the item
                    similar_users_ratings += f"* Title: {item_title}, Rating: {rating} stars\n"

            combined_text = f"Title: {random_movie_title}"
            prompt = f"Main User Ratings:\n{main_user_ratings_str}\n\nSimilar Users' Ratings:\n{similar_users_ratings}\n\nPredict rating for '{combined_text}':"

            predicted_rating = predict_rating_combined_ChatCompletion(
                combined_text, 
                approach="CF", 
                similar_users_ratings=similar_users_ratings,
                rating_history=main_user_ratings_str,
                system_content=system_content
            )

            results.append([user_id, random_movie_id, random_movie_title, actual_rating, predicted_rating])
            print(f"User {user_id}: Predicted rating for '{random_movie_title}' is {predicted_rating}.")

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")

    return results_df


In [47]:
%%time

results_df = predict_ratings_with_CF_item_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='popularity',
    save_path=CF_OUTPUT_POPULARITY_PATH, 
    seed=RANDOM_STATE,
    test_set_type='popular'
)



Processing user 1 (Index: 0)
No sufficient data to fetch main user ratings for user 1.
Processing user 2 (Index: 1)
Constructed Prompt for CF approach:

The prompt:
**********


Here is user rating history:
* Title: Courage Under Fire (1996), Rating: 3 stars
* Title: Driving Miss Daisy (1989), Rating: 5 stars
* Title: Double Jeopardy (1999), Rating: 3 stars
* Title: GoodFellas (1990), Rating: 2 stars

Here is the rating history from users who are similar to this user:
* Title: One Flew Over the Cuckoo's Nest (1975), Rating: 5 stars
* Title: Princess Bride, The (1987), Rating: 5 stars
* Title: Erin Brockovich (2000), Rating: 5 stars
* Title: Princess Bride, The (1987), Rating: 5 stars


Based on above rating history and similar users' rating history, please predict user's rating for the product Title: Conspiracy Theory (1997), (1 being lowest and 5 being highest,The output should be like: (x stars, xx%), do not explain the reason.)
**********


System Fingerprint: fp_1867888df6

API cal

In [None]:
# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_POPULARITY_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/split_popular/CF_fewshot_PCC_popularity.csv'

In [None]:
%%time

results_df = predict_ratings_with_CF_item_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='popularity',
    save_path=CF_OUTPUT_POPULARITY_PATH, 
    seed=RANDOM_STATE,
    test_set_type='unpopular'
)

In [None]:
# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_POPULARITY_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

In [None]:
%%time

results_df = predict_ratings_with_CF_item_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='popularity',
    save_path=CF_OUTPUT_POPULARITY_PATH, 
    seed=RANDOM_STATE,
    test_set_type='both'
)



In [None]:
# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_POPULARITY_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)