In [1]:
import random
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openai
# Add the path to the constants file to the system path
sys.path.append('../../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *

# OpenAI API Key
# OPENAI_API_KEY is not defined in this notebook; it arrives through one of the
# star imports above (presumably `constants` — confirm).
openai.api_key = OPENAI_API_KEY

# source code folder path
# get_rec_sys_directory() also comes from a star import (presumably `path_utils` — confirm).
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path (sibling "data" folder, one level above rec_sys_dir)
DATA_DIR = os.path.join(rec_sys_dir, '../data')
print(f"Data directory: {DATA_DIR}")

# data path: input ratings file read by the loading cell
data_path = os.path.join(DATA_DIR, 'ml-1m/merged_data.dat')
print(f'Data path: {data_path}')

# output
# Files the prediction cells write to. Note that the two prints below label
# these output locations as "Data path" as well.

CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user.dat')
print(f'Data path: {CF_RERUN_PATH}')


# Constants for column names
# These match the column headers of the loaded ratings frame.
USER_COLUMN_NAME = 'UserID'
TITLE_COLUMN_NAME = 'Title'
ITEM_ID_COLUMN = 'MovieID'
RATING_COLUMN_NAME = 'Rating'

# Presumably the chat "system" prompt text for the MovieLens dataset; defined in a
# star-imported module — confirm.
SYSTEM_CONTENT = MOVIELENS_CONTENT_SYSTEM


Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/merged_data.dat
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/CF_fewshot_output_path_ratings_per_user.dat
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/notebook/../data/ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user.dat


In [2]:
# Load the merged ratings table from data_path and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the file
# was saved together with its index; reading with index_col=0 would drop it — confirm.
data = pd.read_csv(data_path)
data.head(3)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama


In [3]:
# Create User-Item Interaction Matrix
# One row per UserID and one column per MovieID, holding the Rating value;
# user/movie pairs with no rating become 0 through fillna(0).
interaction_matrix = pd.pivot_table(data, index=USER_COLUMN_NAME, columns=ITEM_ID_COLUMN, values=RATING_COLUMN_NAME).fillna(0)
# Compressed-sparse-row copy of the same values (row/column labels are not carried over).
csr_interaction_matrix = csr_matrix(interaction_matrix.values)

# Display the pivoted frame.
interaction_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Display the sparse matrix summary (shape, dtype and number of stored entries).
csr_interaction_matrix

<6040x3706 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

In [5]:
import numpy as np

# Small constant added to denominators to avoid division by zero.
EPSILON_CONSTANT = 1e-9

def pearson_correlation(interaction_matrix, epsilon_constant=EPSILON_CONSTANT):
    """
    Compute the user-user Pearson Correlation Coefficient matrix for the user-item interaction matrix.

    For every pair of users the correlation is taken over the items both users
    have rated (rating > 0). Deviations are measured from each user's mean over
    all of that user's own ratings, not only the co-rated ones. Pairs without
    any co-rated item get a similarity of 0.

    The pairwise formula is evaluated with matrix products instead of a Python
    loop over every pair of users, so the result equals the loop version up to
    floating-point rounding. Several dense (n_users, n_users) and
    (n_users, n_items) float arrays are held in memory at once.

    Args:
        interaction_matrix (array-like): A 2D array where rows represent users and columns represent items.
                                         The values are the ratings given by users to items;
                                         0 means "not rated". NaN values are not handled.
        epsilon_constant (float): Small value added to denominators to avoid division by zero.

    Returns:
        numpy.ndarray: A 2D (n_users, n_users) array of Pearson Correlation Coefficients
        between each pair of users.

    Raises:
        ValueError: If interaction_matrix is not two-dimensional.
    """
    ratings = np.asarray(interaction_matrix, dtype=float)
    if ratings.ndim != 2:
        raise ValueError("interaction_matrix must be a 2D array of shape (n_users, n_items).")

    print("Starting user-user Pearson Correlation computation...")

    # 1.0 where the user rated the item, 0.0 elsewhere.
    rated = (ratings > 0).astype(float)

    # Per-user mean rating: sum of ratings divided by the (clipped) rating count.
    rating_sums = ratings.sum(axis=1)
    rating_counts = np.clip(ratings, 0, 1).sum(axis=1) + epsilon_constant
    user_means = np.divide(rating_sums, rating_counts,
                           out=np.zeros_like(rating_sums), where=rating_counts != 0)

    # Mean-centred ratings, forced to 0 on unrated items so they drop out of every sum.
    centered = (ratings - user_means[:, np.newaxis]) * rated

    # numerator[i, j] = sum over co-rated items of dev_i * dev_j
    numerator = centered @ centered.T

    # norms[i, j] = sqrt(sum over items co-rated by i and j of dev_i ** 2);
    # the matching term for user j is norms[j, i], i.e. norms.T[i, j].
    norms = np.sqrt(np.square(centered) @ rated.T)
    denominator = norms * norms.T + epsilon_constant

    # Pairs with no co-rated items have numerator 0 and therefore similarity 0.
    np_user_pearson_corr = np.divide(numerator, denominator,
                                     out=np.zeros_like(numerator), where=denominator != 0)

    print("User-user Pearson Correlation computation completed.")
    return np_user_pearson_corr


In [6]:
import numpy as np

# Number of co-ratings at which a pair of items gets full weight (significance weighting).
DELTA = 25
# Small constant added to denominators to avoid division by zero.
EPSILON = 1e-9

def item_pearson_correlation(interaction_matrix, delta=DELTA, epsilon=EPSILON):
    """
    Compute the Pearson Correlation Coefficient matrix for the item-item interaction matrix with significance weighting.

    For every pair of items the correlation is taken over the users who rated
    both items (rating > 0). Deviations are measured from each item's mean over
    all of its own ratings. The raw correlation is then scaled by
    min(number of co-rating users, delta) / delta, so pairs supported by few
    users are shrunk towards 0. Pairs without any co-rating user get 0.

    The pairwise formula is evaluated with matrix products instead of a Python
    loop over every pair of items, so the result equals the loop version up to
    floating-point rounding.

    Args:
        interaction_matrix (array-like): A 2D array where rows represent users and columns represent items.
                                         The values are the ratings given by users to items;
                                         0 means "not rated". NaN values are not handled.
                                         Pass it users-by-items: the transpose is taken here.
        delta (float): Co-rating count at which the significance weight reaches 1.
        epsilon (float): Small value added to denominators to avoid division by zero.

    Returns:
        numpy.ndarray: A 2D (n_items, n_items) array of weighted Pearson Correlation
        Coefficients between each pair of items.

    Raises:
        ValueError: If interaction_matrix is not two-dimensional or delta is not positive.
    """
    ratings = np.asarray(interaction_matrix, dtype=float)
    if ratings.ndim != 2:
        raise ValueError("interaction_matrix must be a 2D array of shape (n_users, n_items).")
    if delta <= 0:
        raise ValueError("delta must be positive.")

    print("Starting item-item Pearson Correlation computation...")

    # Work items-by-users so each row is one item's rating vector.
    item_ratings = ratings.T

    # 1.0 where the item was rated by the user, 0.0 elsewhere.
    rated = (item_ratings > 0).astype(float)

    # Per-item mean rating: sum of ratings divided by the (clipped) rating count.
    rating_sums = item_ratings.sum(axis=1)
    rating_counts = np.clip(item_ratings, 0, 1).sum(axis=1) + epsilon
    item_means = np.divide(rating_sums, rating_counts,
                           out=np.zeros_like(rating_sums), where=rating_counts != 0)

    # Mean-centred ratings, forced to 0 where the user did not rate the item.
    centered = (item_ratings - item_means[:, np.newaxis]) * rated

    # numerator[i, j] = sum over co-rating users of dev_i * dev_j
    numerator = centered @ centered.T

    # norms[i, j] = sqrt(sum over users who rated both i and j of dev_i ** 2);
    # the matching term for item j is norms[j, i], i.e. norms.T[i, j].
    norms = np.sqrt(np.square(centered) @ rated.T)
    denominator = norms * norms.T + epsilon
    sim = np.divide(numerator, denominator,
                    out=np.zeros_like(numerator), where=denominator != 0)

    # Significance weighting: co_counts[i, j] is the number of users who rated both items.
    co_counts = rated @ rated.T
    np_item_pearson_corr = (np.minimum(co_counts, delta) / delta) * sim

    print("Item-item Pearson Correlation computation completed.")
    return np_item_pearson_corr


In [7]:
# %%
# Compute the user-user Pearson Correlation Coefficient Matrix
# pearson_correlation is documented as taking a dense 2D array with users as rows,
# so the CSR matrix is converted to a dense ndarray first.
dense_interaction_matrix = csr_interaction_matrix.toarray()
user_pcc_matrix = pearson_correlation(dense_interaction_matrix)
print(f'User PCC Matrix:\n{user_pcc_matrix}\n')



Starting user-user Pearson Correlation computation...


In [None]:
# Compute the item-item Pearson Correlation Coefficient Matrix
# item_pearson_correlation expects the users-by-items matrix and takes the
# transpose itself, so the dense matrix is passed as-is. Transposing it here as
# well would yield a users-by-users matrix instead of an items-by-items one.
item_pcc_matrix = item_pearson_correlation(dense_interaction_matrix)
print(f'Item PCC Matrix:\n{item_pcc_matrix}\n')


In [None]:
def predict_ratings_with_CF_item_PCC_and_save(data, user_pcc_matrix, item_pcc_matrix,
                                              user_column_name='reviewerID', 
                                              movie_column_name='title', 
                                              movie_id_column='asin',
                                              rating_column_name='rating', 
                                              num_ratings_per_user=NUM_RATINGS_PER_USER, 
                                              num_similar_users=NUM_SIMILAR_USERS,
                                              num_main_user_ratings=NUM_MAIN_USER_RATINGS,
                                              similarity_threshold=0,  # Lowered threshold
                                              save_path='cf_predictions.csv', 
                                              seed=RANDOM_STATE,
                                              system_content=AMAZON_CONTENT_SYSTEM):
    """
    Predict held-out ratings with a collaborative-filtering prompt and save the results as CSV.

    For every user a test set is drawn with select_test_set_for_user. For each
    test item the prompt context is built from a sample of the user's remaining
    ratings plus the highest ratings of the most similar users (by
    user_pcc_matrix), and predict_rating_combined_ChatCompletion is asked for
    the rating.

    Row/column i of user_pcc_matrix and item_pcc_matrix is assumed to refer to
    the i-th user/item ID in sorted order, which is how a pandas pivot_table
    orders its labels. The ID-to-index maps are therefore built from sorted
    IDs rather than from order of appearance in `data`.

    Args:
        data (pd.DataFrame): Ratings table with the columns named below.
        user_pcc_matrix (numpy.ndarray): User-user similarity matrix.
        item_pcc_matrix (numpy.ndarray): Item-item similarity matrix.
        user_column_name (str): Column holding user IDs.
        movie_column_name (str): Column holding item titles.
        movie_id_column (str): Column holding item IDs.
        rating_column_name (str): Column holding ratings.
        num_ratings_per_user (int): Ratings taken from each similar user.
        num_similar_users (int): Number of similar users to include.
        num_main_user_ratings (int): Ratings sampled from the main user's history.
        similarity_threshold: Currently unused.
        save_path (str): Where the results CSV is written.
        seed (int): Seed for `random` and for the pandas sampling.
        system_content (str): System prompt forwarded to the chat completion call.

    Returns:
        pd.DataFrame: Columns user_id, item_id, title, actual_rating, predicted_rating.
    """
    results = []

    # Users are processed in order of appearance, but matrix positions follow sorted IDs.
    unique_users = data[user_column_name].unique()
    users_by_index = np.sort(unique_users)
    items_by_index = np.sort(data[movie_id_column].unique())

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(users_by_index)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(items_by_index)}

    # First title seen for each item ID, built once instead of scanning `data` per lookup.
    first_rows = data.drop_duplicates(subset=movie_id_column)
    title_by_item_id = dict(zip(first_rows[movie_id_column], first_rows[movie_column_name]))

    random.seed(seed)

    for user_id in unique_users:
        user_idx = user_id_to_index[user_id]
        print(f"Processing user {user_id} (Index: {user_idx})")

        main_user_data = data[data[user_column_name] == user_id]
        test_set, remaining_data = select_test_set_for_user(main_user_data, num_tests=TEST_OBSERVATION_PER_USER, seed=seed)
        if test_set.empty:
            print(f"No test data available for user {user_id}.")
            continue

        # The main user's history and nearest neighbours do not depend on the
        # test item, so they are computed once per user.
        if len(remaining_data) < num_main_user_ratings:
            main_user_ratings = remaining_data
        else:
            main_user_ratings = remaining_data.sample(n=num_main_user_ratings, random_state=seed)
        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for _, row in main_user_ratings.iterrows()
        ])

        # Take one extra candidate so the user can be removed from their own neighbour list.
        similar_users_idx = np.argsort(-user_pcc_matrix[user_idx])[:num_similar_users + 1]
        similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

        for random_movie_row in test_set.itertuples():
            random_movie_title = getattr(random_movie_row, movie_column_name)
            random_movie_id = getattr(random_movie_row, movie_id_column)
            random_movie_index = item_id_to_index.get(random_movie_id)
            actual_rating = getattr(random_movie_row, rating_column_name)

            if random_movie_index is None or random_movie_index >= item_pcc_matrix.shape[0]:
                print(f"Item '{random_movie_id}' not found or out of bounds in item_pcc_matrix.")
                continue

            # Item IDs ordered by decreasing item-PCC similarity to the test item.
            # NOTE(review): isin() below ignores this ordering and, when the matrix
            # spans every item, keeps all of the similar user's ratings; the
            # selection is then decided by nlargest on the rating alone. Confirm
            # whether the item-PCC ranking was meant to drive the selection.
            similar_items_indices = np.argsort(-item_pcc_matrix[random_movie_index, :])
            candidate_item_ids = items_by_index[similar_items_indices]

            similar_users_ratings = ""
            for idx in similar_users_idx:
                similar_user_id = users_by_index[idx]
                similar_user_data = data[data[user_column_name] == similar_user_id]

                top_rated_items = similar_user_data[similar_user_data[movie_id_column].isin(candidate_item_ids)]

                # Extract top ratings from this user
                top_ratings = top_rated_items.nlargest(num_ratings_per_user, rating_column_name)
                for top_rating_row in top_ratings.itertuples():
                    item_id = getattr(top_rating_row, movie_id_column)
                    rating = getattr(top_rating_row, rating_column_name)
                    item_title = title_by_item_id[item_id]
                    similar_users_ratings += f"* Title: {item_title}, Rating: {rating} stars\n"

            combined_text = f"Title: {random_movie_title}"

            predicted_rating = predict_rating_combined_ChatCompletion(
                combined_text, 
                approach="CF", 
                similar_users_ratings=similar_users_ratings,
                rating_history=main_user_ratings_str,
                system_content=system_content
            )

            results.append([user_id, random_movie_id, random_movie_title, actual_rating, predicted_rating])
            print(f"User {user_id}: Predicted rating for '{random_movie_title}' is {predicted_rating}.")

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")

    return results_df


In [None]:
%%time

# Run the CF (item-PCC) prediction pipeline on the MovieLens data and save to CF_OUTPUT_PATH.
results_df = predict_ratings_with_CF_item_PCC_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    save_path=CF_OUTPUT_PATH, 
    seed=RANDOM_STATE,
    # Without this argument the function falls back to its Amazon default system prompt.
    system_content=SYSTEM_CONTENT
)



In [None]:
# Evaluate updated CF model predictions
# Points the evaluator at the predictions saved under CF_OUTPUT_PATH and names the
# columns holding the true and predicted ratings. evaluate_model_predictions_rmse_mae
# and NUM_EXAMPLES come from star imports; per its name the helper reports RMSE and
# MAE — confirm against its definition.
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

# Semantic Similarity

In [None]:
@retry_decorator
def predict_rating_combined_ChatCompletion(combined_text, 
                                           model=GPT_MODEL_NAME, 
                                           temperature=TEMPERATURE, 
                                           approach="zero-shot", 
                                           rating_history=None, 
                                           similar_users_ratings=None, 
                                           seed=RANDOM_STATE, 
                                           system_content=AMAZON_CONTENT_SYSTEM):
    """
    Build a rating-prediction prompt for the chosen approach and query the chat completion API.

    NOTE(review): the same function is defined again in a later cell of this
    notebook; that later definition wins once its cell has run.

    Args:
        combined_text (str): Description of the item to rate (e.g. "Title: ...").
        model (str): Chat model name passed to the API.
        temperature (float): Sampling temperature passed to the API.
        approach (str): "few-shot", "CF", or anything else for the zero-shot prompt.
        rating_history (str | None): The user's formatted rating history; required
            for the "few-shot" and "CF" approaches.
        similar_users_ratings (str | None): Formatted ratings of similar users;
            required for the "CF" approach.
        seed (int): Seed passed to the API.
        system_content (str): System message for the chat; must be non-empty.

    Returns:
        The value produced by extract_numeric_rating on the model's reply when the
        call succeeds. When the API call raises, the exception is caught and a
        tuple (None, error message) is returned instead.
        NOTE(review): because errors are returned rather than raised,
        @retry_decorator presumably never sees them, and callers that store the
        result directly receive a tuple on failure — confirm this is intended.

    Raises:
        ValueError: If a required argument for the chosen approach is missing or
            system_content is empty.
    """
    # Validation
    if approach == "few-shot" and rating_history is None:
        raise ValueError("Rating history is required for the few-shot approach.")
    if approach == "CF" and similar_users_ratings is None:
        raise ValueError("Similar users' ratings are required for the collaborative filtering approach.")
    # The CF prompt embeds the user's own history as well, so it must be present too.
    if approach == "CF" and rating_history is None:
        raise ValueError("Rating history is required for the collaborative filtering approach.")
    if not system_content:
        raise ValueError("System content is required.")
    
    # Initialize prompt variable
    prompt = ""

    # Each prompt component is capped at a third of MAX_TOKENS_CHAT_GPT
    # (check_and_reduce_length presumably truncates by token count — confirm).
    combined_text = check_and_reduce_length(combined_text, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)

    # Construct the prompt based on the approach
    if approach == "few-shot":
        rating_history = check_and_reduce_length(rating_history, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)
        prompt += f"\n\nHere is user rating history:\n{rating_history}"
        prompt += f"\n\nBased on above rating history, please predict user's rating for the product {combined_text}, (1 being lowest and 5 being highest,The output should be like: (x stars, xx%), do not explain the reason.)"

    elif approach == "CF":
        rating_history = check_and_reduce_length(rating_history, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)
        prompt += f"\n\nHere is user rating history:\n{rating_history}"
        similar_users_ratings = check_and_reduce_length(similar_users_ratings, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)
        prompt += f"\n\nHere is the rating history from users who are similar to this user:\n{similar_users_ratings}"
        prompt += f"\n\nBased on above rating history and similar users' rating history, please predict user's rating for the product {combined_text}, (1 being lowest and 5 being highest,The output should be like: (x stars, xx%), do not explain the reason.)"
        
    else:
        prompt = f"How will user rate this product {combined_text}? (1 being lowest and 5 being highest) Attention! Just give me back the exact number as a result, and you don't need a lot of text."
        

    print(f"Constructed Prompt for {approach} approach:\n")
    print(f'The prompt:\n**********\n{prompt}\n**********\n')

    try:
        # Create the API call
        response = openai.ChatCompletion.create(
            model=model,
            temperature=temperature,
            max_tokens=MAX_TOKENS_CHAT_GPT,
            seed=seed,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": prompt}
            ]
        )
        # Extract the system fingerprint and print it
        system_fingerprint = response.get('system_fingerprint')
        print(f"\nSystem Fingerprint: {system_fingerprint}")
        # Extract and return the rating
        rating_text = response.choices[0].message['content'].strip()
        print(f'\nAPI call response: "{rating_text}"')
        extracted_rating = extract_numeric_rating(rating_text)
        print(f'Extracted rating: {extracted_rating}\n\n\n')
        print("----------------------------------------------------------------------------------")
        return extracted_rating  # A float
    
    except APIError as api_err:
        print(f"API Error occurred: {api_err}")
        return None, str(api_err)
    except RateLimitError as rate_err:
        print(f"Rate Limit Error occurred: {rate_err}")
        return None, str(rate_err)
    except Exception as e:
        print(f"Unexpected Error: {e}")
        return None, str(e)


In [None]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pandas as pd
import random

# Load SimCSE model and tokenizer
# Both are kept as notebook-level globals; compute_semantic_similarity below reads them.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

def compute_semantic_similarity(text1, text2):
    """
    Return the cosine similarity between the encoder embeddings of two texts.

    Both texts are tokenized together (padded and truncated) with the global
    `tokenizer`, encoded by the global `model` without gradient tracking, and
    compared through the model's pooled output.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        1 minus the cosine distance between the two pooled embeddings.
    """
    encoded_pair = tokenizer(
        [text1, text2],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        encoder_output = model(**encoded_pair, output_hidden_states=True, return_dict=True)
        pooled = encoder_output.pooler_output

    first_embedding, second_embedding = pooled[0], pooled[1]
    cosine_distance = cosine(first_embedding, second_embedding)
    return 1 - cosine_distance

def predict_ratings_semantic_similarity_CFfewshot_and_save(data, pcc_matrix, user_column_name='reviewerID', 
                                                           movie_column_name='title', movie_id_column='asin', 
                                                           rating_column_name='rating', num_ratings_per_user=NUM_RATINGS_PER_USER, 
                                                           num_similar_users=NUM_SIMILAR_USERS, num_main_user_ratings=NUM_MAIN_USER_RATINGS, 
                                                           save_path='cf_predictions.csv', seed=RANDOM_STATE,
                                                           system_content=AMAZON_CONTENT_SYSTEM):
    """
    Predict one held-out rating per user with a CF prompt, log title similarity, and save to CSV.

    For each user a sample of their ratings forms the prompt history, ratings
    sampled from the most similar users (by pcc_matrix) form the CF context,
    and one of the user's remaining items is the prediction target. The average
    semantic similarity between the target title and the history titles is
    printed; it does not influence the prediction.

    Row i of pcc_matrix is assumed to refer to the i-th user ID in sorted order,
    which is how a pandas pivot_table orders its labels. The ID-to-index map is
    therefore built from sorted IDs rather than from order of appearance.

    Args:
        data (pd.DataFrame): Ratings table with the columns named below.
        pcc_matrix (numpy.ndarray): User-user similarity matrix.
        user_column_name (str): Column holding user IDs.
        movie_column_name (str): Column holding item titles.
        movie_id_column (str): Column holding item IDs.
        rating_column_name (str): Column holding ratings.
        num_ratings_per_user (int): Ratings sampled from each similar user.
        num_similar_users (int): Number of similar users to include.
        num_main_user_ratings (int): Ratings sampled from the main user's history.
        save_path (str): Where the results CSV is written.
        seed (int): Seed for `random` and for the pandas sampling.
        system_content (str): System prompt forwarded to the chat completion call.

    Returns:
        pd.DataFrame: Columns user_id, item_id, title, actual_rating, predicted_rating.
        Users with no rating left over after the history sample are skipped.
    """
    results = []
    # Users are processed in order of appearance, but matrix positions follow sorted IDs.
    unique_users = data[user_column_name].unique()
    users_by_index = np.sort(unique_users)
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(users_by_index)}

    random.seed(seed)

    for user_id in unique_users:
        user_idx = user_id_to_index[user_id]
        main_user_data = data[data[user_column_name] == user_id]
        # Clamp the sample size: DataFrame.sample raises when n exceeds the row count.
        main_user_ratings = main_user_data.sample(
            n=min(num_main_user_ratings, len(main_user_data)), random_state=seed
        )
        # The chat prompt needs text, not a DataFrame.
        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for _, row in main_user_ratings.iterrows()
        ])

        # Take one extra candidate so the user can be removed from their own neighbour list.
        similar_users_idx = np.argsort(-pcc_matrix[user_idx])[:num_similar_users + 1]
        similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

        # Collect ratings from similar users
        similar_users_ratings = ""
        for similar_user_idx in similar_users_idx:
            similar_user_id = users_by_index[similar_user_idx]
            similar_user_data = data[data[user_column_name] == similar_user_id]
            similar_user_ratings = similar_user_data.sample(
                n=min(num_ratings_per_user, len(similar_user_data)), random_state=seed
            )
            for _, rating_row in similar_user_ratings.iterrows():
                similar_users_ratings += f"* Title: {rating_row[movie_column_name]}, Rating: {rating_row[rating_column_name]} stars\n"

        print(f"Similar users' ratings for user {user_id}:\n{similar_users_ratings}")

        # The prediction target must not be one of the items shown in the history.
        potential_movies_for_prediction = main_user_data[~main_user_data[movie_id_column].isin(main_user_ratings[movie_id_column])]
        if potential_movies_for_prediction.empty:
            continue

        random_movie_row = potential_movies_for_prediction.sample(n=1, random_state=seed).iloc[0]
        random_movie_title = random_movie_row[movie_column_name]
        actual_rating = random_movie_row[rating_column_name]

        # Compute semantic similarities
        similarities = [
            compute_semantic_similarity(row[movie_column_name], random_movie_title)
            for _, row in main_user_ratings.iterrows()
        ]

        average_similarity = np.mean(similarities)
        print(f"Average semantic similarity for '{random_movie_title}' with user's history: {average_similarity}")

        combined_text = f"Title: {random_movie_title}"

        predicted_rating = predict_rating_combined_ChatCompletion(
            combined_text, 
            approach="CF", 
            similar_users_ratings=similar_users_ratings,
            rating_history=main_user_ratings_str,
            system_content=system_content
        )

        results.append([user_id, random_movie_row[movie_id_column], random_movie_title, actual_rating, predicted_rating])

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")

    return results_df



In [None]:
%%time

# Call the prediction function
# NOTE(review): this repeats the CF item-PCC call from the earlier cell and writes
# to the same CF_OUTPUT_PATH. Under the "Semantic Similarity" heading it may have
# been meant to call predict_ratings_semantic_similarity_CFfewshot_and_save and/or
# to save to CF_RERUN_PATH — confirm.
results_df = predict_ratings_with_CF_item_PCC_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    save_path=CF_OUTPUT_PATH, 
    seed=RANDOM_STATE,
    # Without this argument the function falls back to its Amazon default system prompt.
    system_content=SYSTEM_CONTENT
)



In [None]:
# Evaluate updated CF model predictions
# Same evaluation call as the earlier evaluation cell: it reads the predictions
# saved under CF_OUTPUT_PATH and names the true/predicted rating columns.
# evaluate_model_predictions_rmse_mae and NUM_EXAMPLES come from star imports.
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

# Fall back to random

In [None]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pandas as pd
import random


# Load the SimCSE tokenizer and encoder as notebook-level globals.
# This is the same checkpoint an earlier cell already loads, so these two lines
# rebind the existing `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")


@retry_decorator
def predict_rating_combined_ChatCompletion(combined_text, 
                                           model=GPT_MODEL_NAME, 
                                           temperature=TEMPERATURE, 
                                           approach="zero-shot", 
                                           rating_history=None, 
                                           similar_users_ratings=None, 
                                           seed=RANDOM_STATE, 
                                           system_content=AMAZON_CONTENT_SYSTEM):
    """
    Build a rating-prediction prompt for the chosen approach and query the chat completion API.

    NOTE(review): this function is also defined in an earlier cell of this
    notebook; running this cell replaces that earlier definition.

    Args:
        combined_text (str): Description of the item to rate (e.g. "Title: ...").
        model (str): Chat model name passed to the API.
        temperature (float): Sampling temperature passed to the API.
        approach (str): "few-shot", "CF", or anything else for the zero-shot prompt.
        rating_history (str | None): The user's formatted rating history; required
            for the "few-shot" and "CF" approaches.
        similar_users_ratings (str | None): Formatted ratings of similar users;
            required for the "CF" approach.
        seed (int): Seed passed to the API.
        system_content (str): System message for the chat; must be non-empty.

    Returns:
        The value produced by extract_numeric_rating on the model's reply when the
        call succeeds. When the API call raises, the exception is caught and a
        tuple (None, error message) is returned instead.
        NOTE(review): because errors are returned rather than raised,
        @retry_decorator presumably never sees them, and callers that store the
        result directly receive a tuple on failure — confirm this is intended.

    Raises:
        ValueError: If a required argument for the chosen approach is missing or
            system_content is empty.
    """
    # Validation
    if approach == "few-shot" and rating_history is None:
        raise ValueError("Rating history is required for the few-shot approach.")
    if approach == "CF" and similar_users_ratings is None:
        raise ValueError("Similar users' ratings are required for the collaborative filtering approach.")
    # The CF prompt embeds the user's own history as well, so it must be present too.
    if approach == "CF" and rating_history is None:
        raise ValueError("Rating history is required for the collaborative filtering approach.")
    if not system_content:
        raise ValueError("System content is required.")
    
    # Initialize prompt variable
    prompt = ""

    # Each prompt component is capped at a third of MAX_TOKENS_CHAT_GPT
    # (check_and_reduce_length presumably truncates by token count — confirm).
    combined_text = check_and_reduce_length(combined_text, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)

    # Construct the prompt based on the approach
    if approach == "few-shot":
        rating_history = check_and_reduce_length(rating_history, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)
        prompt += f"\n\nHere is user rating history:\n{rating_history}"
        prompt += f"\n\nBased on above rating history, please predict user's rating for the product {combined_text}, (1 being lowest and 5 being highest,The output should be like: (x stars, xx%), do not explain the reason.)"

    elif approach == "CF":
        rating_history = check_and_reduce_length(rating_history, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)
        prompt += f"\n\nHere is user rating history:\n{rating_history}"
        similar_users_ratings = check_and_reduce_length(similar_users_ratings, MAX_TOKENS_CHAT_GPT // 3, TOKENIZER)
        prompt += f"\n\nHere is the rating history from users who are similar to this user:\n{similar_users_ratings}"
        prompt += f"\n\nBased on above rating history and similar users' rating history, please predict user's rating for the product {combined_text}, (1 being lowest and 5 being highest,The output should be like: (x stars, xx%), do not explain the reason.)"
        
    else:
        prompt = f"How will user rate this product {combined_text}? (1 being lowest and 5 being highest) Attention! Just give me back the exact number as a result, and you don't need a lot of text."
        

    print(f"Constructed Prompt for {approach} approach:\n")
    print(f'The prompt:\n**********\n{prompt}\n**********\n')

    try:
        # Create the API call
        response = openai.ChatCompletion.create(
            model=model,
            temperature=temperature,
            max_tokens=MAX_TOKENS_CHAT_GPT,
            seed=seed,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": prompt}
            ]
        )
        # Extract the system fingerprint and print it
        system_fingerprint = response.get('system_fingerprint')
        print(f"\nSystem Fingerprint: {system_fingerprint}")
        # Extract and return the rating
        rating_text = response.choices[0].message['content'].strip()
        print(f'\nAPI call response: "{rating_text}"')
        extracted_rating = extract_numeric_rating(rating_text)
        print(f'Extracted rating: {extracted_rating}\n\n\n')
        print("----------------------------------------------------------------------------------")
        return extracted_rating  # A float
    
    except APIError as api_err:
        print(f"API Error occurred: {api_err}")
        return None, str(api_err)
    except RateLimitError as rate_err:
        print(f"Rate Limit Error occurred: {rate_err}")
        return None, str(rate_err)
    except Exception as e:
        print(f"Unexpected Error: {e}")
        return None, str(e)

def predict_ratings_with_collaborative_filtering_and_save(data, pcc_matrix, 
                                                          user_column_name='reviewerID', 
                                                          movie_column_name='title', 
                                                          movie_id_column='asin',
                                                          rating_column_name='rating', 
                                                          num_ratings_per_user=NUM_RATINGS_PER_USER, 
                                                          num_similar_users=NUM_SIMILAR_USERS,
                                                          num_main_user_ratings=NUM_MAIN_USER_RATINGS,
                                                          save_path='cf_predictions.csv', 
                                                          seed=RANDOM_STATE,
                                                          system_content=AMAZON_CONTENT_SYSTEM):
    """Predict one rating per user with the "CF" prompt approach and save the results as CSV.

    For every distinct user in ``data`` the function:

    1. samples up to ``num_main_user_ratings`` of the user's own ratings as history,
    2. ranks the other users by ``pcc_matrix[user_idx]`` (descending) and samples up to
       ``num_ratings_per_user`` ratings from each of the top ``num_similar_users``,
    3. samples one of the user's remaining rated items as the prediction target
       (so an actual rating is available for comparison),
    4. calls ``predict_rating_combined_ChatCompletion(..., approach="CF")``.

    Args:
        data: DataFrame holding the user, title, item-id and rating columns named below.
        pcc_matrix: Similarity scores indexed by user position, where position ``i``
            corresponds to ``data[user_column_name].unique()[i]``.
            Presumably a user-user Pearson correlation matrix -- confirm against the caller.
        user_column_name: Column with user identifiers.
        movie_column_name: Column with item titles.
        movie_id_column: Column with item identifiers.
        rating_column_name: Column with ratings.
        num_ratings_per_user: Ratings sampled from each similar user.
        num_similar_users: Number of most-similar users to include.
        num_main_user_ratings: Ratings sampled from the main user as history.
        save_path: Destination of the CSV written at the end.
        seed: Used as ``random_state`` for every pandas sample and to seed ``random``.
        system_content: System message forwarded to the chat-completion helper.

    Returns:
        DataFrame with columns ``user_id``, ``item_id``, ``title``, ``actual_rating``
        and ``predicted_rating``. ``predicted_rating`` is ``None`` when the API call failed.
    """
    results = []
    unique_users = data[user_column_name].unique()

    # Kept from the original implementation: seeds Python's global RNG. The sampling
    # below is driven by pandas' `random_state`, not by the `random` module.
    random.seed(seed)

    # The position of a user in `unique_users` is the row index into `pcc_matrix`.
    for user_idx, user_id in enumerate(unique_users):
        print(f"Processing user {user_id} (Index: {user_idx})")

        # Retrieve the main user's historical ratings randomly.
        # Clamp the sample size: DataFrame.sample raises ValueError when n exceeds
        # the number of available rows.
        main_user_data = data[data[user_column_name] == user_id]
        main_sample_size = min(num_main_user_ratings, len(main_user_data))
        main_user_ratings = main_user_data.sample(n=main_sample_size, random_state=seed)

        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for _, row in main_user_ratings.iterrows()
        ])
        print(f"Main user's historical ratings:\n{main_user_ratings_str}")

        # Find the top similar users: take one extra candidate so that the user
        # itself can be dropped and still leave `num_similar_users` neighbours.
        ranked_idx = np.argsort(-pcc_matrix[user_idx])[:num_similar_users + 1]
        similar_users_idx = ranked_idx[ranked_idx != user_idx][:num_similar_users]

        print(f"Top similar users for {user_id}: {[unique_users[idx] for idx in similar_users_idx]}")

        # Collect historical ratings from similar users randomly
        similar_lines = []
        for idx in similar_users_idx:
            similar_user_data = data[data[user_column_name] == unique_users[idx]]
            similar_sample_size = min(num_ratings_per_user, len(similar_user_data))
            historical_ratings = similar_user_data.sample(n=similar_sample_size, random_state=seed)
            similar_lines.extend(
                f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
                for _, row in historical_ratings.iterrows()
            )
        # Every entry is newline-terminated, matching the original prompt text.
        similar_users_ratings = "".join(line + "\n" for line in similar_lines)
        print(f"Similar users' historical ratings:\n{similar_users_ratings}")

        # Item ids already shown to the model as the user's history
        rated_movie_ids = main_user_ratings[movie_id_column].tolist()

        # The prediction target must come from the user's ratings that are NOT in the history
        potential_movies_for_prediction = main_user_data[~main_user_data[movie_id_column].isin(rated_movie_ids)]
        if potential_movies_for_prediction.empty:
            print(f"No unrated movies available for user {user_id} for prediction.")
            continue

        random_movie_row = potential_movies_for_prediction.sample(n=1, random_state=seed).iloc[0]
        random_movie_title = random_movie_row[movie_column_name]
        random_movie_id = random_movie_row[movie_id_column]
        actual_rating = random_movie_row[rating_column_name]
        print(f"Selected random movie '{random_movie_title}' for prediction.")

        combined_text = f"Title: {random_movie_title}"
        # Logged summary only: the text actually sent to the API is assembled inside
        # predict_rating_combined_ChatCompletion from the arguments passed below.
        prompt = f"Main User Ratings:\n{main_user_ratings_str}\n\nSimilar Users' Ratings:\n{similar_users_ratings}\n\nPredict rating for '{combined_text}':"

        print(f"Generated prompt for user {user_id}:\n{prompt}")

        predicted_rating = predict_rating_combined_ChatCompletion(
            combined_text, 
            approach="CF", 
            similar_users_ratings=similar_users_ratings,
            rating_history=main_user_ratings_str,
            system_content=system_content
        )

        # On failure the helper returns `(None, error_message)` instead of a float.
        # Store only the rating slot so the column never contains a tuple.
        if isinstance(predicted_rating, tuple):
            predicted_rating = predicted_rating[0]

        # Store prediction results
        results.append([user_id, random_movie_id, random_movie_title, actual_rating, predicted_rating])

        print(f"User {user_id}: Predicted rating for '{random_movie_title}' is {predicted_rating}.")

    results_df = pd.DataFrame(results, columns=['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating'])
    results_df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")

    return results_df


In [None]:
%%time

# Run the item-PCC CF prediction helper on `data` and write its output to CF_OUTPUT_PATH.
results_df = predict_ratings_with_CF_item_PCC_and_save(
    # inputs
    data=data,
    user_pcc_matrix=user_pcc_matrix,
    item_pcc_matrix=item_pcc_matrix,
    # column mapping
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    # sampling sizes
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    # output and reproducibility
    save_path=CF_OUTPUT_PATH,
    seed=RANDOM_STATE,
)



In [None]:
# Score the predictions stored at CF_OUTPUT_PATH against the recorded actual ratings.
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_PATH,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating',
    num_examples=NUM_EXAMPLES,
)

# First Iteration

In [None]:
%%time


# First pass: run the CF prediction helper and write its output to CF_OUTPUT_PATH.
# NOTE(review): an earlier cell calls this helper with `user_pcc_matrix=` and
# `item_pcc_matrix=` keywords; confirm that a single positional `pcc_matrix`
# matches its signature.
cf_predictions = predict_ratings_with_CF_item_PCC_and_save(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the predictions saved at CF_OUTPUT_PATH
saved_data = pd.read_csv(CF_OUTPUT_PATH)

# Report the dtype pandas inferred for each column
print("Original Data Types:", saved_data.dtypes, "\n", sep="\n")

# Mark each row by whether its predicted rating parses as a number
# (errors='coerce' turns unparseable values into NaN)
saved_data['is_rating_float'] = (
    pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()
)

# Keep only the rows whose prediction did not parse
non_float_ratings = saved_data.loc[~saved_data['is_rating_float']]
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Row labels of those rows, consumed by the rerun cell
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Preview the first few failing rows
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:
%%time


# Hand the flagged row indices to the rerun helper; CF_RERUN_PATH is passed as `new_path`.
rerun_failed_CF_fewshot_predictions(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    new_path=CF_RERUN_PATH,
    rerun_indices=rerun_indices,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the rerun output and print its column / dtype / non-null summary
rerun_data = pd.read_csv(filepath_or_buffer=CF_RERUN_PATH)
rerun_data.info()

In [None]:
# Display the rerun output path currently in effect (bare expression -> cell output)
CF_RERUN_PATH

In [None]:
# Score the rerun output stored at CF_RERUN_PATH against the recorded actual ratings.
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_PATH,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating',
    num_examples=NUM_EXAMPLES,
)

# Second Iteration

In [None]:
%%time
# Output locations for the second iteration.
# The prints show the paths just assigned (previously they echoed the input `data_path`).
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user_2nd.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user_2nd.dat')
print(f'Data path: {CF_RERUN_PATH}')

# Run the CF prediction helper and write its output to the new CF_OUTPUT_PATH.
cf_predictions = predict_ratings_with_CF_item_PCC_and_save(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the predictions saved at CF_OUTPUT_PATH
saved_data = pd.read_csv(CF_OUTPUT_PATH)

# Report the dtype pandas inferred for each column
print("Original Data Types:", saved_data.dtypes, "\n", sep="\n")

# Mark each row by whether its predicted rating parses as a number
# (errors='coerce' turns unparseable values into NaN)
saved_data['is_rating_float'] = (
    pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()
)

# Keep only the rows whose prediction did not parse
non_float_ratings = saved_data.loc[~saved_data['is_rating_float']]
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Row labels of those rows, consumed by the rerun cell
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Preview the first few failing rows
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:
%%time


# Hand the flagged row indices to the rerun helper; CF_RERUN_PATH is passed as `new_path`.
# (Fixed: a stray second comma after `num_similar_users` made this cell a SyntaxError.)
rerun_failed_CF_fewshot_predictions(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
    new_path=CF_RERUN_PATH,
    rerun_indices=rerun_indices,
)


# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)


# Third Iteration

In [None]:
%%time
# Output locations for the third iteration.
# The prints show the paths just assigned (previously they echoed the input `data_path`).
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user_3rd.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user_3rd.dat')
print(f'Data path: {CF_RERUN_PATH}')

# Run the CF prediction helper and write its output to the new CF_OUTPUT_PATH.
cf_predictions = predict_ratings_with_CF_item_PCC_and_save(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the predictions saved at CF_OUTPUT_PATH
saved_data = pd.read_csv(CF_OUTPUT_PATH)

# Report the dtype pandas inferred for each column
print("Original Data Types:", saved_data.dtypes, "\n", sep="\n")

# Mark each row by whether its predicted rating parses as a number
# (errors='coerce' turns unparseable values into NaN)
saved_data['is_rating_float'] = (
    pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()
)

# Keep only the rows whose prediction did not parse
non_float_ratings = saved_data.loc[~saved_data['is_rating_float']]
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Row labels of those rows, consumed by the rerun cell
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Preview the first few failing rows
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:
%%time


# Hand the flagged row indices to the rerun helper; CF_RERUN_PATH is passed as `new_path`.
# (Fixed: a stray second comma after `num_similar_users` made this cell a SyntaxError.)
rerun_failed_CF_fewshot_predictions(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
    new_path=CF_RERUN_PATH,
    rerun_indices=rerun_indices,
)


# Evaluate updated CF model predictions
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)


# Fourth Iteration

In [None]:
%%time
# Output locations for the fourth iteration.
# The prints show the paths just assigned (previously they echoed the input `data_path`).
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user_4th.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user_4th.dat')
print(f'Data path: {CF_RERUN_PATH}')

# Run the CF prediction helper and write its output to the new CF_OUTPUT_PATH.
cf_predictions = predict_ratings_with_CF_item_PCC_and_save(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the predictions saved at CF_OUTPUT_PATH
saved_data = pd.read_csv(CF_OUTPUT_PATH)

# Report the dtype pandas inferred for each column
print("Original Data Types:", saved_data.dtypes, "\n", sep="\n")

# Mark each row by whether its predicted rating parses as a number
# (errors='coerce' turns unparseable values into NaN)
saved_data['is_rating_float'] = (
    pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()
)

# Keep only the rows whose prediction did not parse
non_float_ratings = saved_data.loc[~saved_data['is_rating_float']]
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Row labels of those rows, consumed by the rerun cell
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Preview the first few failing rows
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:
%%time


# Hand the flagged row indices to the rerun helper; CF_RERUN_PATH is passed as `new_path`.
rerun_failed_CF_fewshot_predictions(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    new_path=CF_RERUN_PATH,
    rerun_indices=rerun_indices,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)

# Score the rerun output stored at CF_RERUN_PATH against the recorded actual ratings.
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_PATH,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating',
    num_examples=NUM_EXAMPLES,
)


# Fifth Iteration

In [None]:
%%time
# Output locations for the fifth iteration.
# The prints show the paths just assigned (previously they echoed the input `data_path`).
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user_5th.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user_5th.dat')
print(f'Data path: {CF_RERUN_PATH}')

# Run the CF prediction helper and write its output to the new CF_OUTPUT_PATH.
cf_predictions = predict_ratings_with_CF_item_PCC_and_save(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the predictions saved at CF_OUTPUT_PATH
saved_data = pd.read_csv(CF_OUTPUT_PATH)

# Report the dtype pandas inferred for each column
print("Original Data Types:", saved_data.dtypes, "\n", sep="\n")

# Mark each row by whether its predicted rating parses as a number
# (errors='coerce' turns unparseable values into NaN)
saved_data['is_rating_float'] = (
    pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()
)

# Keep only the rows whose prediction did not parse
non_float_ratings = saved_data.loc[~saved_data['is_rating_float']]
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Row labels of those rows, consumed by the rerun cell
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Preview the first few failing rows
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:
%%time


# Hand the flagged row indices to the rerun helper; CF_RERUN_PATH is passed as `new_path`.
rerun_failed_CF_fewshot_predictions(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    new_path=CF_RERUN_PATH,
    rerun_indices=rerun_indices,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)

# Score the rerun output stored at CF_RERUN_PATH against the recorded actual ratings.
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_PATH,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating',
    num_examples=NUM_EXAMPLES,
)


# Sixth Iteration

In [None]:
%%time
# Output locations for the sixth iteration.
# The prints show the paths just assigned (previously they echoed the input `data_path`).
# NOTE(review): the next cell repeats these two assignments before running the
# prediction; one of the two copies can likely be removed.
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user_6th.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user_6th.dat')
print(f'Data path: {CF_RERUN_PATH}')


In [None]:
%%time
# Output locations for the sixth iteration.
# The prints show the paths just assigned (previously they echoed the input `data_path`).
CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'ml-1m/output/CF_fewshot_output_path_ratings_per_user_6th.dat')
print(f'Data path: {CF_OUTPUT_PATH}')

CF_RERUN_PATH = os.path.join(DATA_DIR, 'ml-1m/output/rerun_CF_fewshot_output_path_ratings_per_user_6th.dat')
print(f'Data path: {CF_RERUN_PATH}')

# Run the CF prediction helper and write its output to the new CF_OUTPUT_PATH.
cf_predictions = predict_ratings_with_CF_item_PCC_and_save(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)


In [None]:
# Load the predictions saved at CF_OUTPUT_PATH
saved_data = pd.read_csv(CF_OUTPUT_PATH)

# Report the dtype pandas inferred for each column
print("Original Data Types:", saved_data.dtypes, "\n", sep="\n")

# Mark each row by whether its predicted rating parses as a number
# (errors='coerce' turns unparseable values into NaN)
saved_data['is_rating_float'] = (
    pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()
)

# Keep only the rows whose prediction did not parse
non_float_ratings = saved_data.loc[~saved_data['is_rating_float']]
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# Row labels of those rows, consumed by the rerun cell
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Preview the first few failing rows
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:
%%time


# Hand the flagged row indices to the rerun helper; CF_RERUN_PATH is passed as `new_path`.
rerun_failed_CF_fewshot_predictions(
    data,
    pcc_matrix,
    save_path=CF_OUTPUT_PATH,
    new_path=CF_RERUN_PATH,
    rerun_indices=rerun_indices,
    user_column_name=USER_COLUMN_NAME,
    movie_column_name=TITLE_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME,
    num_ratings_per_user=NUM_RATINGS_PER_USER,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    num_similar_users=NUM_SIMILAR_USERS,
)

# Score the rerun output stored at CF_RERUN_PATH against the recorded actual ratings.
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_PATH,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating',
    num_examples=NUM_EXAMPLES,
)
