### User-based Collaborative Filtering - Testing
This notebook is used to simulate the performance of the user-based collaborative filtering algorithm in a retrieval setting and in a prediction setting. 

* In the retrieval stage, the algorithm is modified to filter out low-similarity users and to supplement the similar-user list with users from the same cluster.

#### Pre-requisites
- The model is trained and the index is created in the notebook `UserCF Model & Index.ipynb`.
- The index is saved in the file `yelp_UserCF.db` in the same directory as this notebook.

In [1]:
# Import utilities and dependencies
import sys
sys.path.append('../')
from utilities import *
import sqlite3
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import os
import openpyxl as py
import matplotlib.pyplot as plt

In [2]:
# Load Yelp data
# load_data_from_db is not defined in this notebook (presumably provided by
# `from utilities import *`). Its result is used as a mapping of table name -> table.
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']
yelp_data = load_data_from_db(db_folder, data_files)
# Sanity check: report how many rows were loaded for each table.
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
# Prepare data
# Only the business and review tables are used from here on.
df_business = yelp_data["business"]
df_review = yelp_data["review"]
# get_user_business is not defined in this notebook (presumably from utilities).
# It returns three values: user_mapping is later inverted as user id -> index, and
# user_business is the interaction table that gets split into train/test below.
# NOTE(review): business_mapping is not referenced by the cells shown here — confirm it is needed.
user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)

In [4]:
# Split into train (80%) and test (20%)
# random_state is fixed so the split is reproducible across runs.
# NOTE(review): train_data is not referenced again in the cells shown here.
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [5]:
# Balance test data (50% positive)
# NOTE(review): balance_test_data is not defined in this notebook; judging by the
# recorded output it appears to print class counts before and after downsampling
# the positive reviews — confirm against its implementation.
# Gotcha: test_data is rebound here, so re-running this cell feeds the already
# balanced frame back into balance_test_data.
test_data = balance_test_data(test_data)
# One row per user, with the list of that user's test business ids in 'business_id'.
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

Number of positive reviews: 136473
Number of negative reviews: 59624
Total number of reviews: 197147
Ratio of positive to negative reviews: 2.29
Number of positive reviews: 59624
Number of negative reviews: 59624
Total number of reviews: 119248
Ratio of positive to negative reviews: 1.00


# Retrieval

In [6]:
# Load user cluster assignments from Excel.
# NOTE: there is no explicit error handling here; a missing file or a missing
# 'user_id' / 'cluster' column surfaces as the exception raised by pandas.
cluster_file = 'clustered_users.xlsx'  # Adjust path as needed
cluster_df = pd.read_excel(cluster_file)
# user id -> cluster label; if a user id appears more than once, the last row wins.
user_to_cluster = dict(zip(cluster_df['user_id'], cluster_df['cluster']))

In [7]:
# Connect to UserCF database
db_path = './yelp_UserCF.db'  # Using cosine similarity-based DB
# sqlite3.connect() silently creates a new, empty database when the file does not
# exist, which would only show up later as a confusing "no such table" error.
# Fail fast with an actionable message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f"UserCF index not found at {db_path!r}; "
        "build it first with 'UserCF Model & Index.ipynb'."
    )
conn = sqlite3.connect(db_path)

In [8]:
# Retrieve the most similar users for a target user, padded with same-cluster users
def get_top_k_similar_users_no_threshold(user_id, user_mapping, user_to_cluster, k_users, conn):
    """Return up to ``k_users`` ``(user_id, similarity)`` pairs for ``user_id``.

    The user's stored similarity vector is read from the ``user_user_similarity``
    table and ranked by descending similarity; no similarity threshold is applied.
    Indices that do not correspond to a user in ``user_mapping`` are dropped.
    If fewer than ``k_users`` neighbours remain, the list is padded with users
    that share the target user's cluster, each with similarity 0.

    Cluster members are taken from ``user_to_cluster`` (in its iteration order)
    so the function depends only on its arguments, not on notebook globals.

    Args:
        user_id: Target user id.
        user_mapping: Mapping of user id -> index used in the similarity vectors.
        user_to_cluster: Mapping of user id -> cluster label.
        k_users: Maximum number of pairs to return.
        conn: Open sqlite3 connection to the UserCF index database.

    Returns:
        List of ``(user_id, similarity)`` tuples: similarity-ranked neighbours
        first, followed by any cluster-based padding entries.
    """
    cursor = conn.cursor()
    try:
        cursor.execute('''SELECT similarity_vector FROM user_user_similarity WHERE user_id = ?''', (user_id,))
        row = cursor.fetchone()
    finally:
        cursor.close()

    ranked = []
    if row is not None:
        # SECURITY NOTE: pickle.loads can execute arbitrary code; only use this
        # with an index database that was produced by a trusted source.
        indices, scores = pickle.loads(row[0])
        # Built only when a vector exists, since inverting the mapping is O(#users).
        idx_to_user = {idx: user for user, idx in user_mapping.items()}
        ranked = [
            (idx_to_user[idx], score)
            for idx, score in zip(indices, scores)
            if idx in idx_to_user
        ]
        ranked.sort(key=lambda pair: -pair[1])  # stable: ties keep vector order

    # Enough neighbours, or no cluster information to pad with.
    if len(ranked) >= k_users or user_id not in user_to_cluster:
        return ranked[:k_users]

    target_cluster = user_to_cluster[user_id]
    excluded = {user for user, _ in ranked}
    excluded.add(user_id)
    needed = k_users - len(ranked)

    # Filter candidates BEFORE counting towards `needed`, so users missing from
    # user_mapping do not consume padding slots.
    padding = []
    for candidate, cluster in user_to_cluster.items():
        if len(padding) >= needed:
            break
        if cluster == target_cluster and candidate not in excluded and candidate in user_mapping:
            padding.append((candidate, 0))
    return ranked + padding

def predict_user_interests_usercf_traink(user_id, user_mapping, user_to_cluster, conn, k_users=300, k_items=300):
    """Recommend up to ``k_items`` businesses for ``user_id`` from its neighbours.

    Neighbours come from ``get_top_k_similar_users_no_threshold``. For every
    neighbour, the ``(business_id, score)`` pairs returned by
    ``get_user_businesses`` are summed per business. The neighbour's similarity
    is not used as a weight.

    Returns:
        List of ``(business_id, total_score)`` tuples sorted by descending total
        score, truncated to ``k_items`` entries.
    """
    neighbours = get_top_k_similar_users_no_threshold(user_id, user_mapping, user_to_cluster, k_users, conn)

    pooled_scores = {}
    for neighbour_id, _similarity in neighbours:
        if neighbour_id != "Unknown":
            for business_id, score in get_user_businesses(neighbour_id, conn):
                if business_id not in pooled_scores:
                    pooled_scores[business_id] = score
                else:
                    pooled_scores[business_id] += score

    # Highest pooled score first; ties keep first-seen order (stable sort).
    ranking = list(pooled_scores.items())
    ranking.sort(key=lambda entry: -entry[1])
    return ranking[:k_items]

def simulate_recommendations_usercf_traink(test_data_grouped, user_mapping, user_to_cluster, conn, k_users=300, k_items=300, num_users=1000):
    """Generate recommendations for the first ``num_users`` users in ``test_data_grouped``.

    Users are taken in the order of ``test_data_grouped['user_id']``. Each one is
    passed to ``predict_user_interests_usercf_traink`` with the given
    ``k_users`` / ``k_items``.

    Returns:
        Dict mapping user id -> ``(business_ids, scores)``, two parallel lists in
        recommendation order (both empty when nothing is recommended).
    """
    results = {}
    processed = 0
    for uid in test_data_grouped['user_id']:
        if processed >= num_users:
            break
        processed += 1
        ranked = predict_user_interests_usercf_traink(uid, user_mapping, user_to_cluster, conn, k_users, k_items)
        # Split the (business_id, score) pairs into two parallel lists.
        results[uid] = ([business for business, _ in ranked], [score for _, score in ranked])
    return results

In [9]:
# Evaluate retrieval
# Recommendations are generated for the first 1000 users of test_data_grouped,
# using up to 550 neighbours and keeping up to 550 businesses per user.
retrieval_recommendations = simulate_recommendations_usercf_traink(test_data_grouped, user_mapping, user_to_cluster, conn, 
                k_users=550, k_items=550, num_users=1000)
# check_retrieval_recommendations and compute_evaluation_metric are not defined in
# this notebook (presumably from utilities).
# Gotcha: evaluation_metric, confusion_matrix and background_stats are rebound by
# the prediction evaluation cell further down, so display them before running it.
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_retrieval_recommendations(retrieval_recommendations, test_data, test_data_grouped)
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [10]:
# Show the retrieval results: test-set statistics, metrics, then the confusion matrix.
# The three objects come from the retrieval evaluation cell; run this before the
# prediction section rebinds the same names.
print("Testing Data Statistics")
display(background_stats)
print("Retrieval Evaluation Metrics")
display(evaluation_metric)
print("Retrieval Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1045,1083,2128,0.491071


Retrieval Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.5451,0.5593,0.3474,0.4286,0.3759,0.0501


Retrieval Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,363,797,286,682


# Prediction

In [11]:
# Predict rating for a specific business (mirroring ItemCF)
def get_business_interest_usercf(user_id, business_id, user_mapping, user_to_cluster, conn, k=100, similarity_threshold=0):
    """Estimate the rating of ``business_id`` for ``user_id`` from neighbours' ratings.

    The estimate is the similarity-weighted mean of the ``stars_review`` values
    that the user's top-``k`` neighbours have stored for the business in the
    ``user_item_index`` table.

    Note:
        ``similarity_threshold`` is accepted but never read by this function.

    Returns:
        * ``0`` when ``get_user_businesses`` returns nothing for the user;
        * ``-1`` when there are no neighbours, or when the similarities of the
          neighbours that rated the business sum to zero (including the case
          where none of them rated it);
        * otherwise the weighted mean rating.
    """
    # Users without any recorded interaction get a fixed 0.
    if not get_user_businesses(user_id, conn):
        return 0

    neighbours = get_top_k_similar_users_no_threshold(user_id, user_mapping, user_to_cluster, k, conn)
    if not neighbours:
        return -1

    numerator = 0.0
    denominator = 0.0
    cursor = conn.cursor()
    for neighbour_id, weight in neighbours:
        if neighbour_id == "Unknown":
            continue
        cursor.execute(
            'SELECT stars_review FROM user_item_index WHERE user_id = ? AND business_id = ?',
            (neighbour_id, business_id),
        )
        row = cursor.fetchone()
        if not row:
            continue  # this neighbour has no rating for the business
        numerator += weight * row[0]
        denominator += weight

    # A zero total weight means no usable neighbour rating.
    return numerator / denominator if denominator != 0 else -1

In [12]:
# Predict ratings for test pairs (identical to ItemCF structure)
def predict_recommendations_usercf(test_data, test_data_grouped, user_mapping, user_to_cluster, conn, pos=4, k=100, similarity_threshold=0, num_users=1000):
    """Predict positive/negative labels for the test pairs of the first ``num_users`` users.

    Args:
        test_data: Frame with 'user_id', 'business_id' and 'stars_review' columns.
        test_data_grouped: Frame with one row per user: 'user_id' and a list of
            business ids in 'business_id'.
        user_mapping, user_to_cluster, conn: Forwarded to
            ``get_business_interest_usercf``.
        pos: Ratings ``>= pos`` count as positive (for both actual and predicted).
        k: Number of neighbours, forwarded to ``get_business_interest_usercf``.
        similarity_threshold: Forwarded to ``get_business_interest_usercf``.
        num_users: Maximum number of users (rows of ``test_data_grouped``) to
            evaluate. Defaults to 1000, the previously hard-coded cap.

    Returns:
        Tuple ``(predicted_labels, actual_labels, positive_count, negative_count,
        null_count, unrated_count)``. The label arrays are int8 and only cover
        pairs that have both an actual rating and a prediction other than -1.
        ``positive_count`` / ``negative_count`` count all pairs with an actual
        rating; ``null_count`` counts pairs without one; ``unrated_count`` counts
        every pair that was skipped (no actual rating, or prediction of -1).
    """
    predicted_labels = []
    actual_labels = []
    unrated_count = 0
    positive_count = 0
    negative_count = 0
    null_count = 0

    # Column-wise zip builds the lookup without the per-row overhead of iterrows().
    actual_by_pair = {
        (uid, bid): stars
        for uid, bid, stars in zip(test_data['user_id'], test_data['business_id'], test_data['stars_review'])
    }

    n_eval = max(0, min(num_users, len(test_data_grouped)))
    eval_users = test_data_grouped['user_id'].iloc[:n_eval]
    eval_businesses = test_data_grouped['business_id'].iloc[:n_eval]

    for user_id, business_ids in zip(eval_users, eval_businesses):
        for business_id in business_ids:
            actual_rating = actual_by_pair.get((user_id, business_id))
            if actual_rating is None:
                # No ground truth: the pair is skipped, so do not pay for a prediction.
                null_count += 1
                unrated_count += 1
                continue

            if actual_rating >= pos:
                positive_count += 1
            else:
                negative_count += 1

            predicted_rating = get_business_interest_usercf(user_id, business_id, user_mapping, user_to_cluster, conn, k, similarity_threshold)
            if predicted_rating == -1:  # sentinel: no usable neighbour rating
                unrated_count += 1
                continue

            predicted_labels.append(predicted_rating >= pos)
            actual_labels.append(actual_rating >= pos)

    predicted_labels = np.array(predicted_labels, dtype=np.int8)
    actual_labels = np.array(actual_labels, dtype=np.int8)
    return predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count

In [13]:
# Evaluate prediction with initial threshold
# NOTE(review): similarity_threshold=0.1 is forwarded to get_business_interest_usercf,
# which (as defined in this notebook) never reads it, so it has no effect on the results.
predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count = predict_recommendations_usercf(test_data, test_data_grouped, user_mapping, user_to_cluster, conn, pos=4, k=100, similarity_threshold=0.1)
# null_count is returned but not passed on to the evaluation helper.
prediction_lst = [positive_count, negative_count, unrated_count]
# compute_prediction_evaluation is not defined in this notebook (presumably from utilities).
# Gotcha: this rebinds evaluation_metric, confusion_matrix and background_stats,
# replacing the retrieval results stored under the same names.
evaluation_metric, confusion_matrix, background_stats = compute_prediction_evaluation(actual_labels, predicted_labels, prediction_lst, beta=2)

In [14]:
# Show the prediction results: test-set statistics, metrics, then the confusion matrix.
print("Testing Data Statistics")
display(background_stats)
# Mean Reciprocal Rank is a ranking metric and is not reported for prediction.
# errors='ignore' makes the drop a no-op when the column is absent, and rebinding
# (instead of inplace=True) avoids mutating the frame returned by the evaluation helper.
evaluation_metric = evaluation_metric.drop(columns=['Mean Reciprocal Rank'], errors='ignore')
print("Prediction Evaluation Metrics")
display(evaluation_metric)
print("Prediction Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1043,1085,2128,0.490132


Prediction Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Unrated Count
0,0.5671,0.5712,0.372937,0.451252,0.400758,0.422462


Prediction Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,389,308,292,240
