### User-base Collaborative Filtering - Testing
This notebook is used to simulate the performance of the user-based collaborative filtering algorithm in a retrieval setting and in a prediction setting. 

#### Pre-requisites
- The model is trained and the index is created in the notebook `UserCF Model & Index_1002.ipynb`.
- The index is saved in the file `yelp_UserCF.db` in the same directory as this notebook.

In [1]:
# Import utilities and dependencies
import sys
sys.path.append('../')
from utilities import *
import sqlite3
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load Yelp data
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']
yelp_data = load_data_from_db(db_folder, data_files)
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
# Assign dataframes
df_business = yelp_data["business"]
df_review = yelp_data["review"]

In [4]:
# Prepare user-business interaction data
def get_user_business_with_time(df_business, df_review):
    df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))
    user_business = df_concat[["user_id", "business_id", "stars_review"]]
    user_mapping = {user: idx for idx, user in enumerate(user_business['user_id'].unique())}
    business_mapping = {biz: idx for idx, biz in enumerate(user_business['business_id'].unique())}    
    return user_mapping, business_mapping, user_business

In [5]:
# Get mappings and interaction data
user_mapping, business_mapping, user_business = get_user_business_with_time(df_business, df_review)

In [6]:
# Split into train (80%) and test (20%)
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [7]:
# Balance test data (50% positive)
test_data = balance_test_data(test_data)
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

Number of positive reviews: 136473
Number of negative reviews: 59624
Total number of reviews: 197147
Ratio of positive to negative reviews: 2.29
Number of positive reviews: 59624
Number of negative reviews: 59624
Total number of reviews: 119248
Ratio of positive to negative reviews: 1.00


In [8]:
# Connect to UserCF database
db_path = './yelp_UserCF.db'
conn = sqlite3.connect(db_path)

### Retrieval Functions (adapted for UserCF 1002)

In [9]:
# Retrieve user mapping from database
def retrieve_user_mapping(conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT user_id, user_idx FROM user_mapping''')
    user_mapping = {row[0]: row[1] for row in cursor.fetchall()}
    return user_mapping

# Get businesses rated by a user
def get_user_businesses(user_id, conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?''', (user_id,))
    return cursor.fetchall()

# Get top-k similar users
def get_top_k_similar_users(user_id, user_mapping, k, conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT similarity_vector FROM user_user_similarity WHERE user_id = ?''', (user_id,))
    result = cursor.fetchone()
    if result is None:
        return []
    similarity_vector = pickle.loads(result[0])
    indices, data = similarity_vector
    top_k = sorted(zip(indices, data), key=lambda x: -x[1])[:k]
    idx_to_user = {v: k for k, v in user_mapping.items()}
    similar_users = [(idx_to_user.get(idx, "Unknown"), score) for idx, score in top_k]
    return similar_users

# Predict user interests for recommendation
def predict_user_interests_usercf(user_id, user_mapping, conn, k=300):
    similar_users = get_top_k_similar_users(user_id, user_mapping, k, conn)
    recommended_businesses = {}
    for similar_user_id, similarity_score in similar_users:
        if similar_user_id == "Unknown":
            continue
        similar_user_businesses = get_user_businesses(similar_user_id, conn)
        for business_id, score in similar_user_businesses:
            if business_id in recommended_businesses:
                recommended_businesses[business_id] += score * similarity_score
            else:
                recommended_businesses[business_id] = score * similarity_score
    recommended_businesses = sorted(recommended_businesses.items(), key=lambda x: -x[1])
    return recommended_businesses[:k]

# Simulate recommendations for test users
def simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=300, num_users=1000):
    recommendations = {}
    for i, user_id in enumerate(test_data_grouped['user_id']):
        if i >= num_users:
            break
        recommendation = predict_user_interests_usercf(user_id, user_mapping, conn, k)
        business_ids, scores = zip(*recommendation) if recommendation else ([], [])
        recommendations[user_id] = (list(business_ids), list(scores))
    return recommendations

In [10]:
user_mapping = retrieve_user_mapping(conn)
retrieval_recommendations = simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=300, num_users=1000)
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_retrieval_recommendations(retrieval_recommendations, test_data, test_data_grouped)
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [11]:
print("Testing Data Statistics")
display(background_stats)
print("Retrieval Evaluation Metrics")
display(evaluation_metric)
print("Retrieval Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1045,1083,2128,0.491071


Retrieval Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.5296,0.5462,0.2488,0.3419,0.2792,0.0562


Retrieval Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,260,867,216,785


### Prediction Functions

In [12]:
# Predict rating for a specific business (mirroring ItemCF)
def get_business_interest_usercf(user_id, business_id, user_mapping, conn, k=100):
    user_businesses = get_user_businesses(user_id, conn)
    if not user_businesses:
        return 0  # Match ItemCF: no interactions
    similar_users = get_top_k_similar_users(user_id, user_mapping, k, conn)
    if not similar_users:
        return -1  # Match ItemCF: no similar entities
    weighted_sum = 0.0
    similarity_sum = 0.0
    for similar_user_id, similarity in similar_users:
        if similar_user_id == "Unknown":
            continue
        cursor = conn.cursor()
        cursor.execute('SELECT stars_review FROM user_item_index WHERE user_id = ? AND business_id = ?', 
                       (similar_user_id, business_id))
        result = cursor.fetchone()
        if result:
            rating = result[0]
            weighted_sum += similarity * rating
            similarity_sum += similarity
    if similarity_sum == 0:
        return -1  # Match ItemCF: no ratings from similar users
    return weighted_sum / similarity_sum

In [13]:
# Predict ratings for test pairs (identical to ItemCF structure)
def predict_recommendations_usercf(test_data, test_data_grouped, user_mapping, conn, pos=4):
    predicted_labels = []
    actual_labels = []
    unrated_count = 0
    positive_count = 0
    negative_count = 0
    null_count = 0
    k = min(1000, len(test_data_grouped))
    test_data_dict = {
        (row['user_id'], row['business_id']): row['stars_review']
        for _, row in test_data.iterrows()
    }
    for i in range(k):
        user_id = test_data_grouped['user_id'].iloc[i]
        business_ids = test_data_grouped['business_id'].iloc[i]
        for business_id in business_ids:
            predicted_rating = get_business_interest_usercf(user_id, business_id, user_mapping, conn, k=100)
            actual_rating = test_data_dict.get((user_id, business_id), None)
            if actual_rating is None:
                null_count += 1
            else:
                if actual_rating >= pos:
                    positive_count += 1
                else:
                    negative_count += 1
            if actual_rating is None or predicted_rating == -1:
                unrated_count += 1
                continue
            predicted_labels.append(predicted_rating >= pos)
            actual_labels.append(actual_rating >= pos)
    predicted_labels = np.array(predicted_labels, dtype=np.int8)
    actual_labels = np.array(actual_labels, dtype=np.int8)
    return predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count

In [14]:
# Evaluate prediction
predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count = predict_recommendations_usercf(test_data, test_data_grouped, user_mapping, conn)
prediction_lst = [positive_count, negative_count, unrated_count]
evaluation_metric, confusion_matrix, background_stats = compute_prediction_evaluation(actual_labels, predicted_labels, prediction_lst, beta=2)

In [15]:
print("Testing Data Statistics")
display(background_stats)
if 'Mean Reciprocal Rank' in evaluation_metric.columns:
    evaluation_metric.drop(columns=['Mean Reciprocal Rank'], inplace=True)
print("Prediction Evaluation Metrics")
display(evaluation_metric)
print("Prediction Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1043,1085,2128,0.490132


Prediction Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Unrated Count
0,0.5596,0.5691,0.371039,0.449206,0.398797,0.432331


Prediction Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,387,289,293,239


In [16]:
# Close the database connection
conn.close()