### User-based Collaborative Filtering - Testing
This notebook is used to simulate the performance of the user-based collaborative filtering algorithm in a retrieval setting and in a prediction setting. 

#### Pre-requisites
- The model is trained and the index is created in the notebook `UserCF Model & Index_1001.ipynb`.
- The index is saved in the file `yelp_UserCF.db` in the same directory as this notebook.

In [1]:
# Import utilities and dependencies
import pickle
import sqlite3
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# The project-level `utilities` module lives one directory up; extend the path before importing it.
sys.path.append('../')
from utilities import *

In [2]:
# Load Yelp data
# `load_data_from_db` is not defined in this notebook (presumably provided by the star-imported
# `utilities` module). The loop below treats its result as a mapping of table name -> table rows.
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']  # tables requested from the processed Yelp data
yelp_data = load_data_from_db(db_folder, data_files)
# Sanity check: report how many rows were loaded for each table.
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
# Prepare data
df_business = yelp_data["business"]
df_review = yelp_data["review"]
# `get_user_business` is not defined in this notebook (presumably from `utilities`). It returns
# two mappings plus the user-business interaction table that is split into train/test below.
# NOTE(review): `user_mapping` is rebound later from the UserCF index database — confirm that
# the in-memory mapping created here is not needed afterwards.
user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)

In [4]:
# Split into train (80%) and test (20%)
# random_state is fixed so the split is reproducible across runs.
# NOTE(review): the index in yelp_UserCF.db is built in a separate notebook — confirm it was
# trained on this same split so that test interactions do not leak into the index.
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [5]:
# Balance test data (50% positive)
# `balance_test_data` is not defined in this notebook (presumably from `utilities`); the statistics
# it prints show equal positive and negative review counts after balancing.
# Note: this rebinds `test_data`, so re-running the cell balances already-balanced data.
test_data = balance_test_data(test_data)
# One row per user, holding the list of that user's business ids in the test set.
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

Number of positive reviews: 136473
Number of negative reviews: 59624
Total number of reviews: 197147
Ratio of positive to negative reviews: 2.29
Number of positive reviews: 59624
Number of negative reviews: 59624
Total number of reviews: 119248
Ratio of positive to negative reviews: 1.00


In [6]:
# Connect to UserCF database
# `conn` is the notebook-wide connection that the retrieval and prediction functions below query
# (tables referenced there: user_mapping, user_item_index, user_user_similarity).
db_path = './yelp_UserCF.db'
conn = sqlite3.connect(db_path)

In [7]:
### Retrieval Functions (from usercf_retrieval.ipynb)
def retrieve_user_user_mapping(conn):
    """Load the ``user_id -> user_idx`` lookup stored in the ``user_mapping`` table.

    Args:
        conn: Open SQLite connection to the UserCF index database.

    Returns:
        dict: Maps every ``user_id`` in the table to its ``user_idx``. If a
        ``user_id`` occurs more than once, the last row read wins.
    """
    rows = conn.cursor().execute('''SELECT user_id, user_idx FROM user_mapping''').fetchall()
    # Every row is a (user_id, user_idx) pair, so it can be fed straight to dict().
    return dict(rows)

def get_user_businesses(user_id, conn):
    """Fetch every indexed interaction of one user.

    Args:
        user_id: Identifier matched against ``user_item_index.user_id``.
        conn: Open SQLite connection to the UserCF index database.

    Returns:
        list[tuple]: ``(business_id, stars_review)`` rows for the user; empty
        when the user has no rows in ``user_item_index``.
    """
    query = '''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?'''
    rows = conn.cursor().execute(query, (user_id,)).fetchall()
    return rows

def get_top_k_similar_users(user_id, user_mapping, k, conn):
    """Return the ``k`` users most similar to ``user_id``.

    The similarity vector is read from ``user_user_similarity`` as a pickled
    pair ``(indices, data)``. Each index is translated back to a user id
    through the inverse of ``user_mapping``.

    Args:
        user_id: User whose neighbours are requested.
        user_mapping: dict mapping ``user_id -> user_idx``.
        k: Number of neighbours to keep.
        conn: Open SQLite connection to the UserCF index database.

    Returns:
        list[tuple]: ``(similar_user_id, score)`` pairs ordered by descending
        score. An index missing from ``user_mapping`` is reported as
        ``"Unknown"``. The list is empty when the user has no stored vector.
    """
    row = conn.cursor().execute(
        '''SELECT similarity_vector FROM user_user_similarity WHERE user_id = ?''', (user_id,)
    ).fetchone()
    if row is None:
        return []

    # NOTE: unpickling executes arbitrary code for untrusted input; only use a trusted index database.
    neighbour_indices, neighbour_scores = pickle.loads(row[0])

    # Rank neighbours from most to least similar (stable for equal scores).
    ranked = list(zip(neighbour_indices, neighbour_scores))
    ranked.sort(key=lambda pair: -pair[1])

    # Invert user_mapping so a similarity index can be turned back into a user id.
    user_by_idx = {idx: uid for uid, idx in user_mapping.items()}
    return [(user_by_idx.get(idx, "Unknown"), score) for idx, score in ranked[:k]]

def predict_user_interests_usercf(user_id, user_mapping, conn, k=300, weight_by_similarity=False):
    """Recommend businesses for ``user_id`` from the interactions of its nearest neighbours.

    The ``k`` most similar users are looked up and the star ratings they gave
    are accumulated per business. The businesses with the highest accumulated
    score are returned.

    Args:
        user_id: User to recommend for.
        user_mapping: dict mapping ``user_id -> user_idx``, used to resolve neighbours.
        conn: Open SQLite connection to the UserCF index database.
        k: Used both as the number of neighbours consulted and as the maximum
            number of recommendations returned.
        weight_by_similarity: When True, each neighbour's rating is multiplied
            by that neighbour's similarity score before being accumulated.
            The default (False) keeps the plain sum of star ratings.

    Returns:
        list[tuple]: Up to ``k`` ``(business_id, score)`` pairs sorted by
        descending score; empty when the user has no neighbours.
    """
    similar_users = get_top_k_similar_users(user_id, user_mapping, k, conn)

    business_scores = {}
    for similar_user_id, similarity_score in similar_users:
        # "Unknown" is the placeholder for an index missing from user_mapping; it has no
        # interactions to contribute, so skip the query (the prediction path does the same).
        if similar_user_id == "Unknown":
            continue
        for business_id, score in get_user_businesses(similar_user_id, conn):
            contribution = similarity_score * score if weight_by_similarity else score
            business_scores[business_id] = business_scores.get(business_id, 0) + contribution

    # Highest accumulated score first.
    ranked = sorted(business_scores.items(), key=lambda item: -item[1])
    return ranked[:k]

def simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=300, num_users=1000):
    """Generate retrieval recommendations for the first ``num_users`` test users.

    Args:
        test_data_grouped: DataFrame with a ``user_id`` column; users are
            processed in row order.
        user_mapping: dict mapping ``user_id -> user_idx``.
        business_mapping: Unused; kept so existing callers keep working.
        conn: Open SQLite connection to the UserCF index database.
        k: Forwarded to ``predict_user_interests_usercf``.
        num_users: Maximum number of users to process. ``None`` processes every
            user; ``0`` or a negative value processes none.

    Returns:
        dict: ``user_id -> (business_ids, scores)``, two parallel lists in
        ranking order.
    """
    recommendations = {}
    for processed, user_id in enumerate(test_data_grouped['user_id']):
        # Check the cap before doing any work so that num_users=0 really means "no users".
        if num_users is not None and processed >= num_users:
            break
        recommendation = predict_user_interests_usercf(user_id, user_mapping, conn, k)
        business_ids = [business_id for business_id, _ in recommendation]
        scores = [score for _, score in recommendation]
        recommendations[user_id] = (business_ids, scores)
    return recommendations

### Retrieval Evaluation

In [8]:
# Reload the user_id -> user_idx mapping from the UserCF index database.
# This rebinds `user_mapping` from the data-preparation cell, so the retrieval functions
# use the mapping stored alongside the index.
user_mapping = retrieve_user_user_mapping(conn)

In [20]:
# Retrieve recommendations for the first 1000 test users.
# k=500 sets both the number of neighbours consulted and the maximum list length per user.
retrieval_recommendations = simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=500, num_users=1000)

In [21]:
# Score the retrieved lists against the balanced test set.
# `check_retrieval_recommendations` and `compute_evaluation_metric` are not defined in this
# notebook (presumably from `utilities`); the first yields the raw counts and ranks, the second
# turns them into the three objects displayed in the next cell.
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_retrieval_recommendations(retrieval_recommendations, test_data, test_data_grouped)

evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [22]:
# Show the retrieval results: test-set background statistics, metrics, then the confusion matrix.
print("Testing Data Statistics")
display(background_stats)
print("Retrieval Evaluation Metrics")
display(evaluation_metric)
print("Retrieval Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1045,1083,2128,0.491071


Retrieval Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.5423,0.5568,0.333,0.4168,0.3621,0.0466


Retrieval Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,348,806,277,697


In [27]:
# Persist the retrieval results to the shared evaluation database.
# This database gets its own names (`retrieval_db_path`, `retrieval_conn`): reusing and then
# closing the notebook-wide `conn` would leave the prediction cells further down querying a
# closed connection to the wrong database on a top-to-bottom run.
retrieval_db_path = '../Result_Evaluation/Retrieval.db'

# Create a lookup for fast access to star ratings from the test data:
# This dictionary maps (user_id, business_id) to the star rating.
# It is built column-wise, which avoids constructing one Series per row as iterrows() does.
test_data_lookup = dict(zip(
    zip(test_data['user_id'], test_data['business_id']),
    test_data['stars_review'],
))

# Prepare bulk records for insertion into SQLite.
# Format: (model, user_id, business_id, real_label)
# Here we assume a positive review (real_label = 1) if stars >= 4, else negative (real_label = 0).
bulk_records = []
model_name = "UserCF"  # You can change this if needed
max_examples_shown = 10  # cap the printed hits so the cell output stays readable
positive_hits = 0

for user_id, recommended_businesses in retrieval_recommendations.items():
    # recommended_businesses is (business_ids, scores); only the ids are stored.
    for business_id in recommended_businesses[0]:
        # Check the star rating from the test data lookup
        star_rating = test_data_lookup.get((user_id, business_id))
        # Define the real label: 1 if rating is available and >= 4, otherwise 0.
        real_label = 1 if star_rating is not None and star_rating >= 4 else 0
        if real_label == 1:
            positive_hits += 1
            if positive_hits <= max_examples_shown:
                print(f"User {user_id} was recommended business {business_id} with a positive rating of {star_rating}.")
        bulk_records.append((model_name, user_id, business_id, real_label))

print(f"{positive_hits} of {len(bulk_records)} recommended (user, business) pairs are positive test reviews.")

# Bulk insert using SQLite's executemany.
# The 'recommendations' table is expected to exist already; INSERT OR IGNORE only skips
# duplicates if the table has a UNIQUE constraint on (model, user_id, business_id).
retrieval_conn = sqlite3.connect(retrieval_db_path)
try:
    # The connection context manager commits on success and rolls back on error.
    with retrieval_conn:
        retrieval_conn.executemany("""
    INSERT OR IGNORE INTO recommendations (model, user_id, business_id, real_label)
    VALUES (?, ?, ?, ?)
""", bulk_records)
finally:
    # Always release the file handle, even if the insert fails.
    retrieval_conn.close()

User --XwFm4qERD6J5SX0JAsbg was recommended business MbNcVhRqpNPcvgFzWgaxSQ with a positive rating of 5.0.
User --u09WAjW741FdfkJXxNmg was recommended business QboMIy08NLnBbLXEsmnDHg with a positive rating of 4.0.
User -0H6Rm6dCi3pkFBC26HsoQ was recommended business KPCjtUqJHPukWvRJZs5SsA with a positive rating of 5.0.
User -0MIp6WKJ8QvGnYZQ5ETyg was recommended business j-qtdD55OLfSqfsWuQTDJg with a positive rating of 5.0.
User -13RX4Gy_F-zoLIenWAo-w was recommended business dGeXdSMah56gEHwZNaRQKA with a positive rating of 5.0.
User -1WbN1Qd-opw8u3uEqs2Kg was recommended business 1CJyrUUgy3NRGpR-DGfJOg with a positive rating of 4.0.
User -3Dzhux7DmA0Rj6P8PtQNA was recommended business EaqASiPkxV9OUkvsAp4ODg with a positive rating of 5.0.
User -3agoL-p87vZteiDzrz5og was recommended business KmIbyOoeMDQPwOVT66SlxQ with a positive rating of 5.0.
User -3s52C4zL_DHRK0ULG6qtg was recommended business 0QYWhij_YZ7Lyk9F6213Sg with a positive rating of 5.0.
User -3s52C4zL_DHRK0ULG6qtg was recom

### Prediction Evaluation

In [115]:
def get_business_interest_usercf(user_id, business_id, user_mapping, conn, k=100):
    """Predict the rating ``user_id`` would give ``business_id`` from similar users' ratings.

    The prediction is the similarity-weighted mean of the ratings that the top
    ``k`` similar users gave the target business (no user-mean centring).

    Args:
        user_id: User to predict for.
        business_id: Target business.
        user_mapping: dict mapping ``user_id -> user_idx``.
        conn: Open SQLite connection to the UserCF index database.
        k: Number of similar users to consult.

    Returns:
        ``0`` when the user has no indexed interactions. ``-1`` when there are
        no similar users, or the similarity weights of the neighbours that
        rated the business sum to zero (including when none rated it).
        Otherwise the weighted-mean rating.
    """
    # A user without any indexed interaction gets the fixed answer 0 (mirrors ItemCF).
    if not get_user_businesses(user_id, conn):
        return 0

    neighbours = get_top_k_similar_users(user_id, user_mapping, k, conn)
    if not neighbours:
        return -1  # no similar users, so no prediction (mirrors ItemCF)

    rating_query = 'SELECT stars_review FROM user_item_index WHERE user_id = ? AND business_id = ?'
    numerator = 0.0
    denominator = 0.0
    for neighbour_id, weight in neighbours:
        # Placeholder for an index that could not be mapped back to a user id.
        if neighbour_id == "Unknown":
            continue
        row = conn.cursor().execute(rating_query, (neighbour_id, business_id)).fetchone()
        if not row:
            continue  # this neighbour never rated the target business
        numerator += weight * row[0]
        denominator += weight

    # -1 signals "no usable neighbour rating" to the caller (mirrors ItemCF).
    return -1 if denominator == 0 else numerator / denominator

In [119]:
def predict_recommendations_usercf(test_data, test_data_grouped, user_mapping, conn, pos=4, num_users=1000, k=100):
    """Predict ratings for test user-business pairs and binarise them for evaluation.

    Args:
        test_data: DataFrame with ``user_id``, ``business_id`` and ``stars_review`` columns.
        test_data_grouped: DataFrame with one row per user: ``user_id`` and a
            ``business_id`` column holding that user's list of test business ids.
        user_mapping: dict mapping ``user_id -> user_idx``.
        conn: Open SQLite connection to the UserCF index database.
        pos: Ratings ``>= pos`` count as positive, for actual and predicted ratings alike.
        num_users: Evaluate at most this many users, in row order of
            ``test_data_grouped`` (``None`` evaluates all). Default 1000, same as ItemCF.
        k: Number of similar users consulted per prediction. Default 100.

    Returns:
        tuple: ``(predicted_labels, actual_labels, positive_count, negative_count,
        null_count, unrated_count)``. The two label arrays are ``int8`` and cover
        only pairs that have both an actual rating and a prediction. The positive
        and negative counts cover every pair with an actual rating. ``null_count``
        counts pairs with no actual rating. ``unrated_count`` counts every pair
        skipped from the label arrays.
    """
    predicted_labels = []
    actual_labels = []
    unrated_count = 0
    positive_count = 0
    negative_count = 0
    null_count = 0

    # (user_id, business_id) -> actual star rating. Built column-wise, which avoids
    # constructing one Series per row as DataFrame.iterrows() does.
    test_data_dict = dict(zip(
        zip(test_data['user_id'], test_data['business_id']),
        test_data['stars_review'],
    ))

    if num_users is None:
        evaluated = test_data_grouped
    else:
        evaluated = test_data_grouped.iloc[:max(num_users, 0)]

    for user_id, business_ids in zip(evaluated['user_id'], evaluated['business_id']):
        for business_id in business_ids:
            actual_rating = test_data_dict.get((user_id, business_id))

            if actual_rating is None:
                # Nothing to compare against, so skip the (costly) prediction queries.
                null_count += 1
                unrated_count += 1
                continue

            if actual_rating >= pos:
                positive_count += 1
            else:
                negative_count += 1

            predicted_rating = get_business_interest_usercf(user_id, business_id, user_mapping, conn, k=k)
            if predicted_rating == -1:
                # -1 is the "no prediction possible" sentinel; skip evaluation, matching ItemCF.
                unrated_count += 1
                continue

            predicted_labels.append(predicted_rating >= pos)
            actual_labels.append(actual_rating >= pos)

    predicted_labels = np.array(predicted_labels, dtype=np.int8)
    actual_labels = np.array(actual_labels, dtype=np.int8)
    return predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count

In [120]:
# Run the prediction-setting evaluation with the default positive threshold (pos=4).
# `conn` must be an open connection to the UserCF index database (yelp_UserCF.db) when this cell runs.
predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count = predict_recommendations_usercf(test_data, test_data_grouped, user_mapping, conn)
# Only these three counts are forwarded to the metric helper; null_count is not used further.
prediction_lst = [positive_count, negative_count, unrated_count]
# `compute_prediction_evaluation` is not defined in this notebook (presumably from `utilities`);
# beta=2 is presumably the beta of the reported F-beta score — TODO confirm.
evaluation_metric, confusion_matrix, background_stats = compute_prediction_evaluation(actual_labels, predicted_labels, prediction_lst, beta=2)

In [121]:
# Show the prediction results: test-set background statistics, metrics, then the confusion matrix.
print("Testing Data Statistics")
display(background_stats)
# The metrics are displayed without a 'Mean Reciprocal Rank' column in the prediction setting.
# Rebind to a dropped copy instead of mutating in place: errors='ignore' makes the drop a
# no-op when the column is absent, so the cell is safe to re-run.
evaluation_metric = evaluation_metric.drop(columns=['Mean Reciprocal Rank'], errors='ignore')
print("Prediction Evaluation Metrics")
display(evaluation_metric)
print("Prediction Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1043,1085,2128,0.490132


Prediction Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Unrated Count
0,0.5489,0.6154,0.007668,0.015148,0.009556,0.913534


Prediction Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,8,93,5,78
