### User-based Collaborative Filtering - Testing
This notebook evaluates the user-based collaborative filtering algorithm in two settings: retrieval and prediction.

#### Pre-requisites
- The model is trained and the index is created in the notebook `UserCF Model & Index.ipynb`.
- The index is saved in the file `yelp_UserCF.db` in the same directory as this notebook.

In [1]:
# import the python file from ../utilities.py
import sys
sys.path.append('../')
from utilities import *

In [2]:
# Define the database folder path and file names
# (the path is relative to the notebook's working directory)
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary
# load_data_from_db comes from the `utilities` star import; the loop below
# relies on its result supporting .items() with values that support len()
# (presumably table name -> DataFrame -- confirm in utilities.py).
yelp_data = load_data_from_db(db_folder, data_files)

# Check loaded data
# NOTE: at cell level the loop variables `table` and `df` remain defined in the
# kernel namespace after the loop finishes.
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
# Pull the two tables this notebook works with out of the loaded dictionary.
df_business = yelp_data["business"]
df_review = yelp_data["review"]

# get_user_business is provided by `utilities`. Downstream usage in this notebook:
#   - user_mapping is inverted (value -> key) to turn similarity indices back
#     into user ids, so it presumably maps user_id -> integer index -- confirm.
#   - user_business is the interaction table that gets split into train/test and
#     is later grouped by 'user_id' / 'business_id'.
#   - business_mapping is only passed through to the simulation function.
user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)

In [4]:
# split the data into training and test sets
# 80/20 split with a fixed random_state so the test set is reproducible.
# train_test_split is not imported in this notebook; it presumably arrives via
# the `utilities` star import (scikit-learn style signature -- confirm).
# train_data is not used by the cells shown below.
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [5]:
# balance the test data, comment this line to use the original test data
# NOTE: this rebinds `test_data`, so re-running the cell feeds the already
# balanced frame back into balance_test_data. Judging by the counts it prints,
# the helper reduces the positive reviews to the number of negative ones.
test_data = balance_test_data(test_data)

# group the test data by user_id and get the business_id
# Result: one row per user_id with a Python list of that user's test business_ids.
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

Number of positive reviews: 136473
Number of negative reviews: 59624
Total number of reviews: 197147
Ratio of positive to negative reviews: 2.29
Number of positive reviews: 59624
Number of negative reviews: 59624
Total number of reviews: 119248
Ratio of positive to negative reviews: 1.00


In [6]:
# Connect to the SQLite database
# The index file is expected next to this notebook (see the pre-requisites).
# NOTE: sqlite3.connect creates an empty database when the file does not exist,
# so a missing index only shows up later as a "no such table" error on the
# first query. `sqlite3` is not imported explicitly here; it presumably comes
# from the `utilities` star import. The connection is not closed in the cells
# shown in this notebook.
db_path = './yelp_UserCF.db'
conn = sqlite3.connect(db_path)

### Retrieval Evaluation

In [7]:
# Function to get top-k similar users for a given user
def get_top_k_similar_users(user_id, user_mapping, k, conn):
    """Return the ``k`` users most similar to ``user_id`` from the similarity index.

    Parameters
    ----------
    user_id :
        Key looked up in the ``user_user_similarity`` table.
    user_mapping : dict
        Mapping whose values are the indices stored in the similarity vector and
        whose keys are the corresponding user ids.
    k : int or None
        Maximum number of neighbours to return. ``None`` returns every stored
        neighbour; zero or a negative value returns an empty list.
    conn : sqlite3.Connection
        Open connection to the database holding ``user_user_similarity``.

    Returns
    -------
    list of (user_id, score)
        Neighbours ordered by descending score; ties keep their stored order.
        An index that has no entry in ``user_mapping`` is reported with the
        placeholder id ``"Unknown"``. The list is empty when ``user_id`` has no
        row in the table.
    """
    # Local imports keep the function usable even when these modules are not
    # re-exported by the notebook's star import.
    import heapq
    import pickle

    # A negative k would otherwise slice as "all but the last |k|" neighbours.
    if k is not None and k <= 0:
        return []

    cursor = conn.cursor()
    try:
        cursor.execute('''SELECT similarity_vector FROM user_user_similarity WHERE user_id = ?''', (user_id,))
        result = cursor.fetchone()
    finally:
        cursor.close()

    if result is None:
        return []

    # NOTE(security): pickle.loads can execute arbitrary code. Only use this with
    # an index database produced by the trusted model-building notebook.
    similarity_vector = pickle.loads(result[0])
    indices, data = similarity_vector

    # Get top-k similar users without sorting the whole vector. nlargest is
    # documented as equivalent to sorted(..., reverse=True)[:k], so ties keep
    # their original order exactly like the previous stable sort did.
    scored = zip(indices, data)
    if k is None:
        top_k = sorted(scored, key=lambda pair: pair[1], reverse=True)
    else:
        top_k = heapq.nlargest(k, scored, key=lambda pair: pair[1])

    # Reverse mapping restricted to the indices that were actually selected,
    # instead of materialising the full index -> user dictionary on every call.
    wanted = {idx for idx, _ in top_k}
    idx_to_user = {idx: uid for uid, idx in user_mapping.items() if idx in wanted}

    return [(idx_to_user.get(idx, "Unknown"), score) for idx, score in top_k]

In [8]:
# Function to predict user interests based on similar users
def predict_user_interests_usercf(user_id, user_mapping, conn, k=300, test_businesses=None):
    """Rank businesses for ``user_id`` by summing the scores of similar users.

    The ``k`` most similar users are fetched, each neighbour's
    (business_id, score) pairs are read via ``get_user_businesses``, and the
    scores are added up per business.

    Parameters
    ----------
    user_id :
        User to produce recommendations for.
    user_mapping : dict
        Passed through to ``get_top_k_similar_users``.
    conn :
        Database connection handed to both lookup helpers.
    k : int
        Used twice: as the number of neighbours to consult and as the maximum
        number of recommendations returned.
    test_businesses : collection, optional
        Debug filter. When truthy, businesses outside this collection are skipped.

    Returns
    -------
    list of (business_id, total_score)
        At most ``k`` entries, highest total first.
    """
    neighbours = get_top_k_similar_users(user_id, user_mapping, k, conn)

    totals = {}
    # NOTE(review): the neighbour's similarity value is unpacked but does not
    # weight the sum -- every neighbour contributes its raw scores.
    for neighbour_id, _similarity in neighbours:
        for business_id, score in get_user_businesses(neighbour_id, conn):
            # Debug: force overlap with the supplied businesses
            if test_businesses and business_id not in test_businesses:
                continue
            if business_id not in totals:
                totals[business_id] = score
            else:
                totals[business_id] += score

    # Highest accumulated score first, then cut to k entries
    ranked = sorted(totals.items(), key=lambda entry: -entry[1])
    return ranked[:k]

In [9]:
def simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=300, num_users=10):
    """Collect recommendations for the first ``num_users`` users of the test set.

    Parameters
    ----------
    test_data_grouped :
        Table whose 'user_id' column lists the users to evaluate, in order.
    user_mapping : dict
        Forwarded to ``predict_user_interests_usercf``.
    business_mapping :
        Accepted for interface compatibility; not used by this function.
    conn :
        Database connection forwarded to the predictor.
    k : int
        Forwarded to the predictor.
    num_users : int
        Processing stops once exactly this many users have been handled.

    Returns
    -------
    dict
        user_id -> (list of business ids, list of scores), both lists in
        ranking order.
    """
    recommendations = {}

    # `handled` counts users processed so far and only exists to cap the run.
    for handled, user_id in enumerate(test_data_grouped['user_id'], start=1):
        ranked = predict_user_interests_usercf(user_id, user_mapping, conn, k)
        business_ids = [business_id for business_id, _ in ranked]
        scores = [score for _, score in ranked]
        recommendations[user_id] = (business_ids, scores)
        if handled == num_users:
            break

    return recommendations

In [10]:
# def simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=300, num_users=1000):
#     recommendations = {}
#     test_businesses = set(test_data['business_id'])  # Enable filter for debug
#     i = 0
#     for user_id in test_data_grouped['user_id'].unique():
#         recommendation = predict_user_interests_usercf(user_id, user_mapping, conn, k, test_businesses)
#         business_ids, scores = [], []
#         for business_id, score in recommendation:
#             business_ids.append(business_id)
#             scores.append(score)
#         recommendations[user_id] = (business_ids, scores)
#         i += 1
#         if i >= num_users:
#             break
#     return recommendations

In [11]:
# Generate recommendations for the first 1000 users of the grouped test set,
# consulting up to 300 neighbours and keeping up to 300 businesses per user.
retrieval_recommendations = simulate_recommendations_usercf(test_data_grouped, user_mapping, business_mapping, conn, k=300, num_users=1000)

# Compare the recommendations with the held-out test interactions.
# check_retrieval_recommendations comes from `utilities`; it returns the raw
# counts and ranks that are fed unchanged into the metric computation below.
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_retrieval_recommendations(retrieval_recommendations, test_data, test_data_grouped)

# Turn the raw counts into the three summary objects displayed in the next cell.
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [12]:
# Show the three result objects produced by compute_evaluation_metric, each
# under a plain-text heading. `display` gives the rich notebook rendering.
print("Testing Data Statistics")
display(background_stats)

print("Evaluation Metrics")
display(evaluation_metric)

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1045,1083,2128,0.491071


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.5418,0.5714,0.2679,0.3648,0.2998,0.0568


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,280,873,210,765
