# Cluster-based Collaborative Filtering - Testing
This script evaluates the performance of the cluster-based collaborative filtering algorithm in retrieval and prediction settings using the Yelp dataset and yelp_ClusterCF.db.

Pre-requisites:
- The model is trained and the index is created using ClusterCF_Model_Index.py.
- The index is saved in yelp_ClusterCF.db in the same directory as this script.
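- The clustered_users.xlsx file, with user_id and cluster columns, is available in ../../data/processed_data. Note that the loading cell below reads it via the relative path 'clustered_users.xlsx', so copy it next to this script or adjust `cluster_file` accordingly.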

Note: This testing code does not exclude businesses that a user has already interacted with from the recommendations. As a result, all users in the same cluster receive the same recommended businesses.

In [2]:
import sys
sys.path.append('../')
from utilities import *
import sqlite3
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Load Yelp data
# Only the tables named in data_files are requested from the processed-data folder.
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']
# load_data_from_db is presumably provided by `utilities`; it is iterated below
# with .items(), so it returns a mapping of table name -> table.
yelp_data = load_data_from_db(db_folder, data_files)
# Sanity check: report how many rows were loaded per table.
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [4]:
# Prepare data: pull the two tables used by the rest of the notebook out of
# the loaded mapping (the 'categories' table is loaded above but not used here).
df_business = yelp_data["business"]
df_review = yelp_data["review"]

In [5]:
# Use get_user_business to prepare user-business interactions.
# The helper (presumably from `utilities`) returns three values; the cells shown
# here go on to use only user_business, which is later accessed through its
# 'user_id', 'business_id' and 'stars_review' columns.
user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)
print(f"Prepared {len(user_business)} user-business interactions.")

Prepared 985732 user-business interactions.


In [6]:
# Load cluster assignments (path is relative to the current working directory)
cluster_file = 'clustered_users.xlsx'
cluster_df = pd.read_excel(cluster_file)
# Build the user -> cluster lookup frame. Cluster ids are stored as strings so
# they compare equal to the string keys used by the database helpers below.
user_to_cluster = cluster_df[['user_id']].copy()
user_to_cluster['cluster_id'] = cluster_df['cluster'].astype(str)
print(f"Loaded {len(cluster_df)} cluster assignments.")

Loaded 99812 cluster assignments.


In [7]:
# Filter users without cluster assignments: keep only interactions whose
# user_id appears in user_to_cluster, since the evaluation is cluster-based.
user_business = user_business[user_business['user_id'].isin(user_to_cluster['user_id'])]
print(f"Filtered user-business to {len(user_business)} interactions for clustered users.")

Filtered user-business to 918151 interactions for clustered users.


In [8]:
# Merge user-business interactions with cluster mappings.
# The inner join on user_id attaches a cluster_id column to every interaction
# and keeps only users present in both frames.
user_business_clusters = user_business.merge(user_to_cluster, on='user_id', how='inner')
print(f"Merged dataset contains {len(user_business_clusters)} user-business-cluster records.")

Merged dataset contains 918151 user-business-cluster records.


In [9]:
# Aggregate ratings at cluster level (mean of stars_review per cluster-business pair).
# The result has one row per (cluster_id, business_id) with columns
# cluster_id, business_id and stars_review (now the cluster's mean rating).
cluster_business = user_business_clusters.groupby(['cluster_id', 'business_id'])['stars_review'].mean().reset_index()
print(f"Aggregated to {len(cluster_business)} cluster-business interactions.")

Aggregated to 721327 cluster-business interactions.


In [10]:
# Split into train (80%) and test (20%) at cluster level.
# random_state pins the shuffle so the split is reproducible.
train_data, test_data = train_test_split(cluster_business, test_size=0.2, random_state=42)
# balance_test_data is an external helper; judging by the counts it prints, it
# appears to downsample so positives and negatives are equal — confirm against
# its implementation.
test_data = balance_test_data(test_data)
# One row per cluster holding the list of that cluster's held-out business_ids.
test_data_grouped = test_data.groupby('cluster_id')['business_id'].apply(list).reset_index()
print(f"Train data: {len(train_data)} rows, Test data: {len(test_data)} rows")

Number of positive reviews: 98024
Number of negative reviews: 46242
Total number of reviews: 144266
Ratio of positive to negative reviews: 2.12
Number of positive reviews: 46242
Number of negative reviews: 46242
Total number of reviews: 92484
Ratio of positive to negative reviews: 1.00
Train data: 577061 rows, Test data: 92484 rows


In [11]:
# Connect to ClusterCF database.
# The path is relative to the working directory. NOTE: sqlite3.connect creates
# an empty database file if none exists, so a wrong path only surfaces later as
# "no such table" errors.
db_path = './yelp_ClusterCF.db'
conn = sqlite3.connect(db_path)

In [12]:
# Ensure database indexes.
# IF NOT EXISTS makes this cell safe to re-run; the indexes cover the WHERE
# clauses used by the lookup helpers below. The commit persists them to the
# database file.
conn.execute('CREATE INDEX IF NOT EXISTS idx_cluster_item ON cluster_item_index(cluster_id, business_id)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_cluster_similarity ON cluster_cluster_similarity(cluster_id)')
conn.commit()

# Retrieval Evaluation

In [13]:
def retrieve_cluster_mapping(conn):
    """Read the cluster_mapping table into a dict of cluster_id -> cluster_idx.

    Args:
        conn: Open database connection exposing a ``cluster_mapping`` table.

    Returns:
        dict: Keys are cluster ids coerced to ``str``; values are the stored
        ``cluster_idx`` entries.
    """
    cur = conn.cursor()
    cur.execute('SELECT cluster_id, cluster_idx FROM cluster_mapping')
    mapping = {}
    for raw_cluster_id, cluster_idx in cur.fetchall():
        # Normalise ids to strings so lookups agree with the rest of the notebook.
        mapping[str(raw_cluster_id)] = cluster_idx
    return mapping

def get_cluster_businesses(cluster_id, conn):
    """Fetch every (business_id, stars_review) row indexed for one cluster.

    Args:
        cluster_id: Cluster identifier; converted to ``str`` for the query.
        conn: Open database connection exposing ``cluster_item_index``.

    Returns:
        list[tuple]: Rows of ``(business_id, stars_review)``; empty when the
        cluster has no indexed businesses.
    """
    query = 'SELECT business_id, stars_review FROM cluster_item_index WHERE cluster_id = ?'
    params = (str(cluster_id),)
    cur = conn.cursor()
    cur.execute(query, params)
    return cur.fetchall()

def get_top_k_similar_clusters(cluster_id, cluster_mapping, k, conn):
    """Return the k clusters most similar to ``cluster_id``, best first.

    Args:
        cluster_id: Cluster to look up; must be a key of ``cluster_mapping``.
        cluster_mapping: dict of cluster_id -> cluster_idx.
        k: Maximum number of neighbours to return; ``k <= 0`` yields ``[]``.
        conn: Open database connection exposing ``cluster_cluster_similarity``.

    Returns:
        list[tuple]: ``(cluster_id, score)`` pairs in descending score order
        (ties keep their stored order). An index with no entry in
        ``cluster_mapping`` is reported as ``"Unknown"``. Returns ``[]`` when
        the cluster is ``None``, unmapped, or has no stored similarity row.
    """
    import heapq

    if cluster_id is None or cluster_id not in cluster_mapping:
        return []
    cursor = conn.cursor()
    cursor.execute('SELECT similarity_vector FROM cluster_cluster_similarity WHERE cluster_id = ?', (str(cluster_id),))
    row = cursor.fetchone()
    if row is None:
        return []
    # SECURITY NOTE: pickle.loads executes arbitrary code from the payload; this
    # is only acceptable because the database is produced by our own indexing
    # script. Never point this at an untrusted .db file.
    indices, data = pickle.loads(row[0])
    # nlargest selects the k best pairs without sorting the whole vector and is
    # documented as equivalent to sorted(..., reverse=True)[:k].
    top_k = heapq.nlargest(k, zip(indices, data), key=lambda pair: pair[1])
    # Invert the mapping (cluster_idx -> cluster_id) to translate stored indices.
    idx_to_cluster = {idx: str(cid) for cid, idx in cluster_mapping.items()}
    return [(idx_to_cluster.get(idx, "Unknown"), score) for idx, score in top_k]

def predict_cluster_interests_clustercf(cluster_id, user_id, user_interactions, user_to_cluster_dict, cluster_mapping, conn, k_clusters=10, k_items=300, exclude_interacted=False):
    """Rank businesses for a cluster by similarity-weighted ratings of its neighbours.

    Each business rated by one of the ``k_clusters`` most similar clusters
    receives ``rating * similarity``, summed over all neighbours that rated it.

    Args:
        cluster_id: Cluster to recommend for.
        user_id: Representative user; must map to ``cluster_id`` in
            ``user_to_cluster_dict`` or no recommendations are produced.
        user_interactions: dict of user_id -> set of business_ids the user has
            interacted with. Only consulted when ``exclude_interacted`` is true.
        user_to_cluster_dict: dict of user_id -> cluster_id.
        cluster_mapping: dict of cluster_id -> cluster_idx.
        conn: Open database connection to the ClusterCF index.
        k_clusters: Number of similar clusters to aggregate over.
        k_items: Maximum number of recommendations to return.
        exclude_interacted: When true, drop businesses the user already
            interacted with. Defaults to False, which leaves every user of a
            cluster with the same recommendations.

    Returns:
        list[tuple]: Up to ``k_items`` ``(business_id, score)`` pairs in
        descending score order; ``[]`` when the user/cluster pair is invalid.
    """
    # Skip when there is no cluster or the user is not assigned to this cluster.
    if cluster_id is None or user_to_cluster_dict.get(user_id) != cluster_id:
        return []
    already_seen = user_interactions.get(user_id, set()) if exclude_interacted else ()
    scores = {}
    for similar_cluster_id, similarity_score in get_top_k_similar_clusters(cluster_id, cluster_mapping, k_clusters, conn):
        # Indices that could not be mapped back to a cluster carry no ratings.
        if similar_cluster_id == "Unknown":
            continue
        for business_id, rating in get_cluster_businesses(similar_cluster_id, conn):
            if business_id in already_seen:
                continue
            contribution = rating * similarity_score
            if business_id in scores:
                scores[business_id] += contribution
            else:
                scores[business_id] = contribution
    ranked = sorted(scores.items(), key=lambda item: -item[1])
    return ranked[:k_items]

def simulate_recommendations_clustercf(test_data_grouped, user_to_cluster, cluster_mapping, user_business, conn, k_clusters=10, k_items=300, num_clusters=1000):
    """Generate recommendations for up to ``num_clusters`` test clusters.

    Args:
        test_data_grouped: DataFrame with a ``cluster_id`` column, one row per
            test cluster.
        user_to_cluster: DataFrame with ``user_id`` and ``cluster_id`` columns.
        cluster_mapping: dict of cluster_id -> cluster_idx; clusters absent
            from it are skipped.
        user_business: DataFrame with ``user_id`` and ``business_id`` columns.
        conn: Open database connection to the ClusterCF index.
        k_clusters: Number of similar clusters used per recommendation.
        k_items: Maximum number of recommendations per cluster.
        num_clusters: Upper bound on how many test clusters are processed.

    Returns:
        dict: cluster_id -> ``(business_ids, scores)`` as two parallel lists.
    """
    recommendations = {}
    # The cap is computed on the unfiltered frame, as before; clamp at zero so
    # a non-positive num_clusters processes nothing.
    total_clusters = max(0, min(num_clusters, len(test_data_grouped)))
    # Precompute user interactions
    user_interactions = user_business.groupby('user_id')['business_id'].apply(set).to_dict()
    # Filter test_data_grouped to clusters known to the index
    valid_clusters = set(cluster_mapping.keys())
    test_data_grouped = test_data_grouped[test_data_grouped['cluster_id'].isin(valid_clusters)]
    user_to_cluster_dict = user_to_cluster.set_index('user_id')['cluster_id'].to_dict()
    # First user listed for each cluster, built once instead of scanning the
    # whole frame twice for every cluster inside the loop.
    representative_user = (
        user_to_cluster.drop_duplicates(subset='cluster_id')
        .set_index('cluster_id')['user_id']
        .to_dict()
    )
    for cluster_id in test_data_grouped['cluster_id'].iloc[:total_clusters]:
        user_id = representative_user.get(cluster_id)
        if user_id is None:
            continue
        recommendation = predict_cluster_interests_clustercf(cluster_id, user_id, user_interactions, user_to_cluster_dict, cluster_mapping, conn, k_clusters, k_items)
        if recommendation:
            business_ids, scores = zip(*recommendation)
        else:
            business_ids, scores = [], []
        recommendations[cluster_id] = (list(business_ids), list(scores))
    return recommendations

In [14]:
def check_retrieval_recommendations(recommendations, test_data, test_data_grouped, key_column='cluster_id', pos=4):
    """Score retrieved recommendations against held-out ratings.

    A test pair counts as positive when its rating is ``>= pos``. A pair is
    "predicted positive" when its business appears in the recommendation list
    for its key. Keys without an entry in ``recommendations`` contribute no
    counts, only a rank of 0.

    Args:
        recommendations: dict of key -> ``(business_ids, scores)``.
        test_data: DataFrame with ``key_column``, ``business_id`` and
            ``stars_review`` columns.
        test_data_grouped: DataFrame with ``key_column`` and a ``business_id``
            column holding a list of business ids per row.
        key_column: Name of the grouping column.
        pos: Minimum rating considered positive.

    Returns:
        tuple: ``(true_positive, true_negative, false_positive,
        false_negative, total, total_positive, ranks)``. ``ranks`` has one
        entry per row of ``test_data_grouped``: the 1-based position of the
        last test business found in that row's recommendations, or 0 if none.
    """
    # (key, business_id) -> rating, built once. The first occurrence wins,
    # matching a "take the first matching row" lookup; missing pairs rate as 0.
    ratings = {}
    for key, business_id, stars in zip(test_data[key_column], test_data['business_id'], test_data['stars_review']):
        ratings.setdefault((key, business_id), stars)

    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for key, business_ids in zip(test_data_grouped[key_column], test_data_grouped['business_id']):
        rank = 0
        if key in recommendations:
            # business_id -> 1-based position of its first occurrence.
            position_of = {}
            for position, recommended_id in enumerate(recommendations[key][0], start=1):
                position_of.setdefault(recommended_id, position)
            for business_id in business_ids:
                star_rating = ratings.get((key, business_id), 0)
                if star_rating >= pos:
                    total_positive += 1
                if business_id in position_of:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    # NOTE(review): overwritten on every hit, so the last hit's
                    # rank is what feeds the reciprocal-rank metric — confirm
                    # this is intended.
                    rank = position_of[business_id]
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
                total += 1
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [15]:
# Run the retrieval evaluation end to end:
# 1) load the cluster_id -> cluster_idx mapping from the database,
# 2) generate recommendations for up to 1000 test clusters,
# 3) count hits and misses against the held-out test pairs,
# 4) turn the counts into metric tables (compute_evaluation_metric is an
#    external helper; the three returned objects are printed in the next cell).
cluster_mapping = retrieve_cluster_mapping(conn)
retrieval_recommendations = simulate_recommendations_clustercf(test_data_grouped, user_to_cluster, cluster_mapping, user_business, conn, k_clusters=10, k_items=300, num_clusters=1000)
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_retrieval_recommendations(retrieval_recommendations, test_data, test_data_grouped)
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [16]:
# Print the retrieval results: test-set statistics, metrics, confusion matrix.
for report_title, report_table in (
    ("Testing Data Statistics", background_stats),
    ("\nRetrieval Evaluation Metrics", evaluation_metric),
    ("\nRetrieval Confusion Matrix", confusion_matrix),
):
    print(report_title)
    print(report_table)

Testing Data Statistics
   Total Positive  Total Negative  Total     Ratio
0           18437           18966  37403  0.492928

Retrieval Evaluation Metrics
   Accuracy  Precision  Recall  F1 Score  F-beta Score  Mean Reciprocal Rank
0    0.5149     0.5844  0.0546    0.0999        0.0667                0.0299

Retrieval Confusion Matrix
   True Positive  True Negative  False Positive  False Negative
0           1007          18250             716           17430


# Prediction Evaluation

In [17]:
# Predict rating for a specific business
def get_business_interest_clustercf(cluster_id, business_id, cluster_mapping, conn, k=10):
    """Predict a cluster's rating for one business from its most similar clusters.

    The prediction is the similarity-weighted mean of the ratings that the
    ``k`` most similar clusters gave to ``business_id``.

    Args:
        cluster_id: Cluster to predict for.
        business_id: Business whose rating is predicted.
        cluster_mapping: dict of cluster_id -> cluster_idx.
        conn: Open database connection to the ClusterCF index.
        k: Number of similar clusters to consult.

    Returns:
        The predicted rating, or a sentinel:
        ``0`` when the cluster is ``None``, unmapped, or has no indexed
        businesses; ``-1`` when there are no similar clusters or none of them
        rated the business.
        NOTE(review): only ``-1`` signals "no prediction"; the ``0`` sentinel
        reads as a very low rating to any caller that thresholds the result —
        confirm this asymmetry is intended.
    """
    if cluster_id is None or cluster_id not in cluster_mapping:
        return 0
    cursor = conn.cursor()
    # Existence probe: one row is enough, no need to load the whole cluster.
    cursor.execute('SELECT 1 FROM cluster_item_index WHERE cluster_id = ? LIMIT 1', (str(cluster_id),))
    if cursor.fetchone() is None:
        return 0
    similar_clusters = get_top_k_similar_clusters(cluster_id, cluster_mapping, k, conn)
    if not similar_clusters:
        return -1
    weighted_sum = 0.0
    similarity_sum = 0.0
    for similar_cluster_id, similarity in similar_clusters:
        # Indices that could not be mapped back to a cluster carry no ratings.
        if similar_cluster_id == "Unknown":
            continue
        cursor.execute('SELECT stars_review FROM cluster_item_index WHERE cluster_id = ? AND business_id = ?',
                       (str(similar_cluster_id), business_id))
        row = cursor.fetchone()
        if row is not None:
            weighted_sum += similarity * row[0]
            similarity_sum += similarity
    # No neighbour rated the business (or the weights cancel): no prediction.
    if similarity_sum == 0:
        return -1
    return weighted_sum / similarity_sum

def predict_recommendations_clustercf(test_data, test_data_grouped, cluster_mapping, conn, pos=4, num_clusters=1000, k_clusters=10):
    """Predict ratings for test cluster-business pairs and binarise them.

    Args:
        test_data: DataFrame with ``cluster_id``, ``business_id`` and
            ``stars_review`` columns (the ground truth).
        test_data_grouped: DataFrame with ``cluster_id`` and a ``business_id``
            column holding a list of business ids per row.
        cluster_mapping: dict of cluster_id -> cluster_idx.
        conn: Open database connection to the ClusterCF index.
        pos: Minimum rating considered positive, for actual and predicted alike.
        num_clusters: Upper bound on how many grouped rows are evaluated.
        k_clusters: Number of similar clusters used for each prediction.

    Returns:
        tuple: ``(predicted_labels, actual_labels, positive_count,
        negative_count, null_count, unrated_count)``. The label arrays are
        ``int8`` and cover only pairs that have both a ground-truth rating and
        a prediction. ``positive_count``/``negative_count`` cover every pair
        with a ground-truth rating; ``null_count`` counts pairs without one;
        ``unrated_count`` counts pairs whose prediction was the -1 sentinel.
    """
    predicted_labels = []
    actual_labels = []
    unrated_count = 0
    positive_count = 0
    negative_count = 0
    null_count = 0
    # (cluster_id, business_id) -> actual rating, built column-wise.
    actual_by_pair = dict(zip(
        zip(test_data['cluster_id'], test_data['business_id']),
        test_data['stars_review'],
    ))
    limit = max(0, min(num_clusters, len(test_data_grouped)))
    evaluated = test_data_grouped.iloc[:limit]
    for cluster_id, business_ids in zip(evaluated['cluster_id'], evaluated['business_id']):
        for business_id in business_ids:
            actual_rating = actual_by_pair.get((cluster_id, business_id), None)
            # Check ground truth first so no database work is spent on pairs
            # that cannot be scored.
            if actual_rating is None:
                null_count += 1
                continue
            if actual_rating >= pos:
                positive_count += 1
            else:
                negative_count += 1
            predicted_rating = get_business_interest_clustercf(cluster_id, business_id, cluster_mapping, conn, k=k_clusters)
            if predicted_rating == -1:
                unrated_count += 1
                continue
            predicted_labels.append(predicted_rating >= pos)
            actual_labels.append(actual_rating >= pos)
    predicted_labels = np.array(predicted_labels, dtype=np.int8)
    actual_labels = np.array(actual_labels, dtype=np.int8)
    return predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count

In [18]:
# Run the prediction evaluation: label every scorable test pair, then hand the
# labels plus the background counts to compute_prediction_evaluation (an
# external helper; its three returned objects are printed in the next cell).
predicted_labels, actual_labels, positive_count, negative_count, null_count, unrated_count = predict_recommendations_clustercf(test_data, test_data_grouped, cluster_mapping, conn, pos=4)
# null_count is returned above but not forwarded to the metric helper.
prediction_lst = [positive_count, negative_count, unrated_count]
evaluation_metric, confusion_matrix, background_stats = compute_prediction_evaluation(actual_labels, predicted_labels, prediction_lst, beta=2)

In [19]:
# Print the prediction results: test-set statistics, metrics, confusion matrix.
for report_title, report_table in (
    ("\nTesting Data Statistics", background_stats),
    ("\nPrediction Evaluation Metrics", evaluation_metric),
    ("\nPrediction Confusion Matrix", confusion_matrix),
):
    print(report_title)
    print(report_table)


Testing Data Statistics
   Total Positive  Total Negative  Total     Ratio
0           18437           18966  37403  0.492928

Prediction Evaluation Metrics
   Accuracy  Precision    Recall  F1 Score  F-beta Score Mean Reciprocal Rank  \
0    0.6904      0.624  0.905936  0.738991      0.830856                 None   

   Unrated Count  
0       0.032832  

Prediction Confusion Matrix
   True Positive  True Negative  False Positive  False Negative
0          16703           8271           10063            1138


In [None]:
# Close connection: release the SQLite handle once all evaluation cells have
# run. Any database-backed cell re-run after this needs a fresh connection.
conn.close()