### User-based Collaborative Filtering - Prediction Evaluation
This notebook evaluates, offline, the rating-prediction performance of the user-based collaborative filtering model. It is separate from the real-time recommendation workflow.

#### Pre-requisites
- The model is trained and the index is created in the notebook `UserCF Model & Index.ipynb`.
- The index is saved in the file `yelp_UserCF.db` in the same directory as this notebook.

In [1]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import confusion_matrix

In [2]:
# Location of the processed Yelp SQLite databases and the files to read from it
db_folder = "../../data/processed_data/yelp_data/"
db_files = ["yelp_business_data.db", "yelp_review_data.db"]
# Plain string concatenation is enough here: db_folder already ends with "/"
db_paths = [f"{db_folder}{file_name}" for file_name in db_files]

In [3]:
def load_data_from_db(paths=None):
    """Load the Yelp business, category and review tables into DataFrames.

    Parameters
    ----------
    paths : sequence of str, optional
        Two SQLite database paths, in this order: the database that holds
        ``business_details`` and ``business_categories``, then the database
        that holds ``review_data``. Defaults to the notebook-level ``db_paths``.

    Returns
    -------
    dict
        ``{'business': DataFrame, 'categories': DataFrame, 'review': DataFrame}``.

    Raises
    ------
    sqlite3.Error
        If a database cannot be opened.
    """
    if paths is None:
        paths = db_paths

    data = {}
    conns = []
    try:
        # Open the connections inside the try block so that a failure while
        # opening a later database still closes the ones already opened.
        for db_path in paths:
            conns.append(sqlite3.connect(db_path))

        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        # Close every connection that was successfully opened
        for conn in conns:
            conn.close()
    return data

In [39]:
# Read every table once, then report how many rows each one holds
yelp_data = load_data_from_db()

for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

df_business = yelp_data["business"]
df_review = yelp_data["review"]

# Outer join on business_id: rows present on only one side are kept (with NaN in
# the other side's columns); overlapping column names get the
# _business / _review suffixes.
df_concat = pd.merge(
    df_business,
    df_review,
    how='outer',
    on='business_id',
    suffixes=('_business', '_review'),
)

user_business = df_concat[["user_id", "business_id", "stars_review"]]

# Map each distinct id to its position in order of first appearance.
# NOTE(review): get_top_k_similar_users resolves similarity indices through
# user_mapping, so this ordering presumably has to match the one used when the
# index was built - confirm against `UserCF Model & Index.ipynb`.
business_mapping = {
    business_id: position
    for position, business_id in enumerate(user_business['business_id'].unique())
}
user_mapping = {
    user_id_value: position
    for position, user_id_value in enumerate(user_business['user_id'].unique())
}

# 80/20 train/test split with a fixed seed for reproducibility
train_data, test_data = train_test_split(user_business, random_state=42, test_size=0.2)

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [25]:
# Open the pre-built UserCF index in read-only mode. A plain
# sqlite3.connect(path) silently creates an empty database when the file is
# missing, which would only surface later as a "no such table" error; with
# mode=ro a missing index file raises sqlite3.OperationalError right here.
db_path = './yelp_UserCF.db'
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)

In [26]:
def get_user_businesses(user_id, conn):
    """Return the rows stored in ``user_item_index`` for one user.

    Parameters
    ----------
    user_id : the user to look up.
    conn : sqlite3.Connection to the index database.

    Returns
    -------
    list of (business_id, stars_review) tuples; empty when the user has no rows.
    """
    query = '''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?'''
    # Connection.execute creates the cursor for us; the id is bound as a parameter
    rows = conn.execute(query, (user_id,)).fetchall()
    return rows

In [37]:
def get_top_k_similar_users(user_id, k, conn):
    """Return the ``k`` most similar users to ``user_id`` from the stored index.

    The ``similarity_vector`` column of ``user_user_similarity`` is unpickled
    into a pair ``(indices, data)``; each index is resolved to a user id through
    the key order of the notebook-level ``user_mapping``.

    Parameters
    ----------
    user_id : the user whose neighbours are requested.
    k : maximum number of neighbours to return.
    conn : sqlite3.Connection to the index database.

    Returns
    -------
    list of (user_id, similarity) tuples, highest similarity first; an empty
    list when the user has no row in the index or the stored vector is empty.
    """
    cursor = conn.cursor()
    cursor.execute('''SELECT similarity_vector FROM user_user_similarity WHERE user_id = ?''', (user_id,))
    result = cursor.fetchone()

    if result is None:
        return []

    # NOTE(security): pickle.loads can execute arbitrary code. This is only
    # acceptable because the index database is produced locally by the
    # companion notebook; never point this at an untrusted file.
    similarity_vector = pickle.loads(result[0])
    indices, data = similarity_vector

    # Highest similarity first, truncated to the k best
    top_k = sorted(zip(indices, data), key=lambda pair: -pair[1])[:k]
    if not top_k:
        return []

    # Build the index -> user_id lookup once instead of once per neighbour
    # (materialising all keys for every neighbour was accidental O(k * N)).
    index_to_user = list(user_mapping)

    return [(index_to_user[idx], score) for idx, score in top_k]

In [28]:
def get_business_interest(user_id, business_id, conn, k=100):
    """Predict the rating ``user_id`` would give ``business_id``.

    Mean-centred user-based prediction over the top-``k`` similar users that
    have a stored rating for ``business_id``::

        user_avg + sum(sim * (neighbour_rating - neighbour_avg)) / sum(|sim|)

    Parameters
    ----------
    user_id : the user to predict for.
    business_id : the business to predict a rating for.
    conn : sqlite3.Connection to the index database.
    k : int, default 100
        Number of most-similar users to consider.

    Returns
    -------
    0 when the user has no stored interactions; the user's own average rating
    when no considered neighbour rated the business (or all their similarities
    are zero); otherwise the mean-centred prediction.
    """
    # Get the businesses the user has interacted with
    user_businesses = get_user_businesses(user_id, conn)

    if not user_businesses:
        return 0  # User has no previous interactions

    # business_id -> rating; duplicate business ids collapse to the last row
    user_ratings_dict = dict(user_businesses)
    user_avg_rating = np.mean(list(user_ratings_dict.values()))

    similar_users = get_top_k_similar_users(user_id, k=k, conn=conn)

    weighted_sum = 0
    similarity_sum = 0

    for similar_user, similarity in similar_users:
        neighbour_ratings = dict(get_user_businesses(similar_user, conn))

        # Only neighbours that rated this business contribute
        if business_id not in neighbour_ratings:
            continue

        neighbour_avg = np.mean(list(neighbour_ratings.values()))
        weighted_sum += similarity * (neighbour_ratings[business_id] - neighbour_avg)
        # Normalise by absolute similarity: with signed weights, negative
        # similarities could cancel the denominator and blow up or flip the
        # prediction. Identical to the signed sum when all weights are >= 0.
        similarity_sum += abs(similarity)

    # Fall back to the user's own average when nothing usable was found
    if similarity_sum == 0:
        return user_avg_rating

    return user_avg_rating + (weighted_sum / similarity_sum)

In [29]:
def predict_user_interests(user_id, k=10, conn=conn):
    """Recommend up to ``k`` businesses for ``user_id`` from its similar users.

    Every business rated by one of the top-``k`` similar users, and not already
    rated by ``user_id``, is scored with the sum of ``similarity * rating`` over
    those neighbours.

    Parameters
    ----------
    user_id : the user to recommend for.
    k : int, default 10
        Used both as the number of similar users consulted and as the maximum
        number of recommendations returned.
    conn : sqlite3.Connection
        Defaults to the notebook-level ``conn`` as it was when this function
        was defined (default arguments are bound at definition time).

    Returns
    -------
    list of (business_id, score) tuples, highest score first.
    """
    # Build the set of already-rated businesses once; the membership test in
    # the inner loop is then O(1) instead of rebuilding a list every iteration.
    already_rated = {biz_id for biz_id, _ in get_user_businesses(user_id, conn)}

    recommended_businesses = {}

    for similar_user_id, similarity_score in get_top_k_similar_users(user_id, k, conn):
        for business_id, rating in get_user_businesses(similar_user_id, conn):
            # Only recommend businesses not already rated by the user
            if business_id in already_rated:
                continue
            if business_id in recommended_businesses:
                recommended_businesses[business_id] += similarity_score * rating
            else:
                recommended_businesses[business_id] = similarity_score * rating

    # Sort recommendations by descending score and keep the best k
    ranked = sorted(recommended_businesses.items(), key=lambda item: -item[1])
    return ranked[:k]

In [31]:
# Class balance of the held-out reviews: >= 4 stars counts as positive and
# <= 2 stars as negative; anything in between belongs to neither group but is
# still part of the total.
positive_reviews = test_data.loc[test_data['stars_review'] >= 4]
negative_reviews = test_data.loc[test_data['stars_review'] <= 2]

print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")

# With no negative reviews the ratio is undefined, so report infinity instead
ratio = (
    len(positive_reviews) / len(negative_reviews)
    if len(negative_reviews) > 0
    else float('inf')
)

print(f"Ratio of positive to negative reviews: {ratio:.2f}")

Number of positive reviews: 136473
Number of negative reviews: 32929
Total number of reviews: 197147
Ratio of positive to negative reviews: 4.14


In [32]:
def balance_test_data(positive_reviews, negative_reviews):
    """Build a class-balanced, shuffled test set and print its statistics.

    The larger class is down-sampled (without replacement, fixed seed) to the
    size of the smaller one. Previously only the positive class could be
    down-sampled, which raised ``ValueError`` whenever negatives outnumbered
    positives.

    Parameters
    ----------
    positive_reviews : pandas.DataFrame
        Reviews considered positive; must contain a ``stars_review`` column.
    negative_reviews : pandas.DataFrame
        Reviews considered negative; must contain a ``stars_review`` column.

    Returns
    -------
    pandas.DataFrame
        The balanced rows in shuffled order with a fresh 0..n-1 index.
    """
    n_positive = len(positive_reviews)
    n_negative = len(negative_reviews)

    if n_positive >= n_negative:
        # Usual case: down-sample the positives to the negative count
        positive_part = positive_reviews.sample(n=n_negative, random_state=42)
        negative_part = negative_reviews
    else:
        # Negatives are the majority: down-sample them instead
        positive_part = positive_reviews
        negative_part = negative_reviews.sample(n=n_positive, random_state=42)

    # Combine both classes, then shuffle so they are interleaved
    balanced_test_data = pd.concat([positive_part, negative_part], ignore_index=True)
    balanced_test_data = balanced_test_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # New statistics for the balanced test data
    positive_reviews = balanced_test_data[balanced_test_data['stars_review'] >= 4]
    negative_reviews = balanced_test_data[balanced_test_data['stars_review'] <= 2]

    print(f"Number of positive reviews: {len(positive_reviews)}")
    print(f"Number of negative reviews: {len(negative_reviews)}")
    print(f"Total number of reviews: {len(balanced_test_data)}")

    # Avoid division by zero
    if len(negative_reviews) > 0:
        ratio = len(positive_reviews) / len(negative_reviews)
    else:
        ratio = float('inf')

    print(f"Ratio of positive to negative reviews: {ratio:.2f}")
    return balanced_test_data

In [33]:
# Optional: uncomment the next line to evaluate on a class-balanced test set
# instead of the original one
# test_data = balance_test_data(positive_reviews, negative_reviews)

# One row per user, holding the list of business ids that user has in the test set
test_data_grouped = (
    test_data
    .groupby('user_id')['business_id']
    .apply(list)
    .reset_index()
)

In [34]:
# Evaluate rating predictions for (at most) the first 1000 users of the grouped
# test set, turning both predicted and actual ratings into binary labels
# (rating >= 4 is positive).
#
# NOTE(review): get_business_interest as defined in this notebook never returns
# -1 (it returns 0 for users with no stored interactions), so the
# `predicted_rating == -1` skip below never triggers and those predictions are
# scored as negative - confirm which behaviour is intended.
# NOTE(review): the index in yelp_UserCF.db is built in another notebook; if it
# was built from ratings that also appear in test_data, these metrics are
# optimistic - confirm the index only covers the training split.
predicted_labels = []
actual_labels = []
unrated_count = 0  # Pairs skipped: no actual rating found, or prediction == -1
positive_count = 0
negative_count = 0
null_count = 0
k = min(1000, len(test_data_grouped))  # Ensure k does not exceed available data

# (user_id, business_id) -> actual stars, for O(1) lookups. Zipping the columns
# avoids iterrows(), which builds a Series object for every row of the test set.
test_data_dict = {
    (uid, bid): stars
    for uid, bid, stars in zip(
        test_data['user_id'], test_data['business_id'], test_data['stars_review']
    )
}

# Iterate over the first k users and their test-set businesses
evaluated_users = test_data_grouped.head(k)
for user_id, business_ids in zip(evaluated_users['user_id'], evaluated_users['business_id']):
    for business_id in business_ids:
        predicted_rating = get_business_interest(user_id, business_id, conn)

        actual_rating = test_data_dict.get((user_id, business_id), None)

        # Review breakdown is counted for every pair, including skipped ones
        if actual_rating is None:
            null_count += 1
        elif actual_rating >= 4:
            positive_count += 1
        else:
            negative_count += 1

        if actual_rating is None or predicted_rating == -1:
            unrated_count += 1
            continue  # Skip evaluation for unrated items

        # Convert ratings to binary labels (positive: 1, negative: 0)
        predicted_labels.append(predicted_rating >= 4)
        actual_labels.append(actual_rating >= 4)

# Convert to NumPy arrays for the metric functions
predicted_labels = np.array(predicted_labels, dtype=np.int8)
actual_labels = np.array(actual_labels, dtype=np.int8)

In [35]:
# Score the binary predictions (1 = rating >= 4) against the actual labels
if len(predicted_labels) > 0:
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels)
    recall = recall_score(actual_labels, predicted_labels)
    f1 = f1_score(actual_labels, predicted_labels)
    f_beta = fbeta_score(actual_labels, predicted_labels, beta=2)  # Adjust beta as needed

    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # otherwise ravel() yields a single value and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(actual_labels, predicted_labels, labels=[0, 1]).ravel()
else:
    # No valid predictions: define every metric printed below (f_beta was
    # previously left undefined here, causing a NameError).
    accuracy = precision = recall = f1 = f_beta = 0
    tn = fp = fn = tp = 0

Total = len(predicted_labels) + unrated_count
# Denominator for the percentages; avoids ZeroDivisionError when nothing was
# evaluated (all counts are 0 in that case, so every share prints as 0.00%).
safe_total = max(Total, 1)

# Print results
print("Model Evaluation Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"F-beta (β=2): {f_beta:.4f}")
print(f"Unrated items (predicted -1): {unrated_count} ({unrated_count / safe_total:.2%})")

# Print confusion matrix values
print("\nConfusion Matrix Breakdown:")
print(f"True Positives (TP):  {tp}")   # Model correctly predicted positive
print(f"True Negatives (TN):  {tn}")   # Model correctly predicted negative
print(f"False Positives (FP): {fp}")   # Model incorrectly predicted positive
print(f"False Negatives (FN): {fn}")   # Model incorrectly predicted negative

print("\nReview Breakdown:")
print(f"Positive reviews: {positive_count} ({positive_count / safe_total:.2%})")
print(f"Negative reviews: {negative_count} ({negative_count / safe_total:.2%})")
print(f"Null reviews: {null_count} ({null_count / safe_total:.2%})")

Model Evaluation Metrics:
Accuracy:  0.5585
Precision: 0.8007
Recall:    0.4811
F1-score:  0.6010
F-beta (β=2): 0.5228
Unrated items (predicted -1): 0 (0.00%)

Confusion Matrix Breakdown:
True Positives (TP):  876
True Negatives (TN):  595
False Positives (FP): 218
False Negatives (FN): 945

Review Breakdown:
Positive reviews: 1821 (69.13%)
Negative reviews: 813 (30.87%)
Null reviews: 0 (0.00%)


In [36]:
# Release the connection to the UserCF index database; the helper functions
# above cannot be called with this connection after this cell has run.
conn.close()