### Item-based Collaborative Filtering - Prediction Evaluation
This notebook evaluates the prediction performance of the item-based collaborative filtering model on a held-out test split. It does not cover real-time recommendation.

#### Pre-requisites
- The model is trained and the index is created in the notebook `ItemCF Model & Index.ipynb`.
- The index is saved in the file `yelp_ItemCF.db` in the same directory as this notebook.

In [42]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import confusion_matrix

In [43]:
# Folder holding the processed Yelp SQLite databases (note the trailing slash)
# and the database files that are read from it
db_folder = '../../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']

# Full path per database, in the same order as db_files
db_paths = [f"{db_folder}{file_name}" for file_name in db_files]

In [44]:
# Connect to the databases and load data
def load_data_from_db(paths=None):
    """Load the business, category and review tables into DataFrames.

    Parameters
    ----------
    paths : sequence of str, optional
        SQLite database paths. The first must contain the tables
        ``business_details`` and ``business_categories``; the second must
        contain ``review_data``. Defaults to the module-level ``db_paths``.

    Returns
    -------
    dict
        ``{'business': DataFrame, 'categories': DataFrame, 'review': DataFrame}``

    Every connection that was opened is closed again, even if a later
    connection attempt or one of the queries raises.
    """
    if paths is None:
        paths = db_paths

    data = {}
    conns = []
    try:
        # Connect inside the try block so that a failure on a later path
        # still closes the connections opened before it.
        for db_path in paths:
            conns.append(sqlite3.connect(db_path))

        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        for conn in conns:
            conn.close()
    return data

In [45]:
# Read every table once; the result is a dict keyed by table name
yelp_data = load_data_from_db()

# Sanity check: report the number of rows per loaded table
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

df_business, df_review = yelp_data["business"], yelp_data["review"]

# Outer join on business_id. Columns present in both frames get the
# '_business' / '_review' suffix, which is where 'stars_review' comes from.
# NOTE(review): an outer join also keeps businesses without any review, so
# user_business (and therefore the split below) can contain rows whose
# user_id / stars_review are NaN. Left as is because the row set and seed
# determine the split — presumably the saved index was built from the same
# split; confirm before changing.
df_concat = pd.merge(
    df_business,
    df_review,
    on='business_id',
    how='outer',
    suffixes=('_business', '_review'),
)

# Only the (user, business, rating) triples are needed for the evaluation
user_business = df_concat[["user_id", "business_id", "stars_review"]]

# business_id -> integer position, in order of first appearance.
# get_top_k_similar_businesses uses this ordering to translate the indices
# stored in the similarity vectors back into business ids.
unique_business_ids = user_business['business_id'].unique()
business_mapping = dict(zip(unique_business_ids, range(len(unique_business_ids))))

# 80/20 train/test split with a fixed seed for reproducibility
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [46]:
# Path of the pre-built ItemCF index, relative to this notebook
db_path = './yelp_ItemCF.db'

# Single shared connection that is passed to the lookup helpers below.
# NOTE(review): sqlite3.connect creates an empty database file when the path
# does not exist, so a missing index only shows up later as a
# "no such table" error from the first query.
conn = sqlite3.connect(database=db_path)

In [47]:
# Function to get businesses a user interacted with
def get_user_businesses(user_id, conn):
    """Fetch every (business_id, stars_review) row stored for one user.

    Args:
        user_id: value matched against the ``user_id`` column of the
            ``user_item_index`` table.
        conn: open ``sqlite3`` connection to the database holding that table.

    Returns:
        A list of ``(business_id, stars_review)`` tuples; empty when the user
        has no rows in the table.
    """
    query = '''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?'''
    rows = conn.execute(query, (user_id,)).fetchall()
    return rows

In [48]:
# Function to get top-k similar businesses for a given business
def get_top_k_similar_businesses(business_id, k, conn):
    """Return the k businesses most similar to ``business_id``.

    Args:
        business_id: value matched against ``item_id`` in the
            ``item_item_similarity`` table.
        k: maximum number of neighbours to return.
        conn: open ``sqlite3`` connection to the index database.

    Returns:
        A list of ``(business_id, similarity)`` tuples sorted by descending
        similarity; empty when the business has no row in the table.

    The stored ``similarity_vector`` is a pickled ``(indices, data)`` pair.
    Each index is translated back to a business id through the insertion
    order of the module-level ``business_mapping``.
    """
    cursor = conn.cursor()
    cursor.execute('''SELECT similarity_vector FROM item_item_similarity WHERE item_id = ?''', (business_id,))
    result = cursor.fetchone()

    if result is None:
        return []

    # NOTE(security): pickle.loads can execute arbitrary code; only use this
    # with an index database produced by a trusted source.
    indices, data = pickle.loads(result[0])

    # Highest similarity first, truncated to k entries
    top_k = sorted(zip(indices, data), key=lambda pair: -pair[1])[:k]

    # Materialise the position -> business id list once per call instead of
    # once per neighbour (the mapping has one entry per business).
    index_to_business = list(business_mapping)

    return [(index_to_business[idx], score) for idx, score in top_k]

In [49]:
def get_business_interest(user_id, business_id, conn, k=100):
    """Predict the rating ``user_id`` would give ``business_id``.

    The prediction is the user's mean rating plus the similarity-weighted
    average of the user's mean-centred ratings on the top-k neighbours of
    ``business_id`` that the user has rated.

    Args:
        user_id: user whose rating is predicted.
        business_id: target business.
        conn: open ``sqlite3`` connection to the index database.
        k: number of most-similar businesses to consider (default 100).

    Returns:
        * ``0`` when the user has no stored interactions,
        * ``-1`` when no prediction can be made because none of the k
          neighbours was rated by the user (or their similarities sum to 0),
        * otherwise the predicted rating as a float.

    Note that the evaluation loop in this notebook skips ``-1`` results but
    treats ``0`` as an ordinary (negative) prediction.
    """
    user_businesses = get_user_businesses(user_id, conn)

    if not user_businesses:
        return 0  # User has no previous interactions

    # business_id -> rating for constant-time membership tests
    user_ratings_dict = dict(user_businesses)

    # Baseline: the user's mean rating over everything they rated
    user_avg_rating = np.mean(list(user_ratings_dict.values()))

    # NOTE(review): if the index stores the user's rating for business_id
    # itself and the similarity vector lists the business as its own
    # neighbour, the target rating would leak into the prediction — confirm
    # how the index was built.
    similar_businesses = get_top_k_similar_businesses(business_id, k=k, conn=conn)

    weighted_sum = 0
    similarity_sum = 0

    for similar_biz, similarity in similar_businesses:
        if similar_biz in user_ratings_dict:
            rating = user_ratings_dict[similar_biz]
            weighted_sum += similarity * (rating - user_avg_rating)
            similarity_sum += similarity

    # Sentinel for "no usable neighbour"; callers must filter it out
    if similarity_sum == 0:
        return -1
    return user_avg_rating + (weighted_sum / similarity_sum)

In [50]:
# Function to predict user interests based on similar businesses
def predict_user_interests(user_id, k=10, conn=None):
    """Rank businesses for a user by summed similarity to what they rated.

    For every business the user has interacted with, the top-k similar
    businesses are fetched and their similarity scores are accumulated per
    candidate business.

    Args:
        user_id: user to produce recommendations for.
        k: used both as the number of neighbours fetched per rated business
            and as the number of recommendations returned.
        conn: open ``sqlite3`` connection to the index database. When
            omitted, the module-level ``conn`` is looked up at call time, so
            a connection that was re-opened after this function was defined
            is picked up.

    Returns:
        Up to k ``(business_id, score)`` tuples, highest score first.

    Raises:
        ValueError: if ``conn`` is omitted and no module-level ``conn`` exists.
    """
    if conn is None:
        # Resolve lazily: a default of ``conn=conn`` would freeze whichever
        # connection object existed when the cell defining this function ran.
        conn = globals().get("conn")
        if conn is None:
            raise ValueError("No database connection given and no global 'conn' is defined.")

    user_businesses = get_user_businesses(user_id, conn)

    scores = {}
    for business_id, _ in user_businesses:
        for similar_business_id, score in get_top_k_similar_businesses(business_id, k, conn):
            scores[similar_business_id] = scores.get(similar_business_id, 0) + score

    # Highest accumulated score first
    ranked = sorted(scores.items(), key=lambda item: -item[1])

    return ranked[:k]

In [51]:
# Split the test reviews by sentiment: 4 stars and above count as positive,
# 2 stars and below as negative. Rows with any other rating fall in neither.
star_ratings = test_data['stars_review']
positive_reviews = test_data.loc[star_ratings >= 4]
negative_reviews = test_data.loc[star_ratings <= 2]

n_positive, n_negative = len(positive_reviews), len(negative_reviews)

print(f"Number of positive reviews: {n_positive}")
print(f"Number of negative reviews: {n_negative}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {n_positive / n_negative:.2f}")


Number of positive reviews: 136473
Number of negative reviews: 32929
Total number of reviews: 197147
Ratio of positive to negative reviews: 4.14


In [52]:
def balance_test_data(positive_reviews, negative_reviews, random_state=42):
    """Build a class-balanced, shuffled test set from the two review frames.

    The larger of the two frames is down-sampled to the size of the smaller
    one, the results are concatenated, shuffled and re-indexed from 0.
    Summary statistics of the balanced set are printed.

    Args:
        positive_reviews: DataFrame of positive reviews; must have a
            ``stars_review`` column.
        negative_reviews: DataFrame of negative reviews; must have a
            ``stars_review`` column.
        random_state: seed used for down-sampling and shuffling (default 42).

    Returns:
        The balanced DataFrame.
    """
    # Size of the minority class; sampling more rows than a frame holds
    # (without replacement) would raise, so never ask for more than this.
    n_per_class = min(len(positive_reviews), len(negative_reviews))

    positive_sample = positive_reviews.sample(n=n_per_class, random_state=random_state)
    if len(negative_reviews) > n_per_class:
        negative_sample = negative_reviews.sample(n=n_per_class, random_state=random_state)
    else:
        negative_sample = negative_reviews

    # Combine both classes, then shuffle so they are interleaved
    balanced_test_data = pd.concat([positive_sample, negative_sample], ignore_index=True)
    balanced_test_data = balanced_test_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Statistics of the balanced set
    n_positive = int((balanced_test_data['stars_review'] >= 4).sum())
    n_negative = int((balanced_test_data['stars_review'] <= 2).sum())
    # Avoid ZeroDivisionError when there are no negative reviews at all
    ratio = n_positive / n_negative if n_negative else float('nan')

    print(f"Number of positive reviews: {n_positive}")
    print(f"Number of negative reviews: {n_negative}")
    print(f"Total number of reviews: {len(balanced_test_data)}")
    print(f"Ratio of positive to negative reviews: {ratio:.2f}")
    return balanced_test_data

In [53]:
# Optional: evaluate on a class-balanced sample instead of the full test
# split by uncommenting the next line
# test_data = balance_test_data(positive_reviews, negative_reviews)

# One row per user, holding the list of business ids that user has in the
# test split
business_ids_by_user = test_data.groupby('user_id')['business_id'].apply(list)
test_data_grouped = business_ids_by_user.reset_index()


In [55]:
# Evaluate rating predictions for the first k users of the grouped test data.
# Each (user, business) pair is turned into a binary label: rating >= 4 is
# positive (1), anything lower is negative (0).
predicted_labels = []
actual_labels = []
unrated_count = 0   # pairs skipped: no ground truth, or prediction was -1
postivie_count = 0  # (sic) spelling kept because the reporting cell reads this name
negative_count = 0
null_count = 0      # pairs without a usable ground-truth rating
k = min(1000, len(test_data_grouped))  # Ensure k does not exceed available data

# (user_id, business_id) -> rating. Zipping the columns avoids the per-row
# Series construction that DataFrame.iterrows performs. For duplicate pairs
# the last rating wins.
test_data_dict = dict(
    zip(
        zip(test_data['user_id'], test_data['business_id']),
        test_data['stars_review'],
    )
)

evaluated_users = test_data_grouped.iloc[:k]

for user_id, business_ids in zip(evaluated_users['user_id'], evaluated_users['business_id']):
    for business_id in business_ids:
        actual_rating = test_data_dict.get((user_id, business_id))

        # A missing pair and a NaN rating both mean there is nothing to
        # compare against; check this first to skip the database lookups.
        if actual_rating is None or pd.isna(actual_rating):
            null_count += 1
            unrated_count += 1
            continue

        if actual_rating >= 4:
            postivie_count += 1
        else:
            negative_count += 1

        predicted_rating = get_business_interest(user_id, business_id, conn)

        # -1 is the "no prediction possible" sentinel of get_business_interest
        if predicted_rating == -1:
            unrated_count += 1
            continue

        # Convert ratings to binary labels (positive: 1, negative: 0)
        predicted_labels.append(predicted_rating >= 4)
        actual_labels.append(actual_rating >= 4)

# Convert to NumPy arrays for the metric functions
predicted_labels = np.array(predicted_labels, dtype=np.int8)
actual_labels = np.array(actual_labels, dtype=np.int8)


In [57]:

# Compute the metrics only when at least one pair was actually evaluated
if len(predicted_labels) > 0:
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels)
    recall = recall_score(actual_labels, predicted_labels)
    f1 = f1_score(actual_labels, predicted_labels)
    f_beta = fbeta_score(actual_labels, predicted_labels, beta=2)  # Adjust beta as needed

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs, so
    # the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(actual_labels, predicted_labels, labels=[0, 1]).ravel()
else:
    # No valid predictions: define every value that is printed below
    accuracy = precision = recall = f1 = f_beta = 0
    tn = fp = fn = tp = 0

Total = len(predicted_labels) + unrated_count
# When Total is 0 every count is 0 as well, so dividing by 1 yields 0.00%
denominator = max(Total, 1)

# Print results
print("Model Evaluation Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"F-beta (β=2): {f_beta:.4f}")
print(f"Unrated items (predicted -1): {unrated_count} ({unrated_count / denominator:.2%})")

# Print confusion matrix values
print("\nConfusion Matrix Breakdown:")
print(f"True Positives (TP):  {tp}")   # Model correctly predicted positive
print(f"True Negatives (TN):  {tn}")   # Model correctly predicted negative
print(f"False Positives (FP): {fp}")   # Model incorrectly predicted positive
print(f"False Negatives (FN): {fn}")   # Model incorrectly predicted negative

print("\nReview Breakdown:")
print(f"Positive reviews: {postivie_count} ({postivie_count / denominator:.2%})")
print(f"Negative reviews: {negative_count} ({negative_count / denominator:.2%})")
print(f"Null reviews: {null_count} ({null_count / denominator:.2%})")

Model Evaluation Metrics:
Accuracy:  0.9104
Precision: 0.9643
Recall:    0.9039
F1-score:  0.9331
F-beta (β=2): 0.9154
Unrated items (predicted -1): 1 (0.04%)

Confusion Matrix Breakdown:
True Positives (TP):  1646
True Negatives (TN):  751
False Positives (FP): 61
False Negatives (FN): 175

Review Breakdown:
Positive reviews: 1821 (69.13%)
Negative reviews: 813 (30.87%)
Null reviews: 0 (0.00%)


In [41]:
# Release the SQLite connection to the ItemCF index. The lookup helpers that
# take this connection cannot be called with it after this point.
conn.close()