### Item-based Collaborative Filtering - Retrieval Evaluation
This notebook is used to simulate the performance of the item-based collaborative filtering algorithm in a retrieval setting. 

#### Pre-requisites
- The model is trained and the index is created in the notebook `ItemCF Model & Evaluation.ipynb`.
- The index is saved in the file `yelp_ItemCF.db` in the same directory as this notebook.

In [26]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [27]:
# Folder holding the processed Yelp SQLite files, and the databases to open from it
db_folder = '../../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']
# Full path of each database: folder prefix followed by the file name
db_paths = [f"{db_folder}{file_name}" for file_name in db_files]

In [28]:
# Connect to the databases and load data
def load_data_from_db(paths=None):
    """Load the Yelp tables from the SQLite databases into DataFrames.

    Parameters
    ----------
    paths : sequence of str, optional
        Database file paths. The first entry must contain the
        ``business_details`` and ``business_categories`` tables, the second
        the ``review_data`` table. Defaults to the notebook-level ``db_paths``.

    Returns
    -------
    dict
        Maps ``'business'``, ``'categories'`` and ``'review'`` to the
        DataFrame read from the corresponding table.
    """
    if paths is None:
        paths = db_paths

    data = {}
    conns = []
    try:
        # Open connections inside the try block so that a failure on a later
        # path still closes the connections that were already opened.
        for path in paths:
            conns.append(sqlite3.connect(path))

        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        # Close all database connections, including on error
        for conn in conns:
            conn.close()
    return data

In [29]:
# Read every table from the SQLite files into a dict of DataFrames
yelp_data = load_data_from_db()

# Report how many rows each table contributed
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

df_business, df_review = yelp_data["business"], yelp_data["review"]

# Outer join on business_id: rows without a partner on the other side are kept
# with NaN in the missing columns. Columns present in both tables get the
# '_business' / '_review' suffixes.
df_concat = pd.merge(
    df_business,
    df_review,
    on='business_id',
    how='outer',
    suffixes=('_business', '_review'),
)

# Keep only the (user, business, review rating) triple
user_business = df_concat[["user_id", "business_id", "stars_review"]]

# business_id -> position, in order of first appearance in the merged frame.
# NOTE(review): this order is used later to decode similarity-vector indices,
# so it presumably has to match the mapping built when the index was created -- confirm.
business_mapping = {}
for idx, biz in enumerate(user_business['business_id'].unique()):
    business_mapping[biz] = idx

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [37]:
# Open the item-based CF index stored next to this notebook.
# NOTE: sqlite3.connect creates an empty database file when the path does not
# exist, so a missing index only shows up later as a "no such table" error.
db_path = './yelp_ItemCF.db'
conn = sqlite3.connect(database=db_path)

In [38]:
# Function to get businesses a user interacted with
def get_user_businesses(user_id, conn):
    """Fetch the rows stored for ``user_id`` in the ``user_item_index`` table.

    Parameters
    ----------
    user_id : the user to look up.
    conn : an open ``sqlite3`` connection to the index database.

    Returns
    -------
    list of (business_id, stars_review) tuples; empty when the user has no rows.
    """
    query = '''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?'''
    # Cursor.execute returns the cursor itself, so the calls can be chained
    rows = conn.cursor().execute(query, (user_id,))
    return rows.fetchall()

In [39]:
# Function to get top-k similar businesses for a given business
def get_top_k_similar_businesses(business_id, k, conn, index_to_business=None):
    """Return the ``k`` businesses most similar to ``business_id``.

    Parameters
    ----------
    business_id : key looked up in the ``item_id`` column of
        ``item_item_similarity``.
    k : int
        Maximum number of neighbours to return.
    conn : an open ``sqlite3`` connection to the index database.
    index_to_business : sequence, optional
        Position -> business id lookup used to decode the stored indices.
        Defaults to the keys of the notebook-level ``business_mapping`` in
        insertion order.

    Returns
    -------
    list of (business_id, score) tuples sorted by descending score; empty
    when ``business_id`` has no row in the table.
    """
    cursor = conn.cursor()
    cursor.execute('''SELECT similarity_vector FROM item_item_similarity WHERE item_id = ?''', (business_id,))
    result = cursor.fetchone()

    if result is None:
        return []

    # SECURITY NOTE: pickle.loads can execute arbitrary code; only use this
    # with an index database produced by a trusted source.
    similarity_vector = pickle.loads(result[0])
    indices, data = similarity_vector

    # Get top-k similar businesses (ties keep their stored order)
    top_k = sorted(zip(indices, data), key=lambda x: -x[1])[:k]

    # Build the position -> id lookup once per call; the previous version
    # rebuilt the full key list for every one of the k results.
    if index_to_business is None:
        index_to_business = list(business_mapping)

    # Map indices to business ids
    return [(index_to_business[idx], score) for idx, score in top_k]

In [40]:
# Function to predict user interests based on similar businesses
def predict_user_interests(user_id, k=10, conn=conn):
    """Recommend up to ``k`` businesses for ``user_id``.

    For every business the user has a row for, the ``k`` most similar
    businesses are fetched and their similarity scores are summed per
    candidate. Candidates are returned by descending total score.

    Parameters
    ----------
    user_id : the user to recommend for.
    k : int
        Number of neighbours fetched per business and the maximum number of
        recommendations returned.
    conn : sqlite3 connection; the default is the notebook-level ``conn``
        as it was when this function was defined.

    Returns
    -------
    list of (business_id, total_score) tuples, at most ``k`` long.
    """
    score_totals = {}
    for seen_business_id, _ in get_user_businesses(user_id, conn):
        neighbours = get_top_k_similar_businesses(seen_business_id, k, conn)
        for candidate_id, similarity in neighbours:
            try:
                score_totals[candidate_id] += similarity
            except KeyError:
                # First time this candidate shows up
                score_totals[candidate_id] = similarity

    # Highest accumulated score first
    ranked = sorted(score_totals.items(), key=lambda pair: -pair[1])
    return ranked[:k]

In [41]:
# Split the held-out reviews by rating: 4 stars and up count as positive,
# 2 stars and below as negative (rows in between fall into neither group)
positive_reviews = test_data.loc[test_data['stars_review'].ge(4)]
negative_reviews = test_data.loc[test_data['stars_review'].le(2)]

# Report the class sizes and their ratio
print(
    f"Number of positive reviews: {len(positive_reviews)}",
    f"Number of negative reviews: {len(negative_reviews)}",
    f"Total number of reviews: {len(test_data)}",
    f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}",
    sep="\n",
)


Number of positive reviews: 136473
Number of negative reviews: 32929
Total number of reviews: 197147
Ratio of positive to negative reviews: 4.14


In [42]:
def balance_test_data(positive_reviews, negative_reviews):
    """Build a shuffled test set with equally many positive and negative reviews.

    The larger class is down-sampled (without replacement, seed 42) to the
    size of the smaller one. The two classes are then concatenated and
    shuffled (seed 42), and summary statistics are printed.

    Parameters
    ----------
    positive_reviews : pandas.DataFrame
        Reviews treated as positive; must have a ``stars_review`` column.
    negative_reviews : pandas.DataFrame
        Reviews treated as negative; must have a ``stars_review`` column.

    Returns
    -------
    pandas.DataFrame
        The balanced, shuffled reviews with a fresh 0..n-1 index.
    """
    # Down-sample whichever class is larger. Sampling only the positives
    # raised ValueError whenever negatives outnumbered positives.
    target_size = min(len(positive_reviews), len(negative_reviews))
    positive_part = positive_reviews.sample(n=target_size, random_state=42)
    if len(negative_reviews) > target_size:
        negative_part = negative_reviews.sample(n=target_size, random_state=42)
    else:
        # Already the smaller (or equal) class: keep it untouched
        negative_part = negative_reviews

    # combine the two classes
    balanced_test_data = pd.concat([positive_part, negative_part], ignore_index=True)

    # shuffle the balanced test data
    balanced_test_data = balanced_test_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # new statistics for the balanced test data
    n_positive = int((balanced_test_data['stars_review'] >= 4).sum())
    n_negative = int((balanced_test_data['stars_review'] <= 2).sum())
    # Avoid ZeroDivisionError when there are no negative reviews at all
    ratio = n_positive / n_negative if n_negative else float('nan')

    print(f"Number of positive reviews: {n_positive}")
    print(f"Number of negative reviews: {n_negative}")
    print(f"Total number of reviews: {len(balanced_test_data)}")
    print(f"Ratio of positive to negative reviews: {ratio:.2f}")
    return balanced_test_data

In [51]:
# Balance the held-out reviews (comment out the next line to evaluate on the original test data)
test_data = balance_test_data(positive_reviews, negative_reviews)

# One row per user, holding the list of businesses that user reviewed in the test set
test_data_grouped = (
    test_data
    .groupby('user_id')['business_id']
    .apply(list)
    .reset_index()
)

# user_id -> (recommended business ids, matching scores), both in ranked order
recommendations = {}

# Only the first 1000 users are scored; i counts the users processed so far
i = 0
for i, user_id in enumerate(test_data_grouped['user_id'], start=1):
    recommendation = predict_user_interests(user_id, k=300, conn=conn)
    business_ids = [business_id for business_id, _ in recommendation]
    scores = [score for _, score in recommendation]
    recommendations[user_id] = (business_ids, scores)
    if i == 1000:
        break



Number of positive reviews: 32929
Number of negative reviews: 32929
Total number of reviews: 65858
Ratio of positive to negative reviews: 1.00


In [52]:
def check_recommendations(recommendations, test_data_grouped, pos=4, ratings=None):
    """Compare recommendations against held-out ratings and count the outcomes.

    A test business counts as actually positive when its rating is ``>= pos``
    and as predicted positive when it appears in the user's recommendation
    list. Only users present in ``recommendations`` contribute to the counts.

    Parameters
    ----------
    recommendations : dict
        Maps user_id to a ``(business_ids, scores)`` tuple; only the ranked
        ``business_ids`` list is read here.
    test_data_grouped : pandas.DataFrame
        One row per user with columns ``user_id`` and ``business_id`` (the
        list of businesses that user has in the test set).
    pos : number
        Minimum star rating treated as positive.
    ratings : pandas.DataFrame, optional
        Source of the star ratings, with columns ``user_id``, ``business_id``
        and ``stars_review``. Defaults to the notebook-level ``test_data``.
        When a (user, business) pair occurs more than once, the first row wins.

    Returns
    -------
    tuple
        ``(true_positive, true_negative, false_positive, false_negative,
        total, total_positive, ranks)``. ``ranks`` has one entry per row of
        ``test_data_grouped``: the 1-based position of the last test business
        (in list order) found in the recommendations, or 0 when none was
        found or the user has no recommendations.

    Raises
    ------
    KeyError
        If a (user, business) pair from ``test_data_grouped`` is missing
        from ``ratings``.
    """
    if ratings is None:
        # Backward-compatible default: the notebook-level test set
        ratings = test_data

    # One pass over the ratings replaces a full DataFrame scan per
    # (user, business) pair. setdefault keeps the first rating per pair.
    star_lookup = {}
    for uid, bid, stars in zip(ratings['user_id'], ratings['business_id'], ratings['stars_review']):
        if uid in recommendations:
            star_lookup.setdefault((uid, bid), stars)

    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for user_id, business_ids in zip(test_data_grouped['user_id'], test_data_grouped['business_id']):
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            # business id -> 1-based position of its first occurrence
            positions = {}
            for position, recommended_id in enumerate(recommended_businesses, start=1):
                positions.setdefault(recommended_id, position)

            for business_id in business_ids:
                star_rating = star_lookup[(user_id, business_id)]
                if star_rating >= pos:
                    total_positive += 1
                hit_position = positions.get(business_id)
                if hit_position is not None:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    # remember the rank of the business in the recommendations
                    rank = hit_position
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [53]:
# Score the cached recommendations against the held-out reviews
evaluation_counts = check_recommendations(recommendations, test_data_grouped)
(
    true_positive,
    true_negative,
    false_positive,
    false_negative,
    total,
    total_positive,
    ranks,
) = evaluation_counts


In [55]:
# Calculate evaluation metrics (each guarded against an empty denominator)
accuracy = (true_positive + true_negative) / float(total) if total > 0 else 0
precision = true_positive / float(true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
recall = true_positive / float(total_positive) if total_positive > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Mean Reciprocal Rank (MRR), averaged over the entries of `ranks` that are > 0.
# Entries equal to 0 are skipped rather than counted as a reciprocal rank of 0.
# The emptiness check is done on the filtered list: np.mean([]) would
# otherwise return NaN (with a RuntimeWarning) when no rank is positive.
reciprocal_ranks = [1 / rank for rank in ranks if rank is not None and rank > 0]
mean_reciprocal_rank = np.mean(reciprocal_ranks) if reciprocal_ranks else 0

# Weighted F-beta score (beta > 1 weights recall more heavily than precision)
beta = 1.5
f_beta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall) if (beta**2 * precision + recall) > 0 else 0

# Compute dataset statistics
total_negative = total - total_positive if total > 0 else 0
background_stats = pd.DataFrame({
    'Total Positive': [total_positive],
    'Total Negative': [total_negative],
    'Total': [total],
    'Ratio': [total_positive / float(total) if total > 0 else 0],
})

print("Testing Data Statistics")
display(background_stats)

# Evaluation Metrics, rounded to 4 decimals for display
evaluation_metric = pd.DataFrame({
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1_score],
    'F-beta Score': [f_beta],
    'Mean Reciprocal Rank': [mean_reciprocal_rank],
}).round(4)

print("Evaluation Metrics")
display(evaluation_metric)

# Confusion Matrix
confusion_matrix = pd.DataFrame({
    'True Positive': [true_positive],
    'True Negative': [true_negative],
    'False Positive': [false_positive],
    'False Negative': [false_negative]
})

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,832,840,1672,0.497608


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.558,0.6332,0.2656,0.3743,0.3234,0.0719


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,221,712,128,611
