### Popular Recommendation Baseline,
    This notebook implements a Popular Recommendation baseline model for the Yelp dataset. It recommends a set of popular businesses to each user and evaluates the performance using retrieval metrics.
    
#### Pre-requisites,
- The Yelp dataset is loaded from the processed data folder (`../../data/processed_data/yelp_data/`).

In [1]:
import sys
sys.path.append('../')
from utilities import *

In [2]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary
yelp_data = load_data_from_db(db_folder, data_files)

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
df_business = yelp_data["business"]
df_review = yelp_data["review"]

user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)

In [4]:
# Merge business and review data on 'business_id'
df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))
# Select columns needed for recommendation
user_business = df_concat[["user_id", "business_id", "stars_review"]]

In [5]:
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [6]:
# Calculate popularity based on weighted positive reviews
# Compute the number of positive reviews (stars >= 4) and total reviews per business
popularity = train_data.groupby('business_id').agg(
    positive_reviews=('stars_review', lambda x: (x >= 4).sum()),  # Count of positive reviews
    total_reviews=('stars_review', 'count')  # Total review count
).reset_index()

# Calculate a weighted popularity score: positive_reviews / total_reviews
# This gives higher weight to businesses with a higher proportion of positive reviews
# Add a small constant (e.g., 1) to total_reviews to avoid division by zero (though unlikely here)
popularity['weighted_score'] = popularity['positive_reviews'] / (popularity['total_reviews'] + 1)
# Optionally, multiply by total_reviews to favor businesses with more reviews (uncomment if desired)
# popularity['weighted_score'] = (popularity['positive_reviews'] / (popularity['total_reviews'] + 1)) * np.log1p(popularity['total_reviews'])

# Sort by weighted score in descending order
popular_items = popularity.sort_values(by='weighted_score', ascending=False)
popular_business_ids = popular_items['business_id'].values
popularity_scores = popular_items['weighted_score'].values

In [7]:
# Function to generate popular item recommendations for a user
def predict_popular_interests(user_id, k=10, popular_business_ids=popular_business_ids, popularity_scores=popularity_scores):
    # Return the top k popular business_ids with their weighted scores
    k = min(k, len(popular_business_ids))
    recommended_businesses = popular_business_ids[:k]
    scores = popularity_scores[:k]
    return list(zip(recommended_businesses, scores))

In [8]:
# Calculate test data statistics for reference
positive_reviews = test_data[test_data['stars_review'] >= 4]  # Positive: 4 or 5 stars
negative_reviews = test_data[test_data['stars_review'] < 4]  # Negative: 1 or 2 stars

In [9]:
print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 136473
Number of negative reviews: 59624
Total number of reviews: 197147
Ratio of positive to negative reviews: 2.29


In [10]:
# Group test data by user_id
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

In [11]:
# Generate popular item recommendations for each user
recommendations = {}
k_recommendations = 300  # Number of recommendations per user
max_users = min(1000, len(test_data_grouped))  # Limit to 1000 users for efficiency

for i in range(max_users):
    user_id = test_data_grouped['user_id'].iloc[i]
    recommendation = predict_popular_interests(user_id, k=k_recommendations)
    business_ids, scores = zip(*recommendation)
    recommendations[user_id] = (list(business_ids), list(scores))

In [12]:
def check_recommendations(recommendations, test_data_grouped, pos=4):
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for i, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            for business_id in business_ids:
                star_rating = test_data[(test_data['user_id'] == user_id) & 
                                       (test_data['business_id'] == business_id)]['stars_review'].values[0]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    rank = recommended_businesses.index(business_id) + 1
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [13]:
# Evaluate the popular item recommendations
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(recommendations, test_data_grouped)

In [14]:
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [15]:
print("Testing Data Statistics")
display(background_stats)

print("Evaluation Metrics")
display(evaluation_metric)

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1821,813,2634,0.691344


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.3166,0.8889,0.0132,0.026,0.0164,0.0116


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,24,810,3,1797
