### Uniform Recommendation Baseline,
    This notebook implements a Popular Recommendation baseline model for the Yelp dataset. It recommends a set of uniform businesses to each user and evaluates the performance using retrieval metrics.
    
#### Pre-requisites,
- The Yelp dataset is loaded from the processed data folder (`../../data/processed_data/yelp_data/`).

In [1]:
import sys
sys.path.append('../')
from utilities import *

In [2]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary
yelp_data = load_data_from_db(db_folder, data_files)

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
df_business = yelp_data["business"]
df_review = yelp_data["review"]

user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)

In [4]:
# Merge business and review data on 'business_id'
df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))
# Select columns needed for recommendation
user_business = df_concat[["user_id", "business_id", "stars_review"]]

In [5]:
# Split data into training and test sets (80% train, 20% test)
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [6]:
# Get all unique business IDs from the training data
all_business_ids = train_data['business_id'].unique()

In [7]:
# Predefine a fixed uniform set of recommendations
# For simplicity, randomly select k items once and use this set for all users
k_recommendations = 300  # Match your ItemCF setup
np.random.seed(42)  # For reproducibility
uniform_set = np.random.choice(all_business_ids, size=k_recommendations, replace=False)
uniform_scores = np.ones(k_recommendations) / k_recommendations  # Equal probability scores

In [8]:
# Function to generate uniform set recommendations for a user
def predict_uniform_interests(user_id, k=k_recommendations, uniform_set=uniform_set, uniform_scores=uniform_scores):
    # Return the same fixed set of k business_ids with uniform scores
    k = min(k, len(uniform_set))
    recommended_businesses = uniform_set[:k]
    scores = uniform_scores[:k]
    return list(zip(recommended_businesses, scores))

In [9]:
positive_reviews = test_data[test_data['stars_review'] >= 4]
negative_reviews = test_data[test_data['stars_review'] <= 2]

In [10]:
print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 136473
Number of negative reviews: 32929
Total number of reviews: 197147
Ratio of positive to negative reviews: 4.14


In [11]:
# Group test data by user_id
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

In [12]:
# Generate uniform set recommendations for each user
recommendations = {}
max_users = min(1000, len(test_data_grouped))  # Limit to 1000 users for efficiency

In [13]:
for i in range(max_users):
    user_id = test_data_grouped['user_id'].iloc[i]
    recommendation = predict_uniform_interests(user_id, k=k_recommendations)
    business_ids, scores = zip(*recommendation)
    recommendations[user_id] = (list(business_ids), list(scores))

In [14]:
# Function to evaluate recommendations
def check_recommendations(recommendations, test_data_grouped, pos=4):
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for i, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            for business_id in business_ids:
                star_rating = test_data[(test_data['user_id'] == user_id) & 
                                       (test_data['business_id'] == business_id)]['stars_review'].values[0]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    rank = recommended_businesses.index(business_id) + 1
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [15]:
# Evaluate the uniform set recommendations
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(recommendations, test_data_grouped)

In [16]:
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [17]:
print("Testing Data Statistics")
display(background_stats)

print("Evaluation Metrics")
display(evaluation_metric)

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1821,813,2634,0.691344


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.3098,0.6364,0.0038,0.0076,0.0048,0.0172


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,7,809,4,1814


In [18]:
# Calculate a relevance score (e.g., fraction of positive reviews) for each business
relevance = train_data.groupby('business_id').apply(
    lambda x: len(x[x['stars_review'] >= 4]) / len(x)
).reset_index(name='positive_ratio')

  relevance = train_data.groupby('business_id').apply(


In [19]:
# Filter businesses with a high positive ratio (e.g., > 0.5)
relevant_businesses = relevance[relevance['positive_ratio'] > 0.5]['business_id'].values

In [20]:
# Predefine an optimized uniform set from relevant businesses
k_recommendations = 300
np.random.seed(42)
optimized_uniform_set = np.random.choice(relevant_businesses, size=min(k_recommendations, len(relevant_businesses)), replace=False)
optimized_uniform_scores = np.ones(len(optimized_uniform_set)) / len(optimized_uniform_set)

In [21]:
# Modified function for optimized uniform recommendations
def predict_optimized_uniform_interests(user_id, k=k_recommendations, uniform_set=optimized_uniform_set, uniform_scores=optimized_uniform_scores):
    k = min(k, len(uniform_set))
    recommended_businesses = uniform_set[:k]
    scores = uniform_scores[:k]
    return list(zip(recommended_businesses, scores))

In [22]:
# Replace the original recommendation generation with this optimized version
recommendations = {}
for i in range(max_users):
    user_id = test_data_grouped['user_id'].iloc[i]
    recommendation = predict_optimized_uniform_interests(user_id, k=k_recommendations)
    business_ids, scores = zip(*recommendation)
    recommendations[user_id] = (list(business_ids), list(scores))

In [23]:
# Evaluate the uniform set recommendations
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(recommendations, test_data_grouped)

In [24]:
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [25]:
print("Testing Data Statistics")
display(background_stats)

print("Evaluation Metrics")
display(evaluation_metric)

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1821,813,2634,0.691344


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.3132,0.8333,0.0082,0.0163,0.0103,0.0416


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,15,810,3,1806
