### Uniform Recommendation Baseline,
    This notebook implements a Uniform Recommendation baseline model for the Yelp dataset. It divides users into clusters, recommends a set of uniform businesses to each cluster and evaluates the performance using retrieval metrics.
    
#### Pre-requisites,
- The Yelp dataset is loaded from the processed data folder (`../../data/processed_data/yelp_data/`).

In [1]:
import sys
sys.path.append('../')
from utilities import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from IPython.display import display

In [2]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary
yelp_data = load_data_from_db(db_folder, data_files)

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
df_business = yelp_data['business'] # Business details
df_categories = yelp_data['categories'] # Business categories
df_review = yelp_data['review'] # Review data

In [4]:
df_review['label'] = df_review['stars'].apply(lambda x: 1 if x > 4 else 0)

In [5]:
# Merge review and business data
df_review_merged = df_review.merge(df_business, on='business_id', how='left', suffixes=('_review', '_business'))

In [6]:
# Prepare category features (top 5 categories as binary indicators)
top_categories = df_categories['category'].value_counts().index[:5]
for cat in top_categories:
    df_review_merged[f'category_{cat}'] = df_review_merged['business_id'].isin(
        df_categories[df_categories['category'] == cat]['business_id']
    ).astype(int)

In [7]:
# Define features for clustering with corrected column names
features = ['stars_review', 'useful', 'funny', 'cool', 'latitude', 'longitude', 
            'stars_business', 'review_count'] + [f'category_{cat}' for cat in top_categories]

In [8]:
# Aggregate features at the user level (mean values across all reviews per user)
user_features = df_review_merged.groupby('user_id')[features].mean().reset_index()
print(f"Number of unique users: {len(user_features)}")

Number of unique users: 162079


In [13]:
# Split data into training and test sets
train_data, test_data = train_test_split(df_review_merged, test_size=0.2, random_state=42)
print(f"Number of reviews in train_data: {len(train_data)}")

Number of reviews in train_data: 784334


In [17]:
# Prepare feature matrix for clustering (using training data)
X = user_features[features].fillna(0).values  # Fill NA with 0 for simplicity
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Shape of X (user features): {X.shape}")

Shape of X (user features): (162079, 13)


In [18]:
# Apply PCA (10 components explaining 90% variance)
pca = PCA(n_components=10)
X_pca_reduced = pca.fit_transform(X_scaled)
print(f"Shape of X_pca_reduced: {X_pca_reduced.shape}")

Shape of X_pca_reduced: (162079, 10)


In [19]:
# Perform HDBSCAN clustering on users
hdbscan = HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_epsilon=0.3, metric='euclidean')
cluster_labels = hdbscan.fit_predict(X_pca_reduced.astype(np.float32))
print(f"Length of cluster_labels: {len(cluster_labels)}")

user_features['cluster'] = cluster_labels



Length of cluster_labels: 162079


In [20]:
# Merge cluster labels back to train_data and test_data
train_data = train_data.merge(user_features[['user_id', 'cluster']], on='user_id', how='left')
test_data = test_data.merge(user_features[['user_id', 'cluster']], on='user_id', how='left')

In [21]:
# Define fixed recommendation sets per cluster
k_recommendations = 300  # Number of recommendations per cluster
all_business_ids = train_data['business_id'].unique()
cluster_recommendations = {}
np.random.seed(42)

In [22]:
for cluster_id in set(cluster_labels):
    if cluster_id == -1:  # Skip noise points for now
        continue
    # Get reviews for users in this cluster
    cluster_data = train_data[train_data['cluster'] == cluster_id]
    # Calculate popularity within the cluster (weighted by positive reviews)
    cluster_popularity = cluster_data[cluster_data['stars_review'] >= 4].groupby('business_id').agg(
        positive_reviews=('stars_review', 'count'),
        total_reviews=('stars_review', 'size')
    ).reset_index()
    cluster_popularity['weighted_score'] = cluster_popularity['positive_reviews'] / (cluster_popularity['total_reviews'] + 1)
    cluster_popularity = cluster_popularity.sort_values(by='weighted_score', ascending=False)
    top_businesses = cluster_popularity['business_id'].values[:k_recommendations]
    if len(top_businesses) < k_recommendations:
        remaining = k_recommendations - len(top_businesses)
        available = np.setdiff1d(all_business_ids, top_businesses)
        extra_businesses = np.random.choice(available, size=remaining, replace=False)
        top_businesses = np.concatenate([top_businesses, extra_businesses])
    scores = np.ones(k_recommendations) / k_recommendations  # Uniform scores
    cluster_recommendations[cluster_id] = (top_businesses, scores)

In [23]:
# Handle noise points (-1 cluster) with a default recommendation set
noise_popularity = train_data[train_data['cluster'] == -1].groupby('business_id').agg(
    positive_reviews=('stars_review', lambda x: (x >= 4).sum()),
    total_reviews=('stars_review', 'count')
).reset_index()
noise_popularity['weighted_score'] = noise_popularity['positive_reviews'] / (noise_popularity['total_reviews'] + 1)
noise_popularity = noise_popularity.sort_values(by='weighted_score', ascending=False)
noise_businesses = noise_popularity['business_id'].values[:k_recommendations]
if len(noise_businesses) < k_recommendations:
    remaining = k_recommendations - len(noise_businesses)
    available = np.setdiff1d(all_business_ids, noise_businesses)
    extra_businesses = np.random.choice(available, size=remaining, replace=False)
    noise_businesses = np.concatenate([noise_businesses, extra_businesses])
noise_scores = np.ones(k_recommendations) / k_recommendations
cluster_recommendations[-1] = (noise_businesses, noise_scores)

In [24]:
# Function to generate uniform recommendations based on user cluster
def predict_uniform_cluster_interests(user_id, k=k_recommendations, user_features=user_features, cluster_recommendations=cluster_recommendations):
    if user_id in user_features['user_id'].values:
        cluster_id = user_features[user_features['user_id'] == user_id]['cluster'].values[0]
    else:
        cluster_id = -1  # Default to noise cluster for unseen users
    recommended_businesses, scores = cluster_recommendations[cluster_id]
    k = min(k, len(recommended_businesses))
    return list(zip(recommended_businesses[:k], scores[:k]))

In [25]:
# Calculate test data statistics
positive_reviews = test_data[test_data['stars_review'] >= 4]
negative_reviews = test_data[test_data['stars_review'] <= 2]
print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 135929
Number of negative reviews: 33369
Total number of reviews: 196084
Ratio of positive to negative reviews: 4.07


In [26]:
# Group test data by user_id
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

In [27]:
# Generate recommendations
recommendations = {}
max_users = min(1000, len(test_data_grouped))  # Limit to 1000 users

for i in range(max_users):
    user_id = test_data_grouped['user_id'].iloc[i]
    recommendation = predict_uniform_cluster_interests(user_id, k=k_recommendations)
    business_ids, scores = zip(*recommendation)
    recommendations[user_id] = (list(business_ids), list(scores))

In [28]:
def check_recommendations(recommendations, test_data_grouped, pos=4):
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for i, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            for business_id in business_ids:
                star_rating = test_data[(test_data['user_id'] == user_id) & 
                                       (test_data['business_id'] == business_id)]['stars_review'].values[0]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    rank = recommended_businesses.index(business_id) + 1
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [29]:
# Evaluate recommendations
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(recommendations, test_data_grouped)

In [30]:
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [31]:
print("Testing Data Statistics")
display(background_stats)

print("Evaluation Metrics")
display(evaluation_metric)

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1868,850,2718,0.68727


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.3447,0.8783,0.0541,0.1019,0.0666,0.1996


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,101,836,14,1767


In [32]:
user_features.to_csv("clustered_user_data.csv", index=False)