### Uniform Recommendation Baseline
This notebook implements a UserCF-with-clustering model for the Yelp dataset. It divides users into clusters, recommends a uniform set of businesses to each cluster using UserCF, and evaluates the performance using retrieval metrics.
    
#### Prerequisites
- The Yelp dataset is loaded from the processed data folder (`../../data/processed_data/yelp_data/`).

In [24]:
import sys
sys.path.append('../')
from utilities import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from IPython.display import display
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed

In [2]:
# Define the database folder path and the names of the tables to load.
# The path is relative to the notebook's working directory.
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary keyed by table name.
# NOTE(review): load_data_from_db is not defined in this notebook; presumably it
# comes from the `from utilities import *` wildcard import — confirm.
yelp_data = load_data_from_db(db_folder, data_files)

# Sanity check: report how many rows each loaded table contains.
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
# Pull the three loaded tables out of the dictionary in one step:
# business details, business categories, and review data (in that order).
df_business, df_categories, df_review = (
    yelp_data[table_name] for table_name in ('business', 'categories', 'review')
)

In [None]:
# Binary label per review: 1 when the star rating is strictly greater than 4, else 0.
# NOTE(review): the evaluation cell treats `stars >= 4` as positive (pos=4); this
# column uses `> 4`. It is not read anywhere else in this notebook — confirm intent.
df_review['label'] = (df_review['stars'] > 4).astype('int64')

In [None]:
# Merge review and business data.
# Left join keeps every review row even if its business_id has no match in df_business.
# Columns present in both tables receive the '_review' / '_business' suffixes; later
# cells rely on 'stars_review' and 'stars_business' being produced this way.
df_review_merged = df_review.merge(df_business, on='business_id', how='left', suffixes=('_review', '_business'))

In [None]:
# Category features: one 0/1 indicator column per category, for the five most
# frequent categories in df_categories.
top_categories = df_categories['category'].value_counts().index[:5]
for cat in top_categories:
    # business_ids tagged with this category
    businesses_in_category = df_categories.loc[df_categories['category'] == cat, 'business_id']
    # 1 when the reviewed business carries the category, 0 otherwise
    is_in_category = df_review_merged['business_id'].isin(businesses_in_category)
    df_review_merged[f'category_{cat}'] = is_in_category.astype(int)

In [None]:
# Columns used to describe a user for clustering: review-level signals, business
# location and popularity, plus the category indicator columns built above.
# 'stars_review' / 'stars_business' are the suffixed names produced by the merge.
features = [
    'stars_review', 'useful', 'funny', 'cool',
    'latitude', 'longitude',
    'stars_business', 'review_count',
    *(f'category_{cat}' for cat in top_categories),
]

In [None]:
# One row per user: the mean of every clustering feature over that user's reviews.
# NOTE(review): this aggregates over ALL reviews in df_review_merged, i.e. before
# the train/test split performed in a later cell, so test-set ratings influence
# the user profiles (and therefore the clusters) — confirm this is acceptable.
user_features = df_review_merged.groupby('user_id', as_index=False)[features].mean()
print(f"Number of unique users in user_features: {len(user_features)}")

Number of unique users in user_features: 162079


In [14]:
# Split the merged review rows into training (80%) and test (20%) sets.
# The split is over individual reviews, so the same user can appear in both sets;
# random_state fixes the shuffle so the split is reproducible.
# NOTE(review): train_test_split is not imported explicitly in this notebook;
# presumably it arrives via `from utilities import *` — confirm.
train_data, test_data = train_test_split(df_review_merged, test_size=0.2, random_state=42)
print(f"Number of rows in train_data: {len(train_data)}")

Number of rows in train_data: 784334


In [15]:
# Prepare the feature matrix for clustering: one row per user in user_features,
# missing values replaced by 0, then each column standardized with StandardScaler.
# NOTE(review): user_features was aggregated from df_review_merged (all reviews),
# not from train_data, so this matrix is NOT restricted to training data.
X = user_features[features].fillna(0).values
print(f"Shape of X for clustering: {X.shape}")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Shape of X for clustering: (162079, 13)


In [16]:
# Apply PCA: project the scaled user features onto the first 10 principal components.
# random_state makes the fit reproducible when scikit-learn selects a randomized
# SVD solver; it has no effect on the deterministic solvers.
pca = PCA(n_components=10, random_state=42)
X_pca_reduced = pca.fit_transform(X_scaled)
print(f"Shape of X_pca_reduced: {X_pca_reduced.shape}")
# Report the variance actually retained instead of assuming a fixed percentage.
print(f"Cumulative explained variance: {pca.explained_variance_ratio_.sum():.3f}")

Shape of X_pca_reduced: (162079, 10)


In [17]:
# Perform HDBSCAN clustering on users (one label per row of X_pca_reduced).
# The input is cast to float32 before fitting.
# NOTE(review): HDBSCAN presumably labels noise points as -1; downstream code
# compares cluster ids for equality, so all such users would be treated as one
# shared "cluster" — confirm this is intended.
hdbscan = HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_epsilon=0.3, metric='euclidean')
user_clusters = hdbscan.fit_predict(X_pca_reduced.astype(np.float32))
print(f"Length of user_clusters: {len(user_clusters)}")  # Should equal len(user_features), one label per user
# Labels are positional, so this assignment relies on user_features keeping the
# same row order that was used to build X.
user_features['cluster'] = user_clusters



Length of user_clusters: 162079


In [18]:
# Merge cluster labels back to review data (left join: every review row is kept).
# Any 'cluster' column left over from a previous run of this cell is dropped first;
# otherwise re-running would produce 'cluster_x' / 'cluster_y' instead of 'cluster'.
cluster_lookup = user_features[['user_id', 'cluster']]
train_data = train_data.drop(columns='cluster', errors='ignore').merge(cluster_lookup, on='user_id', how='left')
test_data = test_data.drop(columns='cluster', errors='ignore').merge(cluster_lookup, on='user_id', how='left')

In [21]:
# Create sparse user-item matrix for training data.
# Row i is users[i], column j is businesses[j]; the two maps translate ids to positions.
users = train_data['user_id'].unique()
businesses = train_data['business_id'].unique()
user_id_map = {uid: i for i, uid in enumerate(users)}
business_id_map = {bid: i for i, bid in enumerate(businesses)}

# A user may review the same business more than once. csr_matrix SUMS duplicate
# (row, col) entries, which would turn two 5-star reviews into a "rating" of 10,
# so collapse repeats to one mean rating per (user, business) pair first.
pair_ratings = (
    train_data
    .groupby(['user_id', 'business_id'], sort=False, as_index=False)['stars_review']
    .mean()
)

rows = pair_ratings['user_id'].map(user_id_map)
cols = pair_ratings['business_id'].map(business_id_map)
data = pair_ratings['stars_review']
user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(users), len(businesses)))
print(f"Sparse matrix shape: {user_item_matrix.shape}, non-zero elements: {user_item_matrix.nnz}")

Sparse matrix shape: (148394, 70480), non-zero elements: 768999


In [22]:
# Function to compute UserCF predictions within clusters using sparse matrix
def predict_usercf_clustered(user_id, business_id, user_item_matrix, user_features, cluster_id, user_id_map, business_id_map, k=10):
    """Predict one user's rating of one business from similar users in a cluster.

    The prediction is the similarity-weighted average of the ratings given to
    ``business_id`` by the ``k`` users most similar (cosine similarity of rating
    rows) to ``user_id`` among the members of ``cluster_id``.

    Parameters
    ----------
    user_id, business_id
        Ids of the target user and business.
    user_item_matrix
        Sparse user x business rating matrix; a value of 0 means "not rated".
    user_features
        DataFrame with at least the columns ``'user_id'`` and ``'cluster'``.
    cluster_id
        Cluster whose members are used as candidate neighbours.
    user_id_map, business_id_map
        Mappings from id to row / column index in ``user_item_matrix``.
    k : int, default 10
        Maximum number of neighbours used for the prediction.

    Returns
    -------
    float
        The predicted rating, or the mean of all stored ratings when the user or
        the business is unknown, when there is no other user to compare with, or
        when none of the selected neighbours rated the business.
    """
    # Cold-start for an unseen user or business: checked first so no similarity
    # work is done for a prediction that cannot use it.
    if user_id not in user_id_map or business_id not in business_id_map:
        return np.mean(user_item_matrix.data)

    user_idx = user_id_map[user_id]
    business_idx = business_id_map[business_id]

    # Matrix rows of the cluster members that are present in the rating matrix
    cluster_users = user_features.loc[user_features['cluster'] == cluster_id, 'user_id']
    cluster_indices = [user_id_map[uid] for uid in cluster_users if uid in user_id_map]

    if len(cluster_indices) <= 1:  # If cluster is too small, use all users
        cluster_indices = range(user_item_matrix.shape[0])

    # The target user always has similarity 1.0 with themselves; keeping them
    # would waste one of the k neighbour slots without adding information.
    neighbour_indices = np.array([idx for idx in cluster_indices if idx != user_idx], dtype=int)
    if neighbour_indices.size == 0:
        return np.mean(user_item_matrix.data)

    # Cosine similarity between the target user's row and every candidate row
    similarities = np.asarray(
        cosine_similarity(user_item_matrix[user_idx], user_item_matrix[neighbour_indices])
    ).ravel()

    # Top-k most similar neighbours; the stable sort keeps ties in input order
    top_k = np.argsort(-similarities, kind='stable')[:k]
    top_k_similarities = similarities[top_k]

    # Ratings those neighbours gave to the target business (0 = not rated)
    ratings = user_item_matrix[neighbour_indices[top_k], business_idx].toarray().ravel()

    # Only neighbours that actually rated the business contribute
    rated = ratings > 0
    sim_sum = top_k_similarities[rated].sum()
    if sim_sum > 0:
        return np.dot(top_k_similarities[rated], ratings[rated]) / sim_sum
    return np.mean(user_item_matrix.data)

In [23]:
# Generate recommendations for test users.
#
# Every business is scored for a user in ONE pass. The user's neighbourhood
# (cluster members, cosine similarities, top-k neighbours) does not depend on the
# business being scored, so it is computed once per user instead of once per
# (user, business) pair.
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()
recommendations = {}
k_recommendations = 300  # length of each user's recommendation list
k_neighbors = 10         # number of most-similar users used per prediction
max_users = min(1000, len(test_data_grouped))

n_users, n_businesses = user_item_matrix.shape
global_mean = np.mean(user_item_matrix.data)  # fallback score when no neighbour rated a business

# Column index -> business id (inverse of business_id_map)
business_ids_by_col = np.array(sorted(business_id_map, key=business_id_map.get), dtype=object)

# user id -> cluster label, and cluster label -> matrix rows of its members
cluster_by_user = dict(zip(user_features['user_id'], user_features['cluster']))
rows_by_cluster = {}
for member_id, member_cluster in cluster_by_user.items():
    member_row = user_id_map.get(member_id)
    if member_row is not None:  # users without training reviews have no matrix row
        rows_by_cluster.setdefault(member_cluster, []).append(member_row)


def _score_all_businesses(matrix, user_idx, candidate_rows, k, fallback):
    """Return the predicted rating of every business column for one user.

    Each score is the similarity-weighted mean of the ratings given by the k
    rows of `candidate_rows` most similar to `user_idx` (the user's own row is
    excluded); columns none of those neighbours rated get `fallback`.
    """
    scores = np.full(matrix.shape[1], fallback, dtype=float)
    neighbour_rows = candidate_rows[candidate_rows != user_idx]
    if neighbour_rows.size == 0:
        return scores

    sims = np.asarray(cosine_similarity(matrix[user_idx], matrix[neighbour_rows])).ravel()
    top = np.argsort(-sims, kind='stable')[:k]
    top_sims = sims[top]
    top_ratings = matrix[neighbour_rows[top]]  # (neighbours x businesses), sparse

    # Per business: sum(sim * rating) and sum(sim) over neighbours that rated it
    weighted_sums = np.asarray(top_ratings.T.dot(top_sims)).ravel()
    sim_sums = np.asarray((top_ratings > 0).astype(float).T.dot(top_sims)).ravel()

    rated = sim_sums > 0
    scores[rated] = weighted_sums[rated] / sim_sums[rated]
    return scores


all_rows = np.arange(n_users)
all_cols = np.arange(n_businesses)

for user_id in test_data_grouped['user_id'].iloc[:max_users]:
    if user_id in user_id_map:
        user_idx = user_id_map[user_id]
        cluster_rows = np.asarray(rows_by_cluster.get(cluster_by_user[user_id], []), dtype=int)
        if cluster_rows.size <= 1:  # If cluster is too small, use all users
            cluster_rows = all_rows
        scores = _score_all_businesses(user_item_matrix, user_idx, cluster_rows, k_neighbors, global_mean)
        # Candidates: businesses the user has not reviewed in the training data
        candidate_cols = np.setdiff1d(all_cols, user_item_matrix[user_idx].indices)
    else:
        # Cold-start user: every business gets the global mean rating
        scores = np.full(n_businesses, global_mean, dtype=float)
        candidate_cols = all_cols

    # Sort by predicted rating (descending) and take top k
    best = np.argsort(-scores[candidate_cols], kind='stable')[:k_recommendations]
    top_cols = candidate_cols[best]
    recommendations[user_id] = (business_ids_by_col[top_cols].tolist(), scores[top_cols].tolist())

KeyboardInterrupt: 

In [None]:
# Evaluation function (same as before)
def check_recommendations(recommendations, test_data_grouped, pos=4, test_reviews=None):
    """Compare recommendations with held-out reviews and count outcomes.

    Parameters
    ----------
    recommendations : dict
        Maps user_id to a tuple whose first element is the ordered list of
        recommended business ids.
    test_data_grouped : DataFrame
        One row per user with columns ``'user_id'`` and ``'business_id'`` (the
        list of businesses that user reviewed in the test set).
    pos : number, default 4
        A test review counts as positive when its star rating is >= ``pos``.
    test_reviews : DataFrame, optional
        Test reviews with columns ``'user_id'``, ``'business_id'`` and
        ``'stars_review'``. Defaults to the notebook-level ``test_data``.

    Returns
    -------
    tuple
        ``(true_positive, true_negative, false_positive, false_negative, total,
        total_positive, ranks)``. Counts only cover users that have an entry in
        ``recommendations``. ``ranks`` has one entry per row of
        ``test_data_grouped``: the 1-based position of the last test business
        found in the user's recommendation list, or 0 if none was found.

    Raises
    ------
    KeyError
        If a (user, business) pair from ``test_data_grouped`` has no review in
        ``test_reviews``.
    """
    if test_reviews is None:
        test_reviews = test_data  # notebook-level test split

    # Build the (user, business) -> stars lookup once, instead of scanning the
    # whole review table for every single test review. When a pair occurs more
    # than once, the first review is used.
    first_reviews = test_reviews.drop_duplicates(subset=['user_id', 'business_id'], keep='first')
    star_lookup = dict(zip(
        zip(first_reviews['user_id'], first_reviews['business_id']),
        first_reviews['stars_review'],
    ))

    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for user_id, business_ids in zip(test_data_grouped['user_id'], test_data_grouped['business_id']):
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            # business id -> 1-based position of its first occurrence in the list
            position = {}
            for place, recommended_id in enumerate(recommended_businesses, start=1):
                position.setdefault(recommended_id, place)

            for business_id in business_ids:
                star_rating = star_lookup[(user_id, business_id)]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in position:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    rank = position[business_id]
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [None]:
# Score the generated recommendations against the held-out test reviews and
# unpack the confusion counts, totals and per-user ranks.
(
    true_positive,
    true_negative,
    false_positive,
    false_negative,
    total,
    total_positive,
    ranks,
) = check_recommendations(recommendations, test_data_grouped)

In [None]:
# Turn the raw counts and ranks into three displayable results: evaluation
# metrics, a confusion matrix and background statistics.
# NOTE(review): compute_evaluation_metric is not defined in this notebook;
# presumably it comes from `from utilities import *` — confirm its expected inputs.
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [None]:
# Show each result table under its heading, in a fixed order:
# background statistics, evaluation metrics, confusion matrix.
for _section_title, _section_table in (
    ("Testing Data Statistics", background_stats),
    ("Evaluation Metrics", evaluation_metric),
    ("Confusion Matrix", confusion_matrix),
):
    print(_section_title)
    display(_section_table)