In [1]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import hdbscan
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score


### Top 5 categories per user

In [2]:
# Folder and file names of the pre-processed Yelp SQLite databases
db_folder = '../../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']
# db_folder already ends with '/', so plain concatenation gives the full path
db_paths = [f"{db_folder}{name}" for name in db_files]

In [3]:
# Connect to the databases and load data
def load_data_from_db(paths=None):
    """Load the business, category and review tables into DataFrames.

    Parameters
    ----------
    paths : sequence of str, optional
        SQLite database paths. The first entry must contain the
        ``business_details`` and ``business_categories`` tables, the second
        the ``review_data`` table. Defaults to the notebook-level ``db_paths``.

    Returns
    -------
    dict
        Keys ``'business'``, ``'categories'`` and ``'review'`` (in that
        order), each mapped to the DataFrame read from the matching table.

    Raises
    ------
    sqlite3.Error
        If a database cannot be opened.
    """
    if paths is None:
        paths = db_paths

    data = {}
    conns = []
    try:
        # Open connections inside the try block so that a failure on a later
        # path still closes the connections that were already opened.
        for db_path in paths:
            conns.append(sqlite3.connect(db_path))

        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        # Close every connection that was successfully opened
        for conn in conns:
            conn.close()
    return data

In [4]:
# Read the tables into a dictionary keyed by a short table name
yelp_data = load_data_from_db()

# Sanity check: report the row count of every loaded table
for table in yelp_data:
    df = yelp_data[table]
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [5]:
# Unpack the loaded tables into individually named frames:
#   df_business   - business details
#   df_categories - business categories
#   df_review     - review data
df_business, df_categories, df_review = (
    yelp_data[key] for key in ('business', 'categories', 'review')
)


In [6]:
# Left-join the categories onto the reviews via business_id
all_merged_df = df_review.merge(df_categories, how='left', on='business_id')

In [7]:
# Keep only the columns needed for the category analysis, then display the frame
kept_columns = ['review_id', 'business_id', 'user_id', 'stars', 'category']
all_merged_df = all_merged_df.loc[:, kept_columns]
all_merged_df

Unnamed: 0,review_id,business_id,user_id,stars,category
0,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Restaurants
1,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Gluten-Free
2,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Bars
3,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Food
4,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Nightlife
...,...,...,...,...,...
5225694,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Beer Bar
5225695,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Wine Bars
5225696,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Bars
5225697,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Cideries


In [8]:
# Count how often each category occurs across all review rows
category_counts = all_merged_df['category'].value_counts()
# remove two most popular categories (the first two entries of the counts)
overall_category_freq = category_counts.iloc[2:]
overall_category_freq

category
Nightlife                 247762
Bars                      234812
American (Traditional)    157952
American (New)            156821
Breakfast & Brunch        130938
                           ...  
Cheerleading                   1
Trade Fairs                    1
Duplication Services           1
Apartment Agents               1
Ranches                        1
Name: count, Length: 1237, dtype: int64

In [9]:
# Write the overall category frequencies to an Excel file
overall_category_freq.to_excel(excel_writer='../../src/data_processing/overall_category_freq.xlsx')

In [10]:
# Names of the 100 most frequent remaining categories
top_100_categories = list(overall_category_freq.nlargest(100).index)
# Each category's share of the total remaining category count
overall_category_prop = overall_category_freq.div(overall_category_freq.sum())

In [11]:
# Number of reviews written by each user
user_review_counts = df_review['user_id'].value_counts()
# Keep only users with at least MIN_REVIEWS_PER_USER reviews.
# NOTE: the earlier comment here said "less than 5 reviews" although the code
# filtered on 2; the code's threshold is kept so downstream results are unchanged.
MIN_REVIEWS_PER_USER = 2
filtered_users = user_review_counts[user_review_counts >= MIN_REVIEWS_PER_USER].index

In [12]:
# Show the first 1000 entries of the per-user review counts
user_review_counts.iloc[:1000]

user_id
_BcWyKQL16ndpBdggh2kNA    953
ET8n-r7glWYqZhuR6GcdNw    576
Xw7ZjaGfr0WNVt6s_5KZfA    550
0Igx-a1wAstiBDerGxXk2A    541
1HM81n6n4iPIFU5d2Lokhw    535
                         ... 
BewDV2ISn0uAMLcsyrhiZg     77
O_-EMEYwYSq-_v_7NlS5iA     77
PAc93PtEbYDtytBQ9Dyjug     77
Mbw8FWl4dok7ezTRdC8HOw     77
iHB75LCfFkIC1Oj86CqT1A     77
Name: count, Length: 1000, dtype: int64

In [13]:
# Restrict the reviews to the retained users
is_kept_user = df_review['user_id'].isin(filtered_users)
filtered_df_review = df_review.loc[is_kept_user]
# Attach the business categories to those reviews (left join on business_id)
merged_df = filtered_df_review.merge(df_categories, how='left', on='business_id')
# Rows = users, columns = categories, cells = number of (review, category) rows
user_category_counts = merged_df.groupby(['user_id', 'category']).size()
user_category_matrix = user_category_counts.unstack(fill_value=0)

In [14]:
# Additive smoothing: add `alpha` times the global category share to every user's counts
alpha = 3.0
category_prior = overall_category_prop.mul(alpha)
# NOTE(review): the prior is aligned to the matrix columns by category label; categories
# absent from overall_category_prop (e.g. the two most popular ones removed earlier)
# presumably end up as all-NaN columns — confirm that this exclusion is intended.
smoothed_matrix = user_category_matrix.add(category_prior, axis='columns')

In [15]:
def _top_five_categories(row):
    """Return the labels of the five largest values in one user's row as a list."""
    return row.nlargest(5).index.tolist()


# For every user (row), collect the five categories with the highest smoothed count
user_top_categories = smoothed_matrix.apply(_top_five_categories, axis=1)

In [16]:
# Write each user's top-5 category list to an Excel file
user_top_categories.to_excel(excel_writer='../../src/data_processing/user_top_categories.xlsx')

### Clustering

In [17]:
# Multi-hot encode each user's top-category list: one column per distinct category
mlb = MultiLabelBinarizer()
top_category_lists = user_top_categories.to_numpy()
encoded_data = mlb.fit_transform(top_category_lists)

In [28]:
# Dimensions of the encoded matrix: (rows = users, columns = distinct categories)
np.shape(encoded_data)

(99812, 698)

In [18]:
# Earlier HDBSCAN configurations, kept for reference:
# 1 clusterer = hdbscan.HDBSCAN(metric='jaccard', min_samples=1, min_cluster_size=5)
# 2 clusterer = hdbscan.HDBSCAN(metric='jaccard', min_samples=1, min_cluster_size=10)
# Current configuration (Jaccard distance on the multi-hot encoded rows)
hdbscan_params = {
    'metric': 'jaccard',
    'min_samples': 3,
    'min_cluster_size': 5,
    'cluster_selection_epsilon': 0.3,
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
# One label per row of encoded_data; -1 marks rows treated as noise
labels = clusterer.fit_predict(encoded_data)

# run time (36, 30, 50 minutes)

In [19]:
# Pair every user id with its cluster label and write the table to an Excel file
label_df = pd.DataFrame(dict(user_id=user_top_categories.index, label=labels))
label_df.to_excel(excel_writer='../../src/data_processing/user_labels.xlsx')

In [20]:
# Step 3: Assign any noise points to the nearest cluster
# Work on a copy: the array returned by fit_predict may be the very array stored on the
# clusterer, and writing into it in place would silently overwrite the raw HDBSCAN labels.
labels = labels.copy()
noise_mask = labels == -1
noise_indices = np.flatnonzero(noise_mask)     # Indices of users labeled as noise
cluster_indices = np.flatnonzero(~noise_mask)  # Indices of users in clusters
if noise_indices.size and cluster_indices.size:
    # Fit NearestNeighbors on clustered points using Jaccard distance
    nn = NearestNeighbors(n_neighbors=1, metric='jaccard')
    nn.fit(encoded_data[cluster_indices])
    # Find nearest clustered neighbor for each noise point
    distances, indices = nn.kneighbors(encoded_data[noise_indices])
    # `indices` points into the clustered subset; map back to rows of the full array
    nearest_labels = labels[cluster_indices[indices.ravel()]]
    labels[noise_indices] = nearest_labels
elif noise_indices.size:
    # Rare edge case: all points are noise; assign all to cluster 0
    labels[noise_indices] = 0



In [21]:
# Step 4: Create a DataFrame with user IDs and cluster labels
# (indexed like user_top_categories, with a single 'cluster' column)
clustered_users = pd.DataFrame(data=dict(cluster=labels), index=user_top_categories.index)

# Output the result
# print(clustered_users)

In [22]:
# Write the final user-to-cluster assignment to an Excel file
clustered_users.to_excel(excel_writer='../../src/data_processing/clustered_users.xlsx')

### Evaluate the clusters

In [23]:

# 'encoded_data' is the multi-hot encoded data (one row per user, one column per category)
# 'labels' holds one cluster label per row of 'encoded_data'

# Sample 10% of the rows to keep the silhouette computation affordable.
# A fixed seed makes the sampled score reproducible from run to run.
SILHOUETTE_SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SILHOUETTE_SAMPLE_SEED)
sample_size = int(0.1 * encoded_data.shape[0])
sample_indices = sample_rng.choice(encoded_data.shape[0], size=sample_size, replace=False)
sample_data = encoded_data[sample_indices]
sample_labels = labels[sample_indices]

# Compute Sampled Silhouette Score with Jaccard distance
sil_score = silhouette_score(sample_data, sample_labels, metric='jaccard')

print("Sampled Silhouette Score:", sil_score)



Sampled Silhouette Score: 0.359193194525424


In [24]:
# Compute Davies-Bouldin Index on the entire dataset
# NOTE(review): no metric is passed here, so this score is presumably Euclidean-based
# while the clustering used Jaccard distance — confirm the two are comparable.
db_score = davies_bouldin_score(X=encoded_data, labels=labels)

print("Davies-Bouldin Index:", db_score)

Davies-Bouldin Index: 1.7984987603812381


### log

Silhouette score: closer to 1 is better. Davies-Bouldin index: lower is better.

1. Silhouette: 0.3737, Davies-Bouldin: 1.6766
2. Silhouette: 0.2876259446432755, Davies-Bouldin: 1.8501757095975369
3. Silhouette: 0.3951916642699614, Davies-Bouldin: 1.7216348683124556