In [1]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import hdbscan
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score


### Top 5 categories per user

In [2]:
# Location of the processed Yelp SQLite databases and the files to open
db_folder = '../../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']
# Plain string concatenation: db_folder already ends with a path separator
db_paths = [f"{db_folder}{file_name}" for file_name in db_files]

In [3]:
# Connect to the databases and load data
def load_data_from_db(paths=None):
    """Load the business, category and review tables into DataFrames.

    Parameters:
        paths (sequence of str, optional): SQLite file paths. The first one must
            contain the tables ``business_details`` and ``business_categories``,
            the second one the table ``review_data``. Defaults to the
            module-level ``db_paths``.

    Returns:
        dict: Maps 'business', 'categories' and 'review' to a DataFrame holding
            every row of the corresponding table.
    """
    if paths is None:
        paths = db_paths

    data = {}
    conns = []
    try:
        # Connect inside the try block so that a failure while opening a later
        # database still closes the connections that were already opened.
        for db_path in paths:
            conns.append(sqlite3.connect(db_path))

        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        # Close every connection that was opened, whether or not loading succeeded
        for conn in conns:
            conn.close()
    return data

In [4]:
# Read every table once and keep the resulting DataFrames keyed by table name
yelp_data = load_data_from_db()

# Sanity check: report how many rows each table contributed
for table in yelp_data:
    df = yelp_data[table]
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [5]:
# Unpack the loaded tables: business details, business categories, review data
df_business, df_categories, df_review = (
    yelp_data[table_name] for table_name in ('business', 'categories', 'review')
)


In [6]:
# Attach business categories to the reviews via business_id. The left join keeps
# every review, including those whose business has no category row.
all_merged_df = df_review.merge(df_categories, on='business_id', how='left')

In [7]:
# Keep only the identifier, rating and category columns
all_merged_df = all_merged_df.loc[
    :, ['review_id', 'business_id', 'user_id', 'stars', 'category']
]
all_merged_df

Unnamed: 0,review_id,business_id,user_id,stars,category
0,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Restaurants
1,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Gluten-Free
2,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Bars
3,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Food
4,f1MJKwk8Nk2MNxns6-tvfg,8yR12PNSMo6FBYx1u5KPlw,BDwy_2vFLT2urUhvNrKDLQ,4.0,Nightlife
...,...,...,...,...,...
5225694,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Beer Bar
5225695,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Wine Bars
5225696,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Bars
5225697,5RSkQpctcinv77VKyXgh2Q,g7-sn0P60uwbOJejnoRIQA,lHqA_D7nAK9xw_Xg08dM_w,4.0,Cideries


In [10]:
# Count how often each category occurs across all review rows (value_counts
# sorts by descending frequency), then drop the two most popular categories
# by position.
overall_category_freq = all_merged_df['category'].value_counts().iloc[2:]
overall_category_freq

category
Nightlife                 247762
Bars                      234812
American (Traditional)    157952
American (New)            156821
Breakfast & Brunch        130938
                           ...  
Cheerleading                   1
Trade Fairs                    1
Duplication Services           1
Apartment Agents               1
Ranches                        1
Name: count, Length: 1237, dtype: int64

In [12]:
# Export the overall category frequencies to an Excel file
# (one row per category, with the category as the index and its count as the value)
overall_category_freq.to_excel('../../src/data_processing/overall_category_freq.xlsx')

In [9]:
# Names of the 100 most frequent categories in overall_category_freq
top_100_categories = list(overall_category_freq.nlargest(100).index)
# Share of each category among all counted category occurrences (sums to 1)
overall_category_prop = overall_category_freq.div(overall_category_freq.sum())

In [11]:
# Minimum number of reviews a user must have written to be kept
MIN_REVIEWS_PER_USER = 2

# Number of reviews written by each user (value_counts sorts most active first)
user_review_counts = df_review['user_id'].value_counts()
# Keep users with at least MIN_REVIEWS_PER_USER reviews.
# NOTE(review): the previous comment said "less than 5 reviews" while the code
# used a threshold of 2; the code's behaviour is preserved — confirm which was intended.
filtered_users = user_review_counts[user_review_counts >= MIN_REVIEWS_PER_USER].index

In [12]:
# Inspect the review counts of the first 1000 users; value_counts() orders
# the Series from most to fewest reviews, so these are the most active users
user_review_counts.head(1000)

user_id
_BcWyKQL16ndpBdggh2kNA    953
ET8n-r7glWYqZhuR6GcdNw    576
Xw7ZjaGfr0WNVt6s_5KZfA    550
0Igx-a1wAstiBDerGxXk2A    541
1HM81n6n4iPIFU5d2Lokhw    535
                         ... 
BewDV2ISn0uAMLcsyrhiZg     77
O_-EMEYwYSq-_v_7NlS5iA     77
PAc93PtEbYDtytBQ9Dyjug     77
Mbw8FWl4dok7ezTRdC8HOw     77
iHB75LCfFkIC1Oj86CqT1A     77
Name: count, Length: 1000, dtype: int64

In [13]:
# Restrict the reviews to the retained users, then attach business categories
filtered_df_review = df_review.loc[df_review['user_id'].isin(filtered_users)]
merged_df = filtered_df_review.merge(df_categories, on='business_id', how='left')
# Count rows per (user, category) pair and pivot the categories into columns,
# filling pairs that never occur with 0
user_category_matrix = (
    merged_df
    .groupby(['user_id', 'category'])
    .size()
    .unstack(fill_value=0)
)

In [14]:
# Smoothing weight: alpha times the global category share is added to each user's counts
alpha = 3.0
# The Series is aligned on the matrix columns; a category present on only one
# side comes out as NaN for every user.
# NOTE(review): this presumably keeps the two categories dropped from
# overall_category_freq out of the per-user ranking — confirm this is intended.
smoothed_matrix = user_category_matrix.add(alpha * overall_category_prop, axis='columns')

In [15]:
# For every user (row), the five categories with the highest smoothed score, best first
user_top_categories = smoothed_matrix.apply(
    lambda scores: list(scores.nlargest(5).index),
    axis=1,
)
user_top_categories

In [None]:
# Read the whole user_data table through a raw cursor. conn and cursor are left
# open here and are closed by a later cell.
conn = sqlite3.connect('../../data/processed_data/yelp_data/yelp_user_data.db')
cursor = conn.cursor()
cursor.execute("SELECT * FROM user_data")
# The column name is the first field of every cursor.description entry
cols = [description[0] for description in cursor.description]
user_data = cursor.fetchall()
user_df = pd.DataFrame(user_data, columns=cols)
display(user_df.head())

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,fans,average_stars,friends,...,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,categories
0,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,3138,3.74,,...,184,157,251,1847,7054,3131,3131,1521,1946,"[""Nightlife"", ""Hotels & Travel"", ""Local Flavor..."
1,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,28,4.27,,...,1,6,2,12,16,26,26,10,9,"[""Shopping"", ""Fashion"", ""Arts & Crafts"", ""Arts..."
2,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1221,2005-03-14 20:26:35,14953,9940,11211,1357,3.85,,...,191,361,147,1212,5696,2543,2543,815,323,"[""Mexican"", ""Nightlife"", ""Bars"", ""Breakfast & ..."
3,AUi8MPWJ0mLkMfwbui27lg,John,109,2010-01-07 18:32:04,154,20,23,4,3.4,,...,0,0,0,1,6,3,3,0,0,"[""Event Planning & Services"", ""Hotels & Travel..."
4,xoZvMJPDW6Q9pDAXI0e_Ww,Ryan,535,2009-05-27 06:12:10,1130,487,573,31,3.89,,...,1,3,5,31,41,36,36,24,7,"[""Arts & Entertainment"", ""Hotels & Travel"", ""C..."


In [None]:
# Add the new column (will be NULL for all existing rows)
# cursor.execute("ALTER TABLE user_data ADD COLUMN categories TEXT;")
# conn.commit()


In [None]:
# import json
# # Wrap all updates in a single transaction for speed
# cursor.execute("BEGIN TRANSACTION;")
# # Prepare the update statement
# update_sql = "UPDATE user_data SET categories = ? WHERE user_id = ?;"
# for user_id, cat_list in user_top_categories.items():
#     # Serialize the list to JSON (or you could use ','.join(cat_list))
#     cat_json = json.dumps(cat_list, ensure_ascii=False)
#     cursor.execute(update_sql, (cat_json, user_id))
# conn.commit()

In [None]:
# Close the user database connection. close() does not commit by itself, so
# any pending changes must have been committed before this point.
conn.close()

In [19]:
# Export each user's top-category list (indexed by user) to an Excel file
user_top_categories.to_excel('../../src/data_processing/user_top_categories.xlsx')


### Clustering

In [97]:
# Learn the category vocabulary from the users' top-category lists, then persist
# the fitted encoder so new category lists can be encoded the same way later
mlb = MultiLabelBinarizer().fit(user_top_categories.values)
with open('users_categories_encoder.pkl', 'wb') as f:
    pickle.dump(mlb, f)

In [None]:
# # 1 clusterer = hdbscan.HDBSCAN(metric='jaccard', min_samples=1, min_cluster_size=5)
# # 2 clusterer = hdbscan.HDBSCAN(metric='jaccard', min_samples=1, min_cluster_size=10)
# clusterer = hdbscan.HDBSCAN(metric='jaccard', min_samples=3, min_cluster_size=5, cluster_selection_epsilon=0.3)
# labels = clusterer.fit_predict(encoded_data)

# # run time (36, 30, 50 minutes)

In [None]:
# label_df = pd.DataFrame({'user_id': user_top_categories.index, 'label': labels})
# label_df.to_excel('../../src/data_processing/user_labels.xlsx')

In [105]:
# Read the saved per-user cluster assignments from clustered_users.xlsx
# (presumably written by the commented-out export cell further down — confirm)
clustered_user = pd.read_excel('../../src/data_processing/clustered_users.xlsx')

In [98]:
# Encode every user's top-category list with the fitted MultiLabelBinarizer
# (one row per user, one indicator column per known category)
encoded_data = mlb.transform(user_top_categories.values)

In [None]:
# One row per user: the encoded category columns, with user_id placed first
clustered_user_df = pd.DataFrame(encoded_data, columns=mlb.classes_)
clustered_user_df.insert(0, 'user_id', user_top_categories.index)

# Bring in the saved cluster assignment of each user; the left join keeps
# users without a match
clustered_user_df = clustered_user_df.merge(clustered_user, on='user_id', how='left')

# Persist the combined frame as a pickle file for later lookups
clustered_user_df.to_pickle('clustered_user_df.pkl')

In [126]:
def find_nearest_neighbor(encoded_try_list: np.ndarray, clustered_user_df: pd.DataFrame):
    """
    Look up the cluster of the nearest known user for each encoded query record.

    Every row of ``clustered_user_df`` (without its 'user_id' and 'cluster' columns)
    is used as a feature vector. For each row of ``encoded_try_list`` the single
    closest feature vector under the Jaccard metric is found, and the 'cluster'
    value stored on that row is reported.

    Parameters:
        encoded_try_list (np.ndarray): Array with one encoded user category record
            per row, encoded with the same columns as the feature columns of
            clustered_user_df.
        clustered_user_df (pd.DataFrame): Frame with a 'user_id' column, a 'cluster'
            column, and the encoded category features in all remaining columns.

    Returns:
        list[dict]: One dict per input record, in input order, with the keys
            'input_index' (row position in encoded_try_list) and
            'nearest_cluster_id' (the 'cluster' value of the nearest row).

    Raises:
        ValueError: If clustered_user_df lacks the 'user_id' or 'cluster' column.
    """
    # Both columns are required below: they are excluded from the features, and
    # 'cluster' supplies the reported value.
    for required in ('user_id', 'cluster'):
        if required not in clustered_user_df.columns:
            raise ValueError(f"clustered_user_df must include a '{required}' column.")

    # Everything except the identifier and the label is an encoded feature
    features = clustered_user_df.drop(columns=['user_id', 'cluster']).values

    # Create the NearestNeighbors model using the Jaccard metric
    nn_model = NearestNeighbors(n_neighbors=1, metric='jaccard')
    nn_model.fit(features)

    # Only the neighbour positions are needed; distances are not part of the result
    indices = nn_model.kneighbors(encoded_try_list, return_distance=False)

    # Read the label straight from the column instead of building a full row per query
    cluster_column = clustered_user_df['cluster']
    return [
        {
            'input_index': idx,
            'nearest_cluster_id': cluster_column.iloc[ind[0]],
        }
        for idx, ind in enumerate(indices)
    ]

In [152]:
# Later, loading from the pickle file
# NOTE: unpickling can execute arbitrary code — only load files written by this notebook
clustered_user_df = pd.read_pickle('clustered_user_df.pkl')

# Two example top-5 category lists to map onto existing clusters.
# NOTE(review): 'Food' may be one of the two categories dropped from
# overall_category_freq; a category the encoder never saw is ignored by
# transform() (scikit-learn emits a warning) — confirm.
try_list = [['American (Traditional)', 'Sandwiches', 'Burgers', 'Fast Food', 'Pizza'],
            ['Nightlife', 'Delis', 'Food', 'Fast Food', 'Pizza'],]

# The with-block closes the file on exit, so no explicit close() is needed
with open('users_categories_encoder.pkl', 'rb') as f:
    user_category_encoder = pickle.load(f)

try_list = np.array(try_list)

encoded_try_list = user_category_encoder.transform(try_list)

# Despite its name, result_df is a list with one dict per input record
result_df = find_nearest_neighbor(encoded_try_list,clustered_user_df)
result_df[0]



{'input_index': 0, 'nearest_cluster_id': np.int64(2253)}

In [None]:
# # Step 3: Assign any noise points to the nearest cluster
# noise_indices = np.where(labels == -1)[0]  # Indices of users labeled as noise
# if len(noise_indices) > 0:
#     cluster_indices = np.where(labels != -1)[0]  # Indices of users in clusters
#     if len(cluster_indices) > 0:
#         # Fit NearestNeighbors on clustered points using Jaccard distance
#         nn = NearestNeighbors(n_neighbors=1, metric='jaccard')
#         nn.fit(encoded_data[cluster_indices])
#         # Find nearest clustered neighbor for each noise point
#         distances, indices = nn.kneighbors(encoded_data[noise_indices])
#         # Assign noise points to the cluster of their nearest neighbor
#         nearest_labels = labels[cluster_indices[indices.flatten()]]
#         labels[noise_indices] = nearest_labels
#     else:
#         # Rare edge case: all points are noise; assign all to cluster 0
#         labels[noise_indices] = 0



In [None]:
# # Step 4: Create a DataFrame with user IDs and cluster labels
# clustered_users = pd.DataFrame({'cluster': labels}, index=user_top_categories.index)

# # Output the result
# # print(clustered_users)

In [None]:
# clustered_users.to_excel('../../src/data_processing/clustered_users.xlsx')

### Evaluate the clusters

In [None]:

# # Assume 'encoded_data' is your multi-hot encoded data (47,000 x num_categories)
# # Assume 'labels' are the cluster labels for all 47,000 users

# # Sample 10% of the data
# sample_size = int(0.1 * encoded_data.shape[0])  # ~4,700 users
# sample_indices = np.random.choice(encoded_data.shape[0], sample_size, replace=False)
# sample_data = encoded_data[sample_indices]
# sample_labels = labels[sample_indices]

# # Compute Sampled Silhouette Score with Jaccard distance
# sil_score = silhouette_score(sample_data, sample_labels, metric='jaccard')

# print("Sampled Silhouette Score:", sil_score)



Sampled Silhouette Score: 0.359193194525424


In [None]:
# # Compute Davies-Bouldin Index on the entire dataset
# db_score = davies_bouldin_score(encoded_data, labels)

# print("Davies-Bouldin Index:", db_score)

Davies-Bouldin Index: 1.7984987603812381


### log

A Silhouette score closer to 1 is better; a smaller Davies-Bouldin index is better.

1. Silhouette: 0.3737, Davies-Bouldin: 1.6766
2. Silhouette: 0.2876259446432755, Davies-Bouldin: 1.8501757095975369
3. Silhouette: 0.3951916642699614, Davies-Bouldin: 1.7216348683124556