In [18]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
import faiss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input, Layer, Lambda
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model


In [19]:
class CategoryPoolingLayer(Layer):
    def __init__(self, **kwargs):
        super(CategoryPoolingLayer, self).__init__(**kwargs)

    def call(self, inputs):
        return tf.reduce_mean(inputs, axis=1)

In [20]:
# Path to saved models and encoders
save_folder_path = 'DSSM_Models/Triplet_Hinge_Loss/'

# Load the saved models
user_model = load_model(save_folder_path + 'user_model.keras')
item_model = load_model(save_folder_path + 'item_model.keras',                
        custom_objects={'CategoryPoolingLayer': CategoryPoolingLayer}
    )

# Load the saved label encoders
with open(save_folder_path + 'user_id_encoder.pkl', 'rb') as f:
    user_id_encoder = pickle.load(f)

with open(save_folder_path + 'business_id_encoder.pkl', 'rb') as f:
    business_id_encoder = pickle.load(f)

with open(save_folder_path + 'categories_encoder.pkl', 'rb') as f:
    categories_encoder = pickle.load(f)

# Load the saved scalers
with open(save_folder_path + 'user_scaler.pkl', 'rb') as f:
    user_scaler = pickle.load(f)

with open(save_folder_path + 'business_scaler.pkl', 'rb') as f:
    business_scaler = pickle.load(f)

In [21]:
# Define the database folder path and file names
db_folder = '../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db', 'yelp_user_data.db', 'yelp_tip_data.db']
db_paths = [db_folder + db_file for db_file in db_files]

In [22]:
# Connect to the databases and load data
def load_data_from_db():
    data = {}
    
    # Open connections and read tables
    conns = [sqlite3.connect(db_path) for db_path in db_paths]
    try:
        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
        data['user'] = pd.read_sql_query("SELECT * FROM user_data", conns[2])
        data['tip'] = pd.read_sql_query("SELECT * FROM tip_data", conns[3])
        
    finally:
        # Close all database connections
        for conn in conns:
            conn.close()

    return data

In [23]:
# Load data into a dictionary
yelp_data = load_data_from_db()

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [24]:
user_con_feature_lst = [
                        'review_count', 
                        'useful', 
                        'funny', 
                        'cool', 
                        'fans', 
                        'average_stars'
                        ]
business_con_feature_lst = [
                        'stars', 
                        'review_count', 
                        # 'latitude', 
                        # 'longitude'
                        ]

# add user features that start with 'compliment_'
user_compliment_feature_lst = [col for col in yelp_data['user'].columns if 'compliment_' in col]
user_con_feature_lst += user_compliment_feature_lst

In [25]:
# Preprocess user data
user_df = yelp_data['user']
user_df['yelping_since'] = pd.to_datetime(user_df['yelping_since'])

# Preprocess business data
business_df = yelp_data['business']
business_df['is_open'] = business_df['is_open'].fillna(0).astype(int)

# Preprocess review data
review_df = yelp_data['review']
# Create labels for review data
review_df['label'] = (review_df['stars'] >= 4).astype(int)

# Preprocess tip data
tip_df = yelp_data['tip']

# Preprocess categories data
categories_df = yelp_data['categories']


In [26]:
# Filter out unseen user_id and business_id
review_df = review_df[
    (review_df['user_id'].isin(user_id_encoder.classes_)) & 
    (review_df['business_id'].isin(business_id_encoder.classes_))
]

# Encode user_id and business_id
review_df['user_id_encoded'] = user_id_encoder.transform(review_df['user_id'])
review_df['business_id_encoded'] = business_id_encoder.transform(review_df['business_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['user_id_encoded'] = user_id_encoder.transform(review_df['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['business_id_encoded'] = business_id_encoder.transform(review_df['business_id'])


In [27]:
# Filter out unseen categories
categories_df = categories_df[categories_df['category'].isin(categories_encoder.classes_)]
categories_df['category_encoded'] = categories_encoder.transform(categories_df['category'])

categories_grouped = categories_df.groupby('business_id')['category_encoded'].apply(list).reset_index()

# Merge the categories with the business data, name the column 'category_encoded'
business_df = business_df.merge(categories_grouped, on='business_id', how='left')

In [28]:
# Count the number of reviews and average review for each business
business_review_count = review_df.groupby('business_id').size()
business_avg_review = review_df.groupby('business_id')['stars'].mean()
business_df['review_count'] = business_review_count
business_df['avg_review'] = business_avg_review # similar to stars, but this is adjusted for the number of reviews extracted


In [29]:

# Example: Extract numerical features for embedding
user_features = user_df[user_con_feature_lst].fillna(0)
# Example: Extract numerical features
business_features = business_df[business_con_feature_lst].fillna(0)


In [30]:
user_df['user_id_encoded'] = user_id_encoder.fit_transform(user_df['user_id'])
business_df['business_id_encoded'] = business_id_encoder.fit_transform(business_df['business_id'])

# Save number of unique users and businesses for embedding input_dim
num_users = user_df['user_id_encoded'].max() + 1
num_businesses = business_df['business_id_encoded'].max() + 1


In [31]:
# Standardize user continuous features
user_continuous_features = user_df[['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']].fillna(0)
user_scaler = StandardScaler()
user_continuous_features_scaled = user_scaler.fit_transform(user_continuous_features)

# Standardize business continuous features
business_continuous_features = business_df[['stars', 'review_count', 'latitude', 'longitude']].fillna(0)
business_scaler = StandardScaler()
business_continuous_features_scaled = business_scaler.fit_transform(business_continuous_features)

# Ensure continuous features are pandas DataFrames
user_continuous_features_scaled = pd.DataFrame(user_continuous_features_scaled, index=user_df['user_id_encoded'])
business_continuous_features_scaled = pd.DataFrame(business_continuous_features_scaled, index=business_df['business_id_encoded'])

In [32]:
business_category_map = business_df.set_index('business_id_encoded')['category_encoded']

In [33]:
# Split review_df into train and test sets
train_df, test_df = train_test_split(review_df, test_size=0.2, random_state=42)

In [34]:
# Step 1: Prepare the Faiss index for business embeddings
def create_faiss_index(item_model, business_ids, business_cont_features, business_category_map, max_category_length=5):
    business_categories = business_category_map.loc[business_ids].apply(
        lambda x: x if isinstance(x, list) else []
    )
    business_category_padded = pad_sequences(business_categories.tolist(), maxlen=max_category_length, padding="post")

    # Predict embeddings
    business_embeddings = item_model.predict([business_ids, business_category_padded, business_cont_features])

    # Create a Faiss index for L2 similarity
    index = faiss.IndexFlatL2(business_embeddings.shape[1])  # Assuming 16-dimensional embeddings
    index.add(business_embeddings)
    return index, business_embeddings

business_ids = business_continuous_features_scaled.index.values
faiss_index, business_embeddings = create_faiss_index(
    item_model, business_ids, 
    business_continuous_features_scaled.values, 
    business_category_map
)

[1m2440/2440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 991us/step


In [35]:
# Step 2: Query top-k businesses for a given user
def query_top_k(user_id, user_model, faiss_index, k=5):
    # Encode user_id and get continuous features
    user_id_encoded = user_id_encoder.transform([user_id])[0]
    user_cont_features = user_scaler.transform(
        user_continuous_features_scaled.loc[[user_id_encoded]].values
    )

    # Predict the user's embedding
    user_embedding = user_model.predict([np.array([user_id_encoded]), user_cont_features])

    # Perform ANN search using Faiss
    distances, indices = faiss_index.search(user_embedding, k)

    # Return top-k businesses and distances
    top_k_business_ids = business_ids[indices.flatten()]
    return top_k_business_ids, distances.flatten()

In [36]:
test_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,label,user_id_encoded,business_id_encoded
238774,JellhrJZmZeWbmpGk1ox1A,9HQLEChkam3GMBQn0SmvVw,GvmLqW2tMkQ7F2hhOB8vSw,5.0,2017-01-01 17:38:18,My mother in law lives by Tucson Estates so at...,2,0,1,1,36725,22025
763006,yU4OqzNxSTlIh1liOTyXSQ,GKCzx6kfI1roSojfoPFsfA,tsx84z4c0B-y6J5fqfvBqg,4.0,2019-07-18 23:41:12,The food was really good. There was no wait fo...,1,0,0,1,62034,70580
878504,08lb0_fFuyCc01X21E_R9Q,eTzE7DauSODqmviZ5YfCTg,tM32Az6rP1L_flhNcObl0w,3.0,2021-11-19 01:42:06,"I think it's on the way to four stars, but not...",3,0,0,0,152054,69948
400581,U182l8gkfSM4xm7r8kRSeg,pPoQ0qeWVDzvdmJAFfH70g,0Xm1wedwnMJ1iKXz8vUDSw,5.0,2014-10-26 17:19:11,"Had the scotch eggs, corned beef and cabbage a...",5,1,0,1,191193,1967
446141,JLiLDkATwZxjnfAfCfF8rA,FCI2XcNy9zYQo0EV_pEBQQ,zuDXSlqm2veuwTD_Kl-pkw,5.0,2019-12-03 01:48:30,I thought working at Panera in high school was...,1,0,1,1,58034,77940


In [38]:
# Step 3: Example usage
user_id = "9HQLEChkam3GMBQn0SmvVw"  # Replace with an actual user_id from your dataset
top_k_business_ids, scores = query_top_k(user_id, user_model, faiss_index, k=100)

# Decode business IDs back to their original format
decoded_business_ids = business_id_encoder.inverse_transform(top_k_business_ids)
result_df = pd.DataFrame({
    'business_id': decoded_business_ids,
    'similarity_score': scores
})

print(result_df)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
               business_id  similarity_score
0   tttwCEpskb7HdSQS8szUFA          0.529427
1   iSnPc_3IHqywTvi9IQqxew          0.531635
2   C43IEVBroD_3YiumPPLLdQ          0.533646
3   lFYqN66bnwx8MiaIAtesoA          0.536562
4   iksVwRfpWymIUUFqw0tXpw          0.541931
..                     ...               ...
95  K-t2yan_iLwcxYf7-1Or5w          0.609884
96  f8WKIeT7HMAOedo54Nrd7Q          0.610242
97  ZgTXA6x_FX_KkRKu9KXy8A          0.610531
98  PxCyMdTBylodcabsidGDLA          0.610602
99  01YJCek52uMnAfmBbloX8A          0.610736

[100 rows x 2 columns]




In [3]:
index = faiss.IndexFlatL2(128)
index.add(data)

In [4]:
D, I = index.search(data[:5], 10)  # search for the 10 nearest neighbors of the first 5 vectors
print(I)  # Output the indices of the nearest neighbors

[[ 0 20 46 76  7 81 55 34 36 93]
 [ 1 14 49 68 70 62 42 67 93  8]
 [ 2 99 21 48 67 71 34 38 35 18]
 [ 3 91 50 79 14 70 81 35 60  7]
 [ 4 93 40 28 49  6 79 67 55 64]]
