In [41]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import layers, Model, backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf

In [2]:
# Define the database folder path and file names
db_folder = '../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db', 'yelp_user_data.db', 'yelp_tip_data.db']
db_paths = [db_folder + db_file for db_file in db_files]

In [3]:
# Connect to the databases and load data
def load_data_from_db():
    data = {}
    
    # Open connections and read tables
    conns = [sqlite3.connect(db_path) for db_path in db_paths]
    try:
        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
        data['user'] = pd.read_sql_query("SELECT * FROM user_data", conns[2])
        data['tip'] = pd.read_sql_query("SELECT * FROM tip_data", conns[3])
        
    finally:
        # Close all database connections
        for conn in conns:
            conn.close()

    return data

In [9]:
# Load data into a dictionary
yelp_data = load_data_from_db()

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [10]:
# Explore the data
# for table, df in yelp_data.items():
#     print(f"{table}:\n")
#     print(df.head(), "\n")
#     print(df.info(), "\n")

# Preprocess user data
user_df = yelp_data['user']
user_df['yelping_since'] = pd.to_datetime(user_df['yelping_since'])

# Example: Extract numerical features for embedding
user_features = user_df[['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']].fillna(0)

# Preprocess business data
business_df = yelp_data['business']
business_df['is_open'] = business_df['is_open'].fillna(0).astype(int)

# Example: Extract numerical features
business_features = business_df[['stars', 'review_count', 'latitude', 'longitude']].fillna(0)


In [11]:
# Encode user_id and business_id
user_id_encoder = LabelEncoder()
business_id_encoder = LabelEncoder()

user_df['user_id_encoded'] = user_id_encoder.fit_transform(user_df['user_id'])
business_df['business_id_encoded'] = business_id_encoder.fit_transform(business_df['business_id'])

# Save number of unique users and businesses for embedding input_dim
num_users = user_df['user_id_encoded'].max() + 1
num_businesses = business_df['business_id_encoded'].max() + 1


In [None]:
# # Example: Encode 'city' as a discrete feature for businesses
# business_city_encoder = LabelEncoder()
# business_df['city_encoded'] = business_city_encoder.fit_transform(business_df['city'])

# # Save number of unique cities for embedding input_dim
# num_cities = business_df['city_encoded'].max() + 1


In [12]:
# Standardize user continuous features
user_continuous_features = user_df[['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']].fillna(0)
user_scaler = StandardScaler()
user_continuous_features_scaled = user_scaler.fit_transform(user_continuous_features)

# Standardize business continuous features
business_continuous_features = business_df[['stars', 'review_count', 'latitude', 'longitude']].fillna(0)
business_scaler = StandardScaler()
business_continuous_features_scaled = business_scaler.fit_transform(business_continuous_features)


In [13]:
def create_embedding_layer(input_dim, output_dim, name):
    """Reusable function to create an embedding layer."""
    return layers.Embedding(input_dim=input_dim, output_dim=output_dim, name=f"{name}_embedding")

# Create embedding layers
user_id_embedding = create_embedding_layer(num_users, 16, "user_id")
business_id_embedding = create_embedding_layer(num_businesses, 16, "business_id")
# city_embedding = create_embedding_layer(num_cities, 8, "city")


In [14]:
def user_tower(num_users, continuous_dim):
    # Inputs
    user_id_input = layers.Input(shape=(1,), name="user_id")
    user_continuous_input = layers.Input(shape=(continuous_dim,), name="user_continuous")

    # Embedding
    user_id_embedded = user_id_embedding(user_id_input)
    user_id_embedded = layers.Flatten()(user_id_embedded)

    # Combine
    concat = layers.Concatenate()([user_id_embedded, user_continuous_input])
    x = layers.Dense(64, activation='relu')(concat)
    x = layers.Dense(32, activation='relu')(x)
    user_embedding = layers.Dense(16, activation=None, name="user_embedding")(x)

    return Model([user_id_input, user_continuous_input], user_embedding, name="UserTower")


In [15]:
def item_tower(num_businesses, continuous_dim):
    # Inputs
    business_id_input = layers.Input(shape=(1,), name="business_id")
    business_continuous_input = layers.Input(shape=(continuous_dim,), name="business_continuous")

    # Embedding
    business_id_embedded = business_id_embedding(business_id_input)
    business_id_embedded = layers.Flatten()(business_id_embedded)

    # Combine
    concat = layers.Concatenate()([business_id_embedded, business_continuous_input])
    x = layers.Dense(64, activation='relu')(concat)
    x = layers.Dense(32, activation='relu')(x)
    business_embedding = layers.Dense(16, activation=None, name="business_embedding")(x)

    return Model([business_id_input, business_continuous_input], business_embedding, name="ItemTower")


In [21]:
# Instantiate towers
user_model = user_tower(num_users, user_continuous_features_scaled.shape[1])
item_model = item_tower(num_businesses, business_continuous_features_scaled.shape[1])

# Inputs
user_inputs = [user_df['user_id_encoded'], user_continuous_features_scaled]
business_inputs = [business_df['business_id_encoded'], business_continuous_features_scaled]

# Get embeddings
user_emb = user_model(user_inputs)
business_emb = item_model(business_inputs)

# Dot product model
dot_product = tf.keras.layers.Dot(axes=-1)([user_model.output, item_model.output])
model = tf.keras.Model(inputs=[user_model.input, item_model.input], outputs=dot_product)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
review_df = yelp_data['review']

# Create labels for review data
review_df['label'] = (review_df['stars'] >= 4).astype(int)


In [35]:
train_df, test_df = train_test_split(review_df, test_size=0.2, random_state=42)

In [None]:
# def safe_transform(label_encoder, ids, default=-1):
#     """Safely transform IDs, assigning a default value to unseen IDs."""
#     known_ids = set(label_encoder.classes_)
#     return [label_encoder.transform([id_])[0] if id_ in known_ids else default for id_ in ids]

# # Safely encode IDs for train_df and test_df
# train_df['user_id_encoded'] = safe_transform(user_id_encoder, train_df['user_id'])
# train_df['business_id_encoded'] = safe_transform(business_id_encoder, train_df['business_id'])

# test_df['user_id_encoded'] = safe_transform(user_id_encoder, test_df['user_id'])
# test_df['business_id_encoded'] = safe_transform(business_id_encoder, test_df['business_id'])

# # Filter out rows where default (-1) was assigned (optional)
# train_df = train_df[(train_df['user_id_encoded'] != -1) & (train_df['business_id_encoded'] != -1)]
# test_df = test_df[(test_df['user_id_encoded'] != -1) & (test_df['business_id_encoded'] != -1)]


In [36]:
# Get all encoded user and business IDs
known_user_ids = set(user_id_encoder.classes_)
known_business_ids = set(business_id_encoder.classes_)

# Filter out rows with unseen IDs in train_df
train_df = train_df[
    train_df['user_id'].isin(known_user_ids) & 
    train_df['business_id'].isin(known_business_ids)
]

# Filter out rows with unseen IDs in test_df
test_df = test_df[
    test_df['user_id'].isin(known_user_ids) & 
    test_df['business_id'].isin(known_business_ids)
]

# Encode remaining IDs
train_df['user_id_encoded'] = user_id_encoder.transform(train_df['user_id'])
train_df['business_id_encoded'] = business_id_encoder.transform(train_df['business_id'])

test_df['user_id_encoded'] = user_id_encoder.transform(test_df['user_id'])
test_df['business_id_encoded'] = business_id_encoder.transform(test_df['business_id'])


In [39]:
# Training inputs
train_user_inputs = [
    train_df['user_id_encoded'].values,
    user_scaler.transform(user_df.loc[train_df['user_id_encoded'], user_continuous_features.columns])
]
train_item_inputs = [
    train_df['business_id_encoded'].values,
    business_scaler.transform(business_df.loc[train_df['business_id_encoded'], business_continuous_features.columns])
]

# Testing inputs
test_user_inputs = [
    test_df['user_id_encoded'].values,
    user_scaler.transform(user_df.loc[test_df['user_id_encoded'], user_continuous_features.columns])
]
test_item_inputs = [
    test_df['business_id_encoded'].values,
    business_scaler.transform(business_df.loc[test_df['business_id_encoded'], business_continuous_features.columns])
]

# Labels
train_labels = train_df['label'].values
test_labels = test_df['label'].values


In [40]:
# Train the model
model.fit(
    [train_user_inputs, train_item_inputs],
    train_labels,
    validation_data=([test_user_inputs, test_item_inputs], test_labels),
    epochs=10,
    batch_size=64
)


Epoch 1/10




[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 14ms/step - accuracy: 0.6515 - loss: 0.7345 - val_accuracy: 0.7209 - val_loss: 0.5724
Epoch 2/10
[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 15ms/step - accuracy: 0.7499 - loss: 0.5562 - val_accuracy: 0.7226 - val_loss: 0.5792
Epoch 3/10
[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 15ms/step - accuracy: 0.7783 - loss: 0.5033 - val_accuracy: 0.7203 - val_loss: 0.5949
Epoch 4/10
[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 16ms/step - accuracy: 0.7950 - loss: 0.4855 - val_accuracy: 0.7175 - val_loss: 0.5672
Epoch 5/10
[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 15ms/step - accuracy: 0.8067 - loss: 0.4651 - val_accuracy: 0.7155 - val_loss: 0.6242
Epoch 6/10
[1m12256/12256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 16ms/step - accuracy: 0.8056 - loss: 0.4761 - val_accuracy: 0.4032 - val_loss: 0.87

<keras.src.callbacks.history.History at 0x225dc2b5210>

In [42]:
# Predict similarity on test data
test_predictions = model.predict([test_user_inputs, test_item_inputs])

# Convert cosine similarity to binary predictions
predicted_labels = (test_predictions > 0.5).astype(int)

# Evaluate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Test Accuracy: {accuracy}")


[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step
Test Accuracy: 0.7026580445115359


In [None]:
# Save the entire model
model.save('user_item_similarity_model.h5')

# Save user tower
user_model.save('user_tower_model.h5')

# Save item tower
item_model.save('item_tower_model.h5')

In [None]:
import joblib

# Save label encoders for user_id and business_id
joblib.dump(user_id_encoder, 'user_id_encoder.pkl')
joblib.dump(business_id_encoder, 'business_id_encoder.pkl')