This is currently functioning as a POC. It is not optimized. Here are the plans for the future:
1. Categorical features will be one-hot encoded/embedded.
2. Continuous features will be normalized and handle null value.
3. Default values will be set for unseen values in the embedding layer.
4. Model will be tuned and optimized.
5. More features will be added (e.g. review content, text features, etc.)

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Define the database folder path and file names
db_folder = '../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db', 'yelp_user_data.db', 'yelp_tip_data.db']
db_paths = [db_folder + db_file for db_file in db_files]

In [3]:
# Connect to the databases and load data
def load_data_from_db():
    data = {}
    
    # Open connections and read tables
    conns = [sqlite3.connect(db_path) for db_path in db_paths]
    try:
        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
        data['user'] = pd.read_sql_query("SELECT * FROM user_data", conns[2])
        data['tip'] = pd.read_sql_query("SELECT * FROM tip_data", conns[3])
        
    finally:
        # Close all database connections
        for conn in conns:
            conn.close()

    return data

In [4]:
# Load data into a dictionary
yelp_data = load_data_from_db()

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [5]:
# Preprocess user data
user_df = yelp_data['user']
user_df['yelping_since'] = pd.to_datetime(user_df['yelping_since'])

# Example: Extract numerical features for embedding
user_features = user_df[['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']].fillna(0)

# Preprocess business data
business_df = yelp_data['business']
business_df['is_open'] = business_df['is_open'].fillna(0).astype(int)

# Example: Extract numerical features
business_features = business_df[['stars', 'review_count', 'latitude', 'longitude']].fillna(0)


In [6]:
# Encode user_id and business_id
user_id_encoder = LabelEncoder()
business_id_encoder = LabelEncoder()

user_df['user_id_encoded'] = user_id_encoder.fit_transform(user_df['user_id'])
business_df['business_id_encoded'] = business_id_encoder.fit_transform(business_df['business_id'])

# Save number of unique users and businesses for embedding input_dim
num_users = user_df['user_id_encoded'].max() + 1
num_businesses = business_df['business_id_encoded'].max() + 1


In [None]:
# # Example: Encode 'city' as a discrete feature for businesses
# business_city_encoder = LabelEncoder()
# business_df['city_encoded'] = business_city_encoder.fit_transform(business_df['city'])

# # Save number of unique cities for embedding input_dim
# num_cities = business_df['city_encoded'].max() + 1


In [None]:
# Standardize user continuous features
user_continuous_features = user_df[['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']].fillna(0)
user_scaler = StandardScaler()
user_continuous_features_scaled = user_scaler.fit_transform(user_continuous_features)

# Standardize business continuous features
business_continuous_features = business_df[['stars', 'review_count', 'latitude', 'longitude']].fillna(0)
business_scaler = StandardScaler()
business_continuous_features_scaled = business_scaler.fit_transform(business_continuous_features)

# Ensure continuous features are pandas DataFrames
user_continuous_features_scaled = pd.DataFrame(user_continuous_features_scaled, index=user_df['user_id_encoded'])
business_continuous_features_scaled = pd.DataFrame(business_continuous_features_scaled, index=business_df['business_id_encoded'])

In [8]:
def create_embedding_layer(input_dim, output_dim, name):
    """Reusable function to create an embedding layer."""
    return layers.Embedding(input_dim=input_dim, output_dim=output_dim, name=f"{name}_embedding")

# Create embedding layers
user_id_embedding = create_embedding_layer(num_users, 16, "user_id")
business_id_embedding = create_embedding_layer(num_businesses, 16, "business_id")
# city_embedding = create_embedding_layer(num_cities, 8, "city")


In [None]:
review_df = yelp_data['review']

# Create labels for review data
review_df['label'] = (review_df['stars'] >= 4).astype(int)

# Filter out unseen user_id and business_id
review_df = review_df[
    (review_df['user_id'].isin(user_id_encoder.classes_)) & 
    (review_df['business_id'].isin(business_id_encoder.classes_))
]

# Encode user_id and business_id
review_df['user_id_encoded'] = user_id_encoder.transform(review_df['user_id'])
review_df['business_id_encoded'] = business_id_encoder.transform(review_df['business_id'])

In [9]:
def user_tower(continuous_dim):
    # Inputs
    user_id_input = layers.Input(shape=(1,), name="user_id")
    user_continuous_input = layers.Input(shape=(continuous_dim,), name="user_continuous")

    # Embedding
    user_id_embedded = user_id_embedding(user_id_input)
    user_id_embedded = layers.Flatten()(user_id_embedded)

    # Combine
    concat = layers.Concatenate()([user_id_embedded, user_continuous_input])
    x = layers.Dense(64, activation='relu')(concat)
    x = layers.Dense(32, activation='relu')(x)
    user_embedding = layers.Dense(16, activation=None, name="user_embedding")(x)

    return Model([user_id_input, user_continuous_input], user_embedding, name="UserTower")


In [10]:
def item_tower(continuous_dim):
    # Inputs
    business_id_input = layers.Input(shape=(1,), name="business_id")
    business_continuous_input = layers.Input(shape=(continuous_dim,), name="business_continuous")

    # Embedding
    business_id_embedded = business_id_embedding(business_id_input)
    business_id_embedded = layers.Flatten()(business_id_embedded)

    # Combine
    concat = layers.Concatenate()([business_id_embedded, business_continuous_input])
    x = layers.Dense(64, activation='relu')(concat)
    x = layers.Dense(32, activation='relu')(x)
    business_embedding = layers.Dense(16, activation=None, name="business_embedding")(x)

    return Model([business_id_input, business_continuous_input], business_embedding, name="ItemTower")


In [12]:
def safe_transform(encoder, values, default=-1):
    """
    Safely transform values using the encoder, assigning a default value for unseen labels.
    """
    try:
        return encoder.transform(values)
    except ValueError:
        # Use a mapping approach for unseen labels
        known_labels = set(encoder.classes_)
        transformed = [encoder.transform([v])[0] if v in known_labels else default for v in values]
        return np.array(transformed)

## The following part is for the Cosine Similarity and Binary Crossentropy. 

In [11]:
# Define cosine similarity layer
def cosine_similarity_layer():
    def cosine_sim(inputs):
        a, b = inputs
        a = tf.nn.l2_normalize(a, axis=-1)
        b = tf.nn.l2_normalize(b, axis=-1)
        return tf.reduce_sum(a * b, axis=-1, keepdims=True)
    return tf.keras.layers.Lambda(cosine_sim, name="CosineSimilarity")

In [15]:
# Instantiate towers
user_model = user_tower(user_continuous_features_scaled.shape[1])
item_model = item_tower(business_continuous_features_scaled.shape[1])

# Define inputs for user and business towers
user_inputs_model = [Input(shape=(1,), dtype=tf.int32, name="user_id_input"),
                     Input(shape=(user_continuous_features_scaled.shape[1],), name="user_cont_features_input")]

business_inputs_model = [Input(shape=(1,), dtype=tf.int32, name="business_id_input"),
                         Input(shape=(business_continuous_features_scaled.shape[1],), name="business_cont_features_input")]

# Get embeddings
user_embedding = user_model(user_inputs_model)
business_embedding = item_model(business_inputs_model)

# Compute cosine similarity
similarity = cosine_similarity_layer()([user_embedding, business_embedding])

# Define the full model
model = Model(inputs=user_inputs_model + business_inputs_model, outputs=similarity)


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
# Use indices from review_df to select rows
user_indices = review_df['user_id_encoded'].values
business_indices = review_df['business_id_encoded'].values

# Use .take or reindex based on indices
user_features = user_continuous_features_scaled.take(user_indices, axis=0)
business_features = business_continuous_features_scaled.take(business_indices, axis=0)

In [64]:
# Combine all inputs
all_user_inputs = [user_indices, user_features.values]
all_business_inputs = [business_indices, business_features.values]
all_labels = review_df['label'].values

# Split into training and testing sets
(
    train_user_indices, test_user_indices,
    train_business_indices, test_business_indices,
    train_user_features, test_user_features,
    train_business_features, test_business_features,
    train_labels, test_labels
) = train_test_split(
    user_indices, business_indices, user_features, business_features,
    all_labels, test_size=0.2, random_state=42
)

# Prepare training and testing inputs
train_user_inputs = [train_user_indices, train_user_features]
train_business_inputs = [train_business_indices, train_business_features]

test_user_inputs = [test_user_indices, test_user_features]
test_business_inputs = [test_business_indices, test_business_features]


In [47]:
model.fit(
    x=train_user_inputs + train_business_inputs,
    y=train_labels,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    verbose=1
)

Epoch 1/10
[1m  379/19609[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:31[0m 14ms/step - accuracy: 0.6995 - loss: 0.6196

KeyboardInterrupt: 

In [None]:
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(
    x=test_user_inputs + test_business_inputs,
    y=test_labels,
    verbose=1
)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

## The following part is for the Triplet Hinge Loss function. 

In [48]:
# Triplet loss function
def triplet_hinge_loss(margin=1.0):
    def loss(y_true, y_pred):
        # y_pred shape: (batch_size, 3, embedding_dim)
        anchor, positive, negative = tf.unstack(y_pred, num=3, axis=1)
        
        # Compute pairwise distances
        pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
        
        # Hinge loss: max(0, pos_dist - neg_dist + margin)
        return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + margin, 0.0))
    return loss

In [None]:
# Function to generate triplets
def generate_triplets(review_df, num_neg_samples=1):
    triplets = []
    grouped = review_df.groupby('user_id_encoded')

    for user_id, group in grouped:
        positive_samples = group[group['label'] == 1]
        negative_samples = group[group['label'] == 0]
        
        if positive_samples.empty or negative_samples.empty:
            continue  # Skip users without both positive and negative samples
        
        for _, pos_row in positive_samples.iterrows():
            for _, neg_row in negative_samples.sample(num_neg_samples, replace=True).iterrows():
                triplets.append((
                    user_id,
                    pos_row['business_id_encoded'],
                    neg_row['business_id_encoded']
                ))
    
    return np.array(triplets)

In [147]:
# Split review_df into train and test sets
train_df, test_df = train_test_split(review_df, test_size=0.2, random_state=42)

# Generate triplets for training and testing
train_triplets = generate_triplets(train_df)
test_triplets = generate_triplets(test_df)

In [148]:
# Prepare train and test inputs
def prepare_triplet_inputs(triplets, user_features, business_features):
    anchor_indices = triplets[:, 0]
    positive_indices = triplets[:, 1]
    negative_indices = triplets[:, 2]

    anchor_features = [anchor_indices, user_features.take(anchor_indices, axis=0).values]
    positive_features = [positive_indices, business_features.take(positive_indices, axis=0).values]
    negative_features = [negative_indices, business_features.take(negative_indices, axis=0).values]

    return [
        anchor_features[0], anchor_features[1],
        positive_features[0], positive_features[1],
        negative_features[0], negative_features[1]
    ]

train_inputs = prepare_triplet_inputs(train_triplets, user_continuous_features_scaled, business_continuous_features_scaled)
test_inputs = prepare_triplet_inputs(test_triplets, user_continuous_features_scaled, business_continuous_features_scaled)


In [149]:
# Instantiate towers
user_model = user_tower(user_continuous_features_scaled.shape[1])
item_model = item_tower(business_continuous_features_scaled.shape[1])

# Define inputs for user and business towers
user_inputs_model = [Input(shape=(1,), dtype=tf.int32, name="user_id_input"),
                     Input(shape=(user_continuous_features_scaled.shape[1],), name="user_cont_features_input")]

positive_inputs_model = [
    Input(shape=(1,), dtype=tf.int32, name="positive_id_input"),
    Input(shape=(business_continuous_features_scaled.shape[1],), name="positive_cont_features_input")
]

negative_inputs_model = [
    Input(shape=(1,), dtype=tf.int32, name="negative_id_input"),
    Input(shape=(business_continuous_features_scaled.shape[1],), name="negative_cont_features_input")
]

# Generate embeddings
anchor_embedding = user_model(user_inputs_model)
positive_embedding = item_model(positive_inputs_model)
negative_embedding = item_model(negative_inputs_model)

In [150]:
def stack_embeddings(embeddings):
    # Unpack the embeddings from the list
    anchor, positive, negative = embeddings
    return tf.stack([anchor, positive, negative], axis=1)

triplet_embeddings = Lambda(stack_embeddings, name="triplet_embeddings")(
    [anchor_embedding, positive_embedding, negative_embedding]
)

In [151]:
# Build the model
triplet_model = Model(
    inputs=[user_inputs_model, positive_inputs_model, negative_inputs_model],
    outputs=triplet_embeddings,
    name="triplet_model"
)

# Compile with triplet loss
triplet_model.compile(
    optimizer='adam',
    loss=triplet_hinge_loss(margin=0.5)
)

In [156]:
# Train the triplet model
triplet_model.fit(
    x=train_inputs,
    y=np.zeros(len(train_inputs[0])),  # Dummy labels, as loss is computed from embeddings
    batch_size=32,
    epochs=10,
    verbose=1
)


Epoch 1/10




[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 14ms/step - loss: 0.1827
Epoch 2/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 14ms/step - loss: 0.0696
Epoch 3/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 14ms/step - loss: 0.0439
Epoch 4/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 14ms/step - loss: 0.0314
Epoch 5/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 14ms/step - loss: 0.0245
Epoch 6/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 14ms/step - loss: 0.0199
Epoch 7/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 14ms/step - loss: 0.0166
Epoch 8/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 14ms/step - loss: 0.0143
Epoch 9/10
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 14ms/step - loss: 0.0130
Epoch 10/10
[1m12867/12867[0m [32m━━

<keras.src.callbacks.history.History at 0x1a009068810>

In [None]:
# Evaluate on test data
test_loss = triplet_model.evaluate(
    x=test_inputs,
    y=np.zeros(len(test_triplets)),  # Dummy labels
    batch_size=32,
    verbose=1
)
print("Test Loss:", test_loss)

# calculate the accuracy of the model
def evaluate_model(model, test_inputs):
    # Predict embeddings for test inputs
    test_embeddings = model.predict(test_inputs, batch_size=32)

    # Unstack embeddings
    anchor_embedding, positive_embedding, negative_embedding = test_embeddings[:, 0], test_embeddings[:, 1], test_embeddings[:, 2]

    # Compute cosine similarities
    positive_similarity = np.sum(anchor_embedding * positive_embedding, axis=1)
    negative_similarity = np.sum(anchor_embedding * negative_embedding, axis=1)

    # Compute accuracy
    correct = np.sum(positive_similarity > negative_similarity)
    total = len(test_inputs[0])
    return correct / total



[1m2230/2230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 827us/step - loss: 0.5697
Test Loss: 0.5699414014816284


In [None]:
# Function to get a user embedding
def get_user_embedding(user_id, user_features):
    user_input = [
        np.array([user_id]),  # user_id_encoded as an array (needs to be a batch)
        np.array([user_features])  # corresponding continuous features for the user
    ]
    user_embedding = user_model.predict(user_input)
    return user_embedding

# Function to get a business embedding
def get_business_embedding(business_id, business_features):
    business_input = [
        np.array([business_id]),  # business_id_encoded as an array (needs to be a batch)
        np.array([business_features])  # corresponding continuous features for the business
    ]
    business_embedding = item_model.predict(business_input)
    return business_embedding

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate similarity
def calculate_similarity(user_id, business_id, user_features, business_features):
    # Get user and business embeddings
    user_emb = get_user_embedding(user_id, user_features)
    business_emb = get_business_embedding(business_id, business_features)

    # Calculate cosine similarity
    similarity = cosine_similarity(user_emb, business_emb)
    return similarity[0][0]  # Since cosine_similarity returns a 2D array

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Cosine similarity between user 201210 and business 56080: -0.12256012856960297
Actual rating: 4.0


In [None]:
user_ids = test_inputs[0]

positive_business_ids = test_inputs[2]

negative_business_ids = test_inputs[4]

# Calculate similarities for the first 10 test examples
similarities = []
actual_ratings = []
for i in range(10):
    user_id = user_ids[i]

    user_features = user_continuous_features_scaled.iloc[user_id].values  # Continuous features of the user

    positive_business_id = positive_business_ids[i] 
    positive_business_features = business_continuous_features_scaled.iloc[positive_business_id].values  # Continuous features of the business

    negative_business_id = negative_business_ids[i]
    negative_business_features = business_continuous_features_scaled.iloc[negative_business_id].values  # Continuous features of the business

    # Calculate similarities
    positive_similarity = calculate_similarity(user_id, positive_business_id, user_features, positive_business_features)

    negative_similarity = calculate_similarity(user_id, negative_business_id, user_features, negative_business_features)

    similarities.append((positive_similarity, negative_similarity))

    # Get actual ratings
    actual_positive_rating = review_df[(review_df['user_id_encoded'] == user_id) & (review_df['business_id_encoded'] == positive_business_id)]['stars'].values[0]

    actual_negative_rating = review_df[(review_df['user_id_encoded'] == user_id) & (review_df['business_id_encoded'] == negative_business_id)]['stars'].values[0]

    actual_ratings.append((actual_positive_rating, actual_negative_rating))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17

In [177]:
# Calculate accuracy
def calculate_accuracy(similarities):
    correct = 0
    # calculate the number of correct predictions, if positive similarity is greater than 0 and negative similarity is less than 0, add the count
    for pos, neg in similarities:
        if pos > 0:
            correct += 1
        if neg < 0:
            correct += 1
    return correct / (2 * len(similarities))

accuracy = calculate_accuracy(similarities)
print(f"Accuracy: {accuracy}")

Accuracy: 0.55
