This is a POC for predicting user and business interest using the paper from Facebook. It is using Triplet Hinge Loss as the loss function. The goal is to separate the positive and negative samples by a margin, so that the positive samples are closer to the anchor than the negative samples. 

## Improvement Plan
### About the Model
1. Categorical features will be one-hot encoded/embedded.
2. Continuous features will be normalized and handle null value.
3. Default values will be set for unseen values in the embedding layer.
4. Model will be tuned and optimized.
5. More features will be added (e.g. review content, text features, etc.)
6. Sampling bias will be discovered
7. Adjust the dimension of each layer
### About using the Model
1. The model details will be saved and loaded.
2. How to use the model will be explained.
3. The model will be used in a web application.

In [1]:
from general_program import *

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [2]:
categories_encoder = LabelEncoder()
categories_encoder.fit(list(unique_categories))
user_id_encoder = LabelEncoder()
business_id_encoder = LabelEncoder()

user_scaler = StandardScaler()
business_scaler = StandardScaler()

In [3]:
user_df, business_df, review_df, user_continuous_features_scaled, business_continuous_features_scaled, num_users, num_businesses, num_categories = prepare_data(user_df, business_df, review_df, categories_df, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['user_id_encoded'] = user_id_encoder.transform(review_df['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['business_id_encoded'] = business_id_encoder.transform(review_df['business_id'])


In [4]:
# # Example: Encode 'city' as a discrete feature for businesses
# business_city_encoder = LabelEncoder()
# business_df['city_encoded'] = business_city_encoder.fit_transform(business_df['city'])

# # Save number of unique cities for embedding input_dim
# num_cities = business_df['city_encoded'].max() + 1

In [5]:
def create_embedding_layer(input_dim, output_dim, name):
    """Reusable function to create an embedding layer."""
    return layers.Embedding(input_dim=input_dim, output_dim=output_dim, name=f"{name}_embedding")

# Create embedding layers
user_id_embedding = create_embedding_layer(num_users, 16, "user_id")
business_id_embedding = create_embedding_layer(num_businesses, 16, "business_id")
category_embedding = create_embedding_layer(num_categories, 16, "category")

In [6]:
# Example aggregation function for TensorFlow (Mean Pooling)
def aggregate_category_embeddings(category_indices):
    category_indices = tf.constant(category_indices, dtype=tf.int32)
    embeddings = category_embedding(category_indices)
    return tf.reduce_mean(embeddings, axis=0)

In [7]:
def user_tower(continuous_dim):
    # Inputs
    user_id_input = layers.Input(shape=(1,), name="user_id")
    user_continuous_input = layers.Input(shape=(continuous_dim,), name="user_continuous")

    # Embedding
    user_id_embedded = user_id_embedding(user_id_input)
    user_id_embedded = layers.Flatten()(user_id_embedded)

    # Combine
    concat = layers.Concatenate()([user_id_embedded, user_continuous_input])
    x = layers.Dense(64, activation='relu')(concat)
    x = layers.Dense(32, activation='relu')(x)
    user_embedding = layers.Dense(16, activation=None, name="user_embedding")(x)

    return Model([user_id_input, user_continuous_input], user_embedding, name="UserTower")


In [8]:
def item_tower(continuous_dim):
    # Inputs
    business_id_input = layers.Input(shape=(1,), name="business_id")
    business_continuous_input = layers.Input(shape=(continuous_dim,), name="business_continuous")

    # Embedding
    business_id_embedded = business_id_embedding(business_id_input)
    business_id_embedded = layers.Flatten()(business_id_embedded)

    category_input = layers.Input(shape=(None,), dtype="int32", name="category_indices")  # Variable-length input
    category_embeddings = category_embedding(category_input)
    # aggregated_category_embedding = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1), name="category_pooling", output_shape=(16,))(category_embeddings)
    aggregated_category_embedding = CategoryPoolingLayer(name="category_pooling")(category_embeddings)

    # Combine
    concat = layers.Concatenate()([business_id_embedded, aggregated_category_embedding, business_continuous_input])

    x = layers.Dense(64, activation='relu')(concat)
    x = layers.Dense(32, activation='relu')(x)
    business_embedding = layers.Dense(16, activation=None, name="business_embedding")(x)

    return Model([business_id_input, category_input, business_continuous_input], business_embedding, name="ItemTower")


In [9]:
# Triplet loss function
def triplet_hinge_loss(margin=1.0):
    def loss(y_true, y_pred):
        # y_pred shape: (batch_size, 3, embedding_dim)
        anchor, positive, negative = tf.unstack(y_pred, num=3, axis=1)
        
        # Compute pairwise distances
        pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
        
        # Hinge loss: max(0, pos_dist - neg_dist + margin)
        return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + margin, 0.0))
    return loss

In [10]:
# Function to generate triplets
def generate_triplets(review_df, num_neg_samples=1):
    triplets = []
    grouped = review_df.groupby('user_id_encoded')

    for user_id, group in grouped:
        positive_samples = group[group['label'] == 1]
        negative_samples = group[group['label'] == 0]
        
        if positive_samples.empty or negative_samples.empty:
            continue  # Skip users without both positive and negative samples
        
        for _, pos_row in positive_samples.iterrows():
            for _, neg_row in negative_samples.sample(num_neg_samples, replace=True).iterrows():
                triplets.append((
                    user_id,
                    pos_row['business_id_encoded'],
                    neg_row['business_id_encoded']
                ))
    
    return np.array(triplets)

In [11]:
# Split review_df into train and test sets
train_df, test_df = train_test_split(review_df, test_size=0.2, random_state=42)

# Generate triplets for training and testing
train_triplets = generate_triplets(train_df)

In [12]:
# Prepare train and test inputs
def prepare_triplet_inputs(triplets, user_features, business_features, business_category_map, max_category_length=MAX_CATEGORY_LENGTH):
    # Replace NaN values with empty lists in `business_category_map`
    business_category_map = business_category_map.apply(lambda x: x if isinstance(x, list) else [])

    anchor_indices = triplets[:, 0]
    positive_indices = triplets[:, 1]
    negative_indices = triplets[:, 2]

    anchor_features = [anchor_indices, user_features.take(anchor_indices, axis=0).values]
    positive_features = [
        positive_indices,
        pad_sequences(business_category_map.loc[positive_indices].tolist(), maxlen=max_category_length, padding="post"),
        business_features.take(positive_indices, axis=0).values, 

]
    negative_features = [
        negative_indices, 
        pad_sequences(business_category_map.loc[negative_indices].tolist(), maxlen=max_category_length, padding="post"),
        business_features.take(negative_indices, axis=0).values,
    ]

    return [
        anchor_features[0], anchor_features[1],
        positive_features[0], positive_features[1], positive_features[2],
        negative_features[0], negative_features[1], negative_features[2]
    ]

business_category_map = business_df.set_index('business_id_encoded')['category_encoded']

max_category_length = MAX_CATEGORY_LENGTH

train_inputs = prepare_triplet_inputs(train_triplets, user_continuous_features_scaled, business_continuous_features_scaled, business_category_map, max_category_length)

In [13]:
# Instantiate towers
user_model = user_tower(user_continuous_features_scaled.shape[1])
item_model = item_tower(business_continuous_features_scaled.shape[1])

# Define inputs for user and business towers
user_inputs_model = [Input(shape=(1,), dtype=tf.int32, name="user_id_input"),
                     Input(shape=(user_continuous_features_scaled.shape[1],), name="user_cont_features_input")]

positive_inputs_model = [
    Input(shape=(1,), dtype=tf.int32, name="positive_id_input"),
    Input(shape=(max_category_length,), dtype=tf.int32, name="positive_category_input"),
    Input(shape=(business_continuous_features_scaled.shape[1],), name="positive_cont_features_input")
]

negative_inputs_model = [
    Input(shape=(1,), dtype=tf.int32, name="negative_id_input"),
    Input(shape=(max_category_length,), dtype=tf.int32, name="negative_category_input"),
    Input(shape=(business_continuous_features_scaled.shape[1],), name="negative_cont_features_input")
]

# Generate embeddings
anchor_embedding = user_model(user_inputs_model)
positive_embedding = item_model(positive_inputs_model)
negative_embedding = item_model(negative_inputs_model)




In [14]:
def stack_embeddings(embeddings):
    # Unpack the embeddings from the list
    anchor, positive, negative = embeddings
    return tf.stack([anchor, positive, negative], axis=1)

triplet_embeddings = Lambda(stack_embeddings, name="triplet_embeddings")(
    [anchor_embedding, positive_embedding, negative_embedding]
)

In [15]:
all_inputs = user_inputs_model + positive_inputs_model + negative_inputs_model
# Build the model
triplet_model = Model(
    inputs= all_inputs,
    outputs=triplet_embeddings,
    name="triplet_model"
)

# Compile with triplet loss
triplet_model.compile(
    optimizer='adam',
    loss=triplet_hinge_loss(margin=0.2)
)

In [16]:
# Train the triplet model
triplet_model.fit(
    x=train_inputs,
    y=np.zeros(len(train_inputs[0])),  # Dummy labels, as loss is computed from embeddings
    batch_size=32,
    # epochs=10,
    epochs=3,
    verbose=1
)


Epoch 1/3
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 14ms/step - loss: 0.1547
Epoch 2/3
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 14ms/step - loss: 0.0704
Epoch 3/3
[1m12867/12867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 14ms/step - loss: 0.0403


<keras.src.callbacks.history.History at 0x19f02d648d0>

In [17]:
save_folder_path = 'Saved_Triplet_Hinge_Loss/'

# Save the models
user_model.save(save_folder_path + 'user_model.keras')
item_model.save(save_folder_path + 'item_model.keras')

# Save the label encoders
with open(save_folder_path + 'user_id_encoder.pkl', 'wb') as f:
    pickle.dump(user_id_encoder, f)

with open(save_folder_path + 'business_id_encoder.pkl', 'wb') as f:
    pickle.dump(business_id_encoder, f)

with open(save_folder_path + 'categories_encoder.pkl', 'wb') as f:
    pickle.dump(categories_encoder, f)
    
# Save the scalers
with open(save_folder_path + 'user_scaler.pkl', 'wb') as f:
    pickle.dump(user_scaler, f)

with open(save_folder_path + 'business_scaler.pkl', 'wb') as f:
    pickle.dump(business_scaler, f)

### The following section will load the saved models and compute the predictions.

In [18]:
user_model, item_model, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler = load_saved_models()

In [19]:
# Step 1: Filter test data
test_df = test_df[
    (test_df['user_id'].isin(user_id_encoder.classes_)) & 
    (test_df['business_id'].isin(business_id_encoder.classes_))
]

# Encode user_id and business_id using the loaded encoders
test_df['user_id_encoded'] = user_id_encoder.transform(test_df['user_id'])
test_df['business_id_encoded'] = business_id_encoder.transform(test_df['business_id'])

# Step 2: Prepare inputs for user and business embeddings
# Extract user features
test_user_ids = test_df['user_id_encoded'].values
test_user_cont_features = user_scaler.transform(user_continuous_features_scaled.loc[test_user_ids].values)

# Extract business features
test_business_ids = test_df['business_id_encoded'].values
test_business_cont_features = business_scaler.transform(business_continuous_features_scaled.loc[test_business_ids].values)
test_business_categories = business_category_map.loc[test_business_ids].apply(
    lambda x: x if isinstance(x, list) else []
)
test_business_category_padded = pad_sequences(test_business_categories.tolist(), maxlen=5, padding="post")



In [20]:

# Step 3: Predict embeddings using the loaded models
test_user_embeddings = user_model.predict([test_user_ids, test_user_cont_features])
test_business_embeddings = item_model.predict([test_business_ids, test_business_category_padded, test_business_cont_features])

# Step 4: Compute cosine similarity for each user-business pair
def compute_cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Create a list of cosine similarities for each record in test_df
test_df['predicted_similarity'] = [
    compute_cosine_similarity(test_user_embeddings[i], test_business_embeddings[i])
    for i in range(len(test_df))
]

# Step 5: Set the predicted_label based on similarity score
test_df['predicted_label'] = (test_df['predicted_similarity'] >= 0).astype(int)

[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step
[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step


In [21]:

# Step 6: Evaluate the performance
accuracy = (test_df['label'] == test_df['predicted_label']).mean()
print(f"Accuracy: {accuracy:.2f}")

# Compute AUC
auc = roc_auc_score(test_df['label'], test_df['predicted_similarity'])
print(f"AUC: {auc:.2f}")

# Compute precision and recall
precision = precision_score(test_df['label'], test_df['predicted_label'])
recall = recall_score(test_df['label'], test_df['predicted_label'])

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Compute F1 score
f1 = 2 * (precision * recall) / (precision + recall)
print(f"F1 Score: {f1:.2f}")

# Compute confusion matrix
conf_matrix = confusion_matrix(test_df['label'], test_df['predicted_label'])
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.59
AUC: 0.52
Precision: 0.69
Recall: 0.74
F1 Score: 0.72
Confusion Matrix:
[[ 15472  44398]
 [ 35633 100581]]
