In [1]:
from general_program import *
import faiss
from sklearn.preprocessing import normalize

Loaded 78059 rows from business_details table.
Loaded 360656 rows from business_categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [2]:
user_model, item_model, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler = load_saved_models(save_folder_path='Saved_Triplet_Hinge_Loss (ver0.1)/')




In [3]:
user_df, business_df, review_df, user_continuous_features_scaled, business_continuous_features_scaled, num_users, num_businesses, num_categories = prepare_data(user_df, business_df, review_df, categories_df, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler, use_stage='test')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['user_id_encoded'] = user_id_encoder.transform(review_df['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['business_id_encoded'] = business_id_encoder.transform(review_df['business_id'])


In [4]:
business_category_map = business_df.set_index('business_id_encoded')['category_encoded']

In [5]:
# Step 1: Prepare the Faiss index for business embeddings
def create_faiss_index(item_model, business_ids, business_cont_features, business_category_map, max_category_length=MAX_CATEGORY_LENGTH):
    business_categories = business_category_map.loc[business_ids].apply(
        lambda x: x if isinstance(x, list) else []
    )
    business_category_padded = pad_sequences(business_categories.tolist(), maxlen=max_category_length, padding="post")

    # Predict embeddings
    business_embeddings = item_model.predict([business_ids, business_category_padded, business_cont_features])

    business_embeddings_normalized = normalize(business_embeddings, axis=1)
    # Create a Faiss index for cosine similarity (using inner product)
    index = faiss.IndexFlatIP(business_embeddings_normalized.shape[1])  # Assuming 16-dimensional embeddings
    index.add(business_embeddings_normalized)
    return index, business_embeddings_normalized

business_ids = business_continuous_features_scaled.index.values
faiss_index, business_embeddings_normalized = create_faiss_index(
    item_model, business_ids, 
    business_continuous_features_scaled.values, 
    business_category_map
)

[1m2440/2440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step


In [6]:
# Step 2: Query top-k businesses for a given user
def query_top_k(user_id, user_model, faiss_index, k=5):
    # Encode user_id and get continuous features
    user_id_encoded = user_id_encoder.transform([user_id])[0]
    user_cont_features = user_scaler.transform(
        user_continuous_features_scaled.loc[[user_id_encoded]].values
    )

    # Predict and normalize the user's embedding
    user_embedding = user_model.predict([np.array([user_id_encoded]), user_cont_features])
    user_embedding_normalized = normalize(user_embedding, axis=1)

    # Perform ANN search using Faiss
    distances, indices = faiss_index.search(user_embedding_normalized, k)

    # Return top-k businesses and distances
    top_k_business_ids = business_ids[indices.flatten()]
    return top_k_business_ids, distances.flatten()

In [7]:
# Step 3: Example usage
user_id = "9HQLEChkam3GMBQn0SmvVw"  # Replace with an actual user_id from your dataset
top_k_business_ids, scores = query_top_k(user_id, user_model, faiss_index, k=300)

# Decode business IDs back to their original format
decoded_business_ids = business_id_encoder.inverse_transform(top_k_business_ids)
result_df = pd.DataFrame({
    'business_id': decoded_business_ids,
    'similarity_score': scores
})

print(result_df)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
                business_id  similarity_score
0    -qtgI1xDDSqxtTtPn3ERHw          0.864202
1    CeQtgiR1EuGedqwh1uyLQQ          0.861182
2    atZ_olNKXOG4rEr6mccN8g          0.858346
3    kfW3-LmZlKrXq3RndVXxdg          0.848915
4    E8NgBaDyaVPWxmyDvHSP0g          0.848732
..                      ...               ...
295  1_ZVtdiZpBNsXaO4ObPtbw          0.709518
296  5mDneH5wP5VN_7GVe-PudQ          0.709467
297  UMHuKs1sO-wq3XqKaejXeA          0.709385
298  CBWmYHLgtFrOJs7SCcQn0g          0.709275
299  JX8KhTInMNfQVs4Fn8mSSQ          0.709046

[300 rows x 2 columns]


In [9]:
save_folder = "production/"
# Save the Faiss index to a file
faiss.write_index(faiss_index, save_folder+"faiss_index.bin")

# Save business IDs
np.save(save_folder+"business_ids.npy", business_ids)

# Save user continuous features (temporal solution)
with open(save_folder + "user_continuous_features_scaled.pkl", "wb") as f:
    pickle.dump(user_continuous_features_scaled, f)