### Deep Structured Semantic Model - Retrieval
This notebook demonstrates how to use the trained DSSM model to retrieve the top-k most similar businesses for a given user, based on the user and business features. **It also serves as a quick check that all files are in place and the model is working as expected.**

#### Pre-requisites
1. Have the processed Yelp dataset in the `../../data/processed_data/yelp_data` folder.
2. Have the virtual environment set up and selected as the kernel for this notebook.
3. Have all the files in the `./production` folder.

In [None]:
import numpy as np
import pandas as pd
import pickle
import faiss
from tensorflow.keras.models import load_model
from sklearn.preprocessing import normalize

In [None]:
save_folder_path = "production/"


def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``save_folder_path`` and return the object.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only use it on artifacts produced by a trusted pipeline.
    """
    with open(save_folder_path + file_name, 'rb') as f:
        return pickle.load(f)


# Business IDs; `query_top_k` indexes this array with the row numbers
# returned by the Faiss search.
business_ids = np.load(save_folder_path + "business_ids.npy")

# Faiss index read back from disk
faiss_index = faiss.read_index(save_folder_path + "faiss_index.bin")

# Keras model used to embed a user
user_model = load_model(save_folder_path + 'user_model.keras')

# Label encoders for user and business IDs
user_id_encoder = _load_pickle('user_id_encoder.pkl')
business_id_encoder = _load_pickle('business_id_encoder.pkl')

# Scaler for the user continuous features
user_scaler = _load_pickle('user_scaler.pkl')

# Saved user continuous features (temporary solution)
user_continuous_features_scaled = _load_pickle('user_continuous_features_scaled.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
def query_top_k(user_id, user_model, faiss_index, business_ids, k=100):
    """Return up to ``k`` business IDs, with their Faiss distances, for a user.

    Besides its arguments, this function reads the notebook-level objects
    ``user_id_encoder``, ``user_scaler`` and
    ``user_continuous_features_scaled`` loaded in an earlier cell.

    Args:
        user_id: Raw user identifier; must be one of
            ``user_id_encoder.classes_``.
        user_model: Model whose ``predict`` accepts
            ``[encoded_id_array, continuous_features]`` and returns the user
            embedding.
        faiss_index: Index whose ``search(queries, k)`` returns
            ``(distances, indices)``.
        business_ids: Array indexed by the row numbers that
            ``faiss_index.search`` returns.
        k: Maximum number of neighbours to retrieve; must be at least 1.

    Returns:
        A tuple ``(top_k_business_ids, distances)`` of 1-D arrays in the order
        Faiss returned them. Both are shorter than ``k`` when the index yields
        fewer than ``k`` neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``user_id`` is unknown to
            ``user_id_encoder``.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Check if the user_id is in the user_id_encoder
    if user_id not in user_id_encoder.classes_:
        raise ValueError("User ID is not in the encoder")

    # Encode user_id and get continuous features
    user_id_encoded = user_id_encoder.transform([user_id])[0]
    # NOTE(review): the frame is named "..._scaled" yet it is passed through
    # `user_scaler.transform` again -- confirm this is not double scaling.
    user_cont_features = user_scaler.transform(
        user_continuous_features_scaled.loc[[user_id_encoded]].values
    )

    # Predict the user's embedding
    user_embedding = user_model.predict([np.array([user_id_encoded]), user_cont_features])
    user_embedding_normalized = normalize(user_embedding, axis=1)

    # Perform ANN search using Faiss
    distances, indices = faiss_index.search(user_embedding_normalized, k)
    flat_indices = indices.flatten()
    flat_distances = distances.flatten()

    # Faiss pads missing neighbours with index -1. Indexing `business_ids`
    # with -1 would silently return its last element, so drop those slots.
    found = flat_indices >= 0

    # Return top-k businesses and distances
    top_k_business_ids = business_ids[flat_indices[found]]
    return top_k_business_ids, flat_distances[found]


In [None]:
# Example lookup -- swap in any user_id known to the encoder
user_id = "9HQLEChkam3GMBQn0SmvVw"
top_k_business_ids, scores = query_top_k(
    user_id,
    user_model,
    faiss_index,
    business_ids,
    k=100,
)

# Map the encoded business IDs back to their original string form
decoded_business_ids = business_id_encoder.inverse_transform(top_k_business_ids)

# One row per retrieved business, in the order returned by the search
result_df = pd.DataFrame(
    {'business_id': decoded_business_ids, 'similarity_score': scores}
)

print(result_df)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404ms/step
               business_id  similarity_score
0   -qtgI1xDDSqxtTtPn3ERHw          0.864202
1   CeQtgiR1EuGedqwh1uyLQQ          0.861182
2   atZ_olNKXOG4rEr6mccN8g          0.858346
3   kfW3-LmZlKrXq3RndVXxdg          0.848915
4   E8NgBaDyaVPWxmyDvHSP0g          0.848732
..                     ...               ...
95  qFsh80AAL90tkOc0n98bqg          0.767803
96  3CFVBCfjdCvESS1ogBv21A          0.767122
97  2Wmvi5-7LS1iw5UkOuLWlw          0.766358
98  Qe721w_WLS88SnBcu37ngg          0.766280
99  DNMDGalFejExZqwb_YVQnQ          0.765853

[100 rows x 2 columns]
