### Deep Structured Semantic Model - Retrieval Evaluation
This notebook is used to simulate the performance of the of the DSSM model in a retrieval setting. 

#### Pre-requisites
- The model is trained and the index is created in the notebooks `DSSM Model.ipynb` and `DSSM Index (Faiss).ipynb`.
- All required files are saved in the `Saved_Triplet_Hinge_Loss` folder.

In [1]:
from general_program import *
import faiss
import warnings
warnings.filterwarnings("ignore")

Loaded 78059 rows from business_details table.
Loaded 360656 rows from business_categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [2]:
save_folder_path = "Saved_Triplet_Hinge_Loss/"

user_model, item_model, user_id_encoder, business_id_encoder, categories_encoder, business_geohash_encoder, user_scaler, business_scaler = load_saved_models(save_folder_path=save_folder_path)

# Load the Faiss index from the file
faiss_index = faiss.read_index(save_folder_path+ "faiss_index.bin")

# Load the business_ids from the file
business_ids = np.load(save_folder_path + "business_ids.npy")





In [3]:
user_df, business_df, review_df, user_continuous_features_scaled, business_continuous_features_scaled, num_users, num_businesses, num_categories, num_geohashes = prepare_data(user_df, business_df, review_df, categories_df, user_id_encoder, business_id_encoder, categories_encoder, business_geohash_encoder, user_scaler, business_scaler, use_stage='test')

In [4]:
# Split review_df into train and test sets
train_data, test_data = train_test_split(review_df, test_size=0.2, random_state=42)

# Split the test set into positive and negative samples
positive_reviews = test_data[test_data['stars'] >= 4]
negative_reviews = test_data[test_data['stars'] < 4]

print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 135929
Number of negative reviews: 60155
Total number of reviews: 196084
Ratio of positive to negative reviews: 2.26


In [5]:
def balance_test_data(positive_reviews, negative_reviews):
    # Ensure positive reviews can be downsampled without error
    if len(positive_reviews) >= len(negative_reviews):
        positive_reviews_downsampled = positive_reviews.sample(n=len(negative_reviews), random_state=42)
    else:
        print("Warning: More negative reviews than positive ones. Keeping all positives.")
        positive_reviews_downsampled = positive_reviews  # No downsampling if already balanced or reversed

    # Combine and shuffle using NumPy for better performance
    balanced_test_data = pd.concat([positive_reviews_downsampled, negative_reviews], ignore_index=True)
    balanced_test_data = balanced_test_data.iloc[np.random.permutation(len(balanced_test_data))].reset_index(drop=True)

    # Print final stats
    print(f"Number of positive reviews: {len(positive_reviews_downsampled)}")
    print(f"Number of negative reviews: {len(negative_reviews)}")
    print(f"Total number of reviews: {len(balanced_test_data)}")
    print(f"Ratio of positive to negative reviews: {len(positive_reviews_downsampled) / len(negative_reviews):.2f}")

    return balanced_test_data

In [6]:
# balance the test data, comment this line to use the original test data
test_data = balance_test_data(positive_reviews, negative_reviews)

# group the test data by user_id and get the business_id
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

Number of positive reviews: 60155
Number of negative reviews: 60155
Total number of reviews: 120310
Ratio of positive to negative reviews: 1.00


In [7]:
def query_top_k(user_id, user_model, faiss_index, business_ids, k=100):
    # Check if the user_id is in the user_id_encoder
    if user_id not in user_id_encoder.classes_:
        # raise ValueError("User ID is not in the encoder")
        user_id = "default_user"

    # Encode user_id and get continuous features
    user_id_encoded = user_id_encoder.transform([user_id])[0]
    user_cont_features = user_scaler.transform(
        user_continuous_features_scaled.loc[[user_id_encoded]].values
    )

    # Predict the user's embedding
    # user_embedding = user_model.predict([np.array([user_id_encoded]), user_cont_features], verbose=0)

    user_embedding = user_model.predict([user_id_encoded.reshape(1, -1), user_cont_features], verbose=0)
    user_embedding_normalized = normalize(user_embedding, axis=1)

    # Perform ANN search using Faiss
    distances, indices = faiss_index.search(user_embedding_normalized, k)

    # Return top-k businesses and distances
    top_k_business_ids = business_ids[indices.flatten()]

    # valid_indices = indices[indices != -1].flatten()
    # top_k_business_ids = business_ids[valid_indices]

    return top_k_business_ids, distances.flatten()


In [8]:
# Query top-k businesses for each user
top_k = 5000
target_users = 1000

top_k_businesses = {}
i = 0
business_ids = business_continuous_features_scaled.index.values
for user_id in test_data_grouped['user_id']:
    if user_id not in user_id_encoder.classes_:
        user_id = "default_user"
    # print(f"It is now handling {user_id}")
    encoded_business, distances = query_top_k(user_id, user_model, faiss_index, business_ids, k=top_k)
    
    # Decode the business IDs
    business_ids_decoded = business_id_encoder.inverse_transform(encoded_business)
    top_k_businesses[user_id] = business_ids_decoded
    i += 1
    if i >= target_users:
        break

In [9]:
def check_recommendations(recommendations, test_data_grouped, pos=4):
    # Create a dictionary for fast lookup: {(user_id, business_id): stars}
    test_data_lookup = {
        (row['user_id'], row['business_id']): row['stars']
        for _, row in test_data.iterrows()
    }
    
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []

    for _, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']  # List of businesses the user reviewed
        rank = None  # Default to None

        if user_id in recommendations:
            recommended_businesses = recommendations[user_id]

            for business_id in business_ids:
                star_rating = test_data_lookup.get((user_id, business_id), None)

                if star_rating is None:
                    continue  # Skip if no rating is found (safety check)

                if star_rating >= pos:
                    total_positive += 1

                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    
                    # Get rank safely
                    try:
                        rank = np.where(recommended_businesses == business_id)[0][0] + 1
                    except IndexError:
                        rank = None  # Business ID not found in recommendations
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            
            total += len(business_ids)
        
        ranks.append(rank)

    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks


In [10]:
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(top_k_businesses, test_data_grouped)

In [11]:
# Calculate evaluation metrics
accuracy = (true_positive + true_negative) / float(total) if total > 0 else 0
precision = true_positive / float(true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
recall = true_positive / float(total_positive) if total_positive > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Mean Reciprocal Rank (MRR) - safer handling
mean_reciprocal_rank = np.mean([1 / rank for rank in ranks if rank is not None and rank > 0]) if ranks else 0

# Weighted Fβ-score
beta = 2
f_beta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall) if (beta**2 * precision + recall) > 0 else 0

# Compute dataset statistics
total_negative = total - total_positive if total > 0 else 0
background_stats = pd.DataFrame({
    'Total Positive': [total_positive],
    'Total Negative': [total_negative],
    'Total': [total],
    'Ratio': [total_positive / float(total) if total > 0 else 0],
})

print("Testing Data Statistics")
display(background_stats)

# Evaluation Metrics
evaluation_metric = pd.DataFrame({
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1_score],
    'F-beta Score': [f_beta],
    'Mean Reciprocal Rank': [mean_reciprocal_rank],
}).apply(lambda x: round(x, 4))

print("Evaluation Metrics")
display(evaluation_metric)

# Confusion Matrix
confusion_matrix = pd.DataFrame({
    'True Positive': [true_positive],
    'True Negative': [true_negative],
    'False Positive': [false_positive],
    'False Negative': [false_negative]
})

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1047,1084,2131,0.491319


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.5214,0.5918,0.0831,0.1457,0.1003,0.0064


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,87,1024,60,960


In [12]:
# db_path = '../Retrieval Result/Retrieval.db'
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # Create a lookup for fast access to star ratings from the test data:
# # This dictionary maps (user_id, business_id) to the star rating.
# test_data_lookup = {
#     (row['user_id'], row['business_id']): row['stars']
#     for _, row in test_data.iterrows()
# }

# # Prepare bulk records for insertion into SQLite.
# # Format: (model, user_id, business_id, real_label)
# # Here we assume a positive review (real_label = 1) if stars >= 4, else negative (real_label = 0).
# bulk_records = []
# model_name = "DSSM"  # You can change this if needed

# for user_id, recommended_businesses in top_k_businesses.items():
#     for business_id in recommended_businesses:
#         # Check the star rating from the test data lookup
#         star_rating = test_data_lookup.get((user_id, business_id))
#         # Define the real label: 1 if rating is available and >= 4, otherwise 0.
#         real_label = 1 if star_rating is not None and star_rating >= 4 else 0
#         bulk_records.append((model_name, user_id, business_id, real_label))

# # Example: Now perform a bulk insert using SQLite's executemany.
# # Make sure your 'recommendations' table has a UNIQUE constraint on (model, user_id, business_id) if needed.
# cursor.executemany("""
#     INSERT OR IGNORE INTO recommendations (model, user_id, business_id, real_label)
#     VALUES (?, ?, ?, ?)
# """, bulk_records)
# conn.commit()

# # Close the connection
# conn.close()