### Deep Structured Semantic Model - Retrieval Evaluation
This notebook is used to simulate the performance of the of the DSSM model in a retrieval setting. 

#### Pre-requisites
- The model is trained and the index is created in the notebooks `DSSM Model.ipynb` and `DSSM Index (Faiss).ipynb`.
- All required files are saved in the `Saved_Triplet_Hinge_Loss` folder.

In [11]:
from general_program import *
import faiss
import warnings
warnings.filterwarnings("ignore")

In [17]:
save_folder_path='./Saved_Triplet_Hinge_Loss/'

user_model, item_model, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler = load_saved_models(save_folder_path)


production_folder_path='./production/'
# Load the Faiss index from the file
faiss_index = faiss.read_index(production_folder_path+ "faiss_index.bin")

# Load the business_ids from the file
business_ids = np.load(production_folder_path + "business_ids.npy")


In [3]:
user_df, business_df, review_df, user_continuous_features_scaled, business_continuous_features_scaled, num_users, num_businesses, num_categories = prepare_data(user_df, business_df, review_df, categories_df, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler, use_stage='test')

# check if business_category_map
business_category_map = business_df.set_index('business_id_encoded')['category_encoded']

In [4]:
# Split review_df into train and test sets
train_data, test_data = train_test_split(review_df, test_size=0.2, random_state=42)

# Split the test set into positive and negative samples
positive_reviews = test_data[test_data['stars'] >= 4]
negative_reviews = test_data[test_data['stars'] < 4]

print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 136214
Number of negative reviews: 59870
Total number of reviews: 196084
Ratio of positive to negative reviews: 2.28


In [5]:
def balance_test_data(positive_reviews, negative_reviews):
    # down-sample the positive reviews to balance the dataset
    positive_reviews_downsampled = positive_reviews.sample(n=len(negative_reviews), random_state=42)

    # combine the down-sampled positive reviews with the negative reviews
    balanced_test_data = pd.concat([positive_reviews_downsampled, negative_reviews], ignore_index=True)

    # shuffle the balanced test data
    balanced_test_data = balanced_test_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # new statistics for the balanced test data
    positive_reviews = balanced_test_data[balanced_test_data['stars'] >= 4]
    negative_reviews = balanced_test_data[balanced_test_data['stars'] <= 2]

    print(f"Number of positive reviews: {len(positive_reviews)}")
    print(f"Number of negative reviews: {len(negative_reviews)}")
    print(f"Total number of reviews: {len(balanced_test_data)}")
    print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")
    return balanced_test_data

In [6]:
# balance the test data, comment this line to use the original test data
# test_data = balance_test_data(positive_reviews, negative_reviews)

# group the test data by user_id and get the business_id
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

In [23]:
def query_top_k(user_id, user_model, faiss_index, business_ids, k=100):
    # Check if the user_id is in the user_id_encoder
    if user_id not in user_id_encoder.classes_:
        raise ValueError("User ID is not in the encoder")

    # Encode user_id and get continuous features
    user_id_encoded = user_id_encoder.transform([user_id])[0]
    user_cont_features = user_scaler.transform(
        user_continuous_features_scaled.loc[[user_id_encoded]].values
    )

    # Predict the user's embedding
    user_embedding = user_model.predict([np.array([user_id_encoded]), user_cont_features], verbose=0)
    user_embedding_normalized = normalize(user_embedding, axis=1)

    # Perform ANN search using Faiss
    distances, indices = faiss_index.search(user_embedding_normalized, k)

    # Return top-k businesses and distances
    top_k_business_ids = business_ids[indices.flatten()]
    return top_k_business_ids, distances.flatten()


In [24]:
# Query top-k businesses for each user
top_k = 1000
num_users = 1000

top_k_businesses = {}
i = 0
business_ids = business_continuous_features_scaled.index.values
for user_id in test_data_grouped['user_id']:
    if user_id not in user_id_encoder.classes_:
        continue
    # print(f"It is now handling {user_id}")
    encoded_business, distances = query_top_k(user_id, user_model, faiss_index, business_ids, k=top_k)
    
    # Decode the business IDs
    business_ids_decoded = business_id_encoder.inverse_transform(encoded_business)
    top_k_businesses[user_id] = business_ids_decoded
    i += 1
    if i == 100:
        break

In [25]:
def check_recommendations(recommendations, test_data_grouped, pos=4):
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for i, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id]
            for business_id in business_ids:
                star_rating = test_data[(test_data['user_id'] == user_id) & (test_data['business_id'] == business_id)]['stars'].values[0]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    # get the rank of the business_id in the recommendations
                    rank = 0
                    # rank = recommended_businesses.index(business_id) + 1
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [26]:
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(top_k_businesses, test_data_grouped)

In [27]:
# Calculate evaluation metrics
accuracy = (true_positive + true_negative) / float(total) if total > 0 else 0
precision = true_positive / float(true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
recall = true_positive / float(total_positive) if total_positive > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Mean Reciprocal Rank (MRR) - safer handling
mean_reciprocal_rank = np.mean([1 / rank for rank in ranks if rank is not None and rank > 0]) if ranks else 0

# Weighted Fβ-score
beta = 1.5
f_beta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall) if (beta**2 * precision + recall) > 0 else 0

# Compute dataset statistics
total_negative = total - total_positive if total > 0 else 0
background_stats = pd.DataFrame({
    'Total Positive': [total_positive],
    'Total Negative': [total_negative],
    'Total': [total],
    'Ratio': [total_positive / float(total) if total > 0 else 0],
})

print("Testing Data Statistics")
display(background_stats)

# Evaluation Metrics
evaluation_metric = pd.DataFrame({
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1_score],
    'F-beta Score': [f_beta],
    'Mean Reciprocal Rank': [mean_reciprocal_rank],
}).apply(lambda x: round(x, 4))

print("Evaluation Metrics")
display(evaluation_metric)

# Confusion Matrix
confusion_matrix = pd.DataFrame({
    'True Positive': [true_positive],
    'True Negative': [true_negative],
    'False Positive': [false_positive],
    'False Negative': [false_negative]
})

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,170,79,249,0.682731


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.3213,1.0,0.0059,0.0117,0.0085,


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,1,79,0,169
