In [6]:
import os
import pickle
import tempfile
import time

import faiss
import mlflow
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Sentence-embedding models that are compared against each other.
MODELS_NAMES = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
]

# Input CSV files: the product catalogue and the evaluation queries.
DATA_PATH = "../data/products.csv"
TEST_DATA_PATH = "../data/test_queries.csv"

# Output directory for the FAISS index and the pickled catalogue.
# Created up front (no error if it is already there).
ARTIFACTS_DIR = "../app/artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

We load the data: both the official product catalogue and the dataset of simulated user queries.

In [7]:
# Read the product catalogue and the evaluation queries from CSV.
df_products = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df_products)} products.")

df_test = pd.read_csv(TEST_DATA_PATH)
print(f"Loaded {len(df_test)} test queries.")

# Fail fast on schema problems: the evaluation cells read exactly these
# columns, and a missing one would otherwise only surface as a KeyError in
# the middle of an MLflow run.
missing_product_cols = {"id", "name"} - set(df_products.columns)
missing_query_cols = {"query", "expected_id"} - set(df_test.columns)
assert not missing_product_cols, (
    f"{DATA_PATH} is missing required columns: {sorted(missing_product_cols)}"
)
assert not missing_query_cols, (
    f"{TEST_DATA_PATH} is missing required columns: {sorted(missing_query_cols)}"
)

# Averages further down divide by the number of queries, and the index needs
# at least one product to be useful.
assert not df_products.empty, f"{DATA_PATH} contains no products"
assert not df_test.empty, f"{TEST_DATA_PATH} contains no test queries"

df_products.head()

Loaded 500 products.
Loaded 500 test queries.


Unnamed: 0,id,name,category
0,1000,Makita Impact Driver 28Pro,Power Tools
1,1001,Fluke Impact Driver 86X,Power Tools
2,1002,3M Utility Knife 68Pro,Hand Tools
3,1003,Milwaukee Tape Measure 16Pro,Hand Tools
4,1004,Knipex Face Shield 44Max,Safety Gear


In [8]:
# Evaluation settings shared by every model run.
k_eval = 5           # number of neighbours requested per query
results_table = []   # collects one summary dict per evaluated model

# End any run that is still active in this kernel before configuring tracking.
stale_run = mlflow.active_run()
if stale_run:
    print("Ending existing run...")
    mlflow.end_run()

# Point MLflow at the local file store and select the experiment.
tracking_uri = "file:../mlruns"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("Project2_Semantic_Search")

In [9]:
def score_rankings(neighbor_rows, score_rows, product_ids, expected_ids):
    """Score nearest-neighbour search results against the expected product ids.

    Args:
        neighbor_rows: Per-query sequences of catalogue row positions, best
            match first (the label matrix returned by ``index.search``).
        score_rows: Per-query similarity scores aligned with ``neighbor_rows``.
        product_ids: Product id of every catalogue row, in index order.
        expected_ids: Expected product id of every query, in query order.

    Returns:
        dict with the keys ``recall_at_1``, ``recall_at_5``, ``mrr``,
        ``avg_conf_correct`` and ``avg_conf_incorrect`` (all plain floats).
        The two ``avg_conf_*`` values are the mean similarity score of the
        retrieved items that were / were not the expected product, or 0.0
        when there were none.

    Raises:
        ValueError: If ``expected_ids`` is empty.
    """
    n_queries = len(expected_ids)
    if n_queries == 0:
        raise ValueError("Cannot compute retrieval metrics without test queries.")

    hits_at_1 = 0
    hits_at_5 = 0
    reciprocal_ranks = []
    correct_scores = []
    incorrect_scores = []

    # Iterate positionally: row i of the search output belongs to query i,
    # regardless of what index labels the test DataFrame carries.
    for expected_id, idx_row, score_row in zip(expected_ids, neighbor_rows, score_rows):
        found_rank = None

        for rank, (idx, score) in enumerate(zip(idx_row, score_row), start=1):
            if idx < 0:
                # FAISS pads with -1 when fewer than k neighbours exist; a
                # negative position must not wrap around to the last product.
                continue
            if product_ids[idx] == expected_id:
                correct_scores.append(score)
                if found_rank is None:  # keep the best (first) rank only
                    found_rank = rank
            else:
                incorrect_scores.append(score)

        if found_rank is None:
            reciprocal_ranks.append(0.0)
            continue

        reciprocal_ranks.append(1 / found_rank)
        if found_rank == 1:
            hits_at_1 += 1
        if found_rank <= 5:
            hits_at_5 += 1

    return {
        "recall_at_1": hits_at_1 / n_queries,
        "recall_at_5": hits_at_5 / n_queries,
        "mrr": sum(reciprocal_ranks) / n_queries,
        "avg_conf_correct": float(np.mean(correct_scores)) if correct_scores else 0.0,
        "avg_conf_incorrect": float(np.mean(incorrect_scores)) if incorrect_scores else 0.0,
    }


# Plain Python lists give cheap positional lookups inside the scoring loop and
# are loop-invariant across models, so build them once.
product_names = df_products['name'].tolist()
product_ids = df_products['id'].tolist()
test_queries = df_test['query'].tolist()
expected_ids = df_test['expected_id'].tolist()

if not test_queries:
    raise ValueError("The test set is empty; nothing to evaluate.")

# Start from an empty table so that re-running this cell does not append
# duplicate rows to the results of a previous execution.
results_table = []

for model_name in MODELS_NAMES:
    # unique run name so they don't get mixed up
    with mlflow.start_run(run_name=f"Run_{model_name}"):
        print(f"Testing Model: {model_name}...")

        # Load Model
        model = SentenceTransformer(model_name)
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("k", k_eval)

        # Embed Products & Build Index.
        # Vectors are L2-normalised in place, so the inner-product index
        # ranks by cosine similarity.
        print("Building Index...")
        product_embeddings = model.encode(product_names)
        faiss.normalize_L2(product_embeddings)

        d = product_embeddings.shape[1]
        index = faiss.IndexFlatIP(d)
        index.add(product_embeddings)

        # Evaluate: time query encoding + search with a monotonic,
        # high-resolution clock.
        print("Running Inference...")
        start_time = time.perf_counter()

        query_embeddings = model.encode(test_queries)
        faiss.normalize_L2(query_embeddings)
        D, I = index.search(query_embeddings, k_eval)

        inference_time = time.perf_counter() - start_time
        avg_latency = inference_time / len(test_queries)

        # Calculate Detailed Metrics.
        # NOTE: recall_at_5 counts hits within the first 5 results, so it is
        # only a true Recall@5 when k_eval >= 5.
        metrics = score_rankings(I, D, product_ids, expected_ids)
        score_r1 = metrics["recall_at_1"]
        score_r5 = metrics["recall_at_5"]
        score_mrr = metrics["mrr"]
        avg_conf_correct = metrics["avg_conf_correct"]
        avg_conf_incorrect = metrics["avg_conf_incorrect"]

        mlflow.log_metric("recall_at_1", score_r1)
        mlflow.log_metric("recall_at_5", score_r5)
        mlflow.log_metric("mrr", score_mrr)
        mlflow.log_metric("avg_latency", avg_latency)
        mlflow.log_metric("calibration_correct_score", avg_conf_correct)
        mlflow.log_metric("calibration_incorrect_score", avg_conf_incorrect)

        # Log Artifacts.
        # Each candidate's files are staged in a throw-away directory so that
        # experiment runs never overwrite the artifacts in ARTIFACTS_DIR with
        # the index of whichever model happened to be evaluated last.
        with tempfile.TemporaryDirectory() as run_artifacts_dir:
            index_path = os.path.join(run_artifacts_dir, "faiss_index.bin")
            products_path = os.path.join(run_artifacts_dir, "products.pkl")
            faiss.write_index(index, index_path)
            df_products.to_pickle(products_path)
            mlflow.log_artifact(index_path)
            mlflow.log_artifact(products_path)

        results_table.append({
            "Model": model_name,
            "Recall@1": score_r1,
            "Recall@5": score_r5,
            "MRR": score_mrr,
            "Latency (s)": avg_latency,
            "Conf (Correct)": avg_conf_correct,
            "Conf (Wrong)": avg_conf_incorrect
        })


print("Table of results")
df_results = pd.DataFrame(results_table)
print(df_results)

Testing Model: all-MiniLM-L6-v2...


Building Index...
Running Inference...
Testing Model: all-mpnet-base-v2...
Building Index...
Running Inference...
Table of results
               Model  Recall@1  Recall@5       MRR  Latency (s)  \
0   all-MiniLM-L6-v2     0.336     0.822  0.516333     0.001779   
1  all-mpnet-base-v2     0.306     0.780  0.485067     0.010243   

   Conf (Correct)  Conf (Wrong)  
0        0.724082      0.652440  
1        0.693491      0.634612  


Save the production model artifacts. We chose all-MiniLM-L6-v2 because it is lighter and faster; in the comparison above it also scored higher on Recall@1, Recall@5 and MRR.

In [10]:
# Production model: rebuild the index from scratch with the chosen encoder.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Encode every product name, then L2-normalise in place so that the
# inner-product index below ranks by cosine similarity.
product_names = df_products['name'].tolist()
embeddings = model.encode(product_names)
faiss.normalize_L2(embeddings)

# Flat inner-product index sized to the embedding width.
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)

# Persist the index and the catalogue side by side in the artifacts directory.
index_path = f"{ARTIFACTS_DIR}/faiss_index.bin"
catalogue_path = f"{ARTIFACTS_DIR}/products.pkl"
faiss.write_index(index, index_path)
df_products.to_pickle(catalogue_path)