In [11]:
import pandas as pd
import numpy as np
import time
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import os
import mlflow

# Input/output locations (relative paths, resolved against the working directory)
DATA_PATH = "../data/products.csv"
TEST_DATA_PATH = "../data/test_queries.csv"
ARTIFACTS_DIR = "../app/artifacts"

# Name of the sentence-transformers checkpoint loaded further down
MODEL_NAME = "all-MiniLM-L6-v2"

# Create the artifacts folder up front; exist_ok makes re-running this cell harmless
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

We load the data: both the official product catalogue and the dataset of simulated user queries.

In [12]:
# Product catalogue: the rows that get embedded and indexed later on
df_products = pd.read_csv(DATA_PATH)
print(f"Loaded {df_products.shape[0]} products.")

# Evaluation queries used by the metrics cell
df_test = pd.read_csv(TEST_DATA_PATH)
print(f"Loaded {df_test.shape[0]} test queries.")

# Last expression of the cell -> rich preview of the first catalogue rows
df_products.head()

Loaded 10 products.
Loaded 5 test queries.


Unnamed: 0,id,name,category
0,101,Bosch Professional Drill GSB 18V,Power Tools
1,102,Makita Cordless Impact Driver,Power Tools
2,103,Industrial Safety Helmet (Yellow),Safety Gear
3,104,3M Protective Safety Goggles,Safety Gear
4,105,Fluke Digital Multimeter 117,Electronics


In [13]:
print(f"Loading model: {MODEL_NAME}...")
model = SentenceTransformer(MODEL_NAME)

print("Generating embeddings for products...")
# Only the 'name' column is embedded. The list keeps dataframe row order, so
# row i of the embedding matrix belongs to df_products.iloc[i].
product_names = df_products['name'].tolist()
product_embeddings = model.encode(product_names)

# Scale every vector to unit length in place: on unit vectors the inner
# product used by the FAISS index is the cosine similarity.
faiss.normalize_L2(product_embeddings)

embedding_shape = product_embeddings.shape
print(f"Embeddings shape: {embedding_shape}")
# Read as (number of products, embedding dimensions), e.g. (10, 384)

Loading model: all-MiniLM-L6-v2...


Generating embeddings for products...
Embeddings shape: (10, 384)


In [14]:
# Width of one embedding vector = second axis of the matrix (384 for MiniLM)
dimension = product_embeddings.shape[1]

# Flat inner-product index; the vectors were L2-normalised beforehand,
# so the scores it returns are cosine similarities.
index = faiss.IndexFlatIP(dimension)
index.add(product_embeddings)

total_vectors = index.ntotal
print(f"Index built. Total vectors: {total_vectors}")

Index built. Total vectors: 10


In [15]:
# Destination files inside the artifacts directory
index_path = f"{ARTIFACTS_DIR}/faiss_index.bin"
products_path = f"{ARTIFACTS_DIR}/products.pkl"

# Persist the FAISS index ...
faiss.write_index(index, index_path)
# ... and the catalogue dataframe, needed to map index positions back to products
df_products.to_pickle(products_path)

print(f"Artifacts saved to {ARTIFACTS_DIR}")

Artifacts saved to ../app/artifacts


In [None]:
# --- SAFETY CHECK: Close any stuck runs first ---
# Ends a run that is still active in this kernel so the run opened below is a fresh one.
if mlflow.active_run():
    print("Ending existing run...")
    mlflow.end_run()

# --- CONFIGURATION ---
tracking_uri = "file:../mlruns"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("Project2_Semantic_Search")

# Fail early with a clear message: every metric below divides by len(df_test),
# and there is no point opening an MLflow run that cannot produce metrics.
if len(df_test) == 0:
    raise ValueError("df_test is empty: there are no queries to evaluate.")

print("Starting Final Evaluation Run...")

with mlflow.start_run():
    # 1. Setup Parameters
    k_eval = 5  # We need top 5 for Recall@5
    mlflow.log_param("model_name", MODEL_NAME)
    mlflow.log_param("k", k_eval)

    # 2. Run Inference & Measure Latency
    # perf_counter() is the monotonic, high-resolution clock meant for timing
    # intervals; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()

    query_embeddings = model.encode(df_test['query'].tolist())
    faiss.normalize_L2(query_embeddings)
    # D: similarity scores, I: positions in the index; one row per query, k_eval columns
    D, I = index.search(query_embeddings, k_eval)

    inference_time = time.perf_counter() - start_time
    # The whole batch is timed (encode + normalise + search), then averaged per query
    avg_latency = inference_time / len(df_test)

    # 3. Calculate Metrics
    recall_1 = 0
    recall_5 = 0
    reciprocal_ranks = []

    # For Calibration: We separate scores of correct vs incorrect predictions
    correct_scores = []
    incorrect_scores = []

    # Hoisted lookups: index position -> product id, and the expected id per query.
    # Avoids building a pandas Series for every single hit.
    product_ids = df_products['id'].to_numpy()
    expected_ids = df_test['expected_id'].tolist()

    # Enumerate by position: the rows of I and D are positional, so they must
    # not be addressed with dataframe index labels (which iterrows() yields).
    for i, expected_id in enumerate(expected_ids):
        result_indices = I[i]
        result_scores = D[i]

        found_rank = None

        # Check matches (rank is 1-based)
        for rank, (idx, score) in enumerate(zip(result_indices, result_scores), start=1):
            if idx < 0:
                # FAISS pads with label -1 when it has fewer than k results.
                # Indexing with -1 would silently select the LAST product.
                continue

            # Calibration Data Collection
            # We treat cosine similarity as "confidence"
            if product_ids[idx] == expected_id:
                correct_scores.append(score)
                if found_rank is None:  # First time finding it
                    found_rank = rank
            else:
                incorrect_scores.append(score)

        # Calculate Recall & MRR
        if found_rank is None:
            reciprocal_ranks.append(0.0)
            continue

        reciprocal_ranks.append(1 / found_rank)
        if found_rank == 1:
            recall_1 += 1
        # found_rank never exceeds k_eval, so this is Recall@5 only while k_eval >= 5
        if found_rank <= 5:
            recall_5 += 1

    # 4. Final Math
    score_r1 = recall_1 / len(df_test)
    score_r5 = recall_5 / len(df_test)
    score_mrr = sum(reciprocal_ranks) / len(df_test)

    # Calibration Metric:
    # A simple way to measure this: Avg Score when Correct vs Avg Score when Wrong.
    # If the model is "Calibrated", Correct Scores should be significantly higher.
    # Cast to built-in float so the logged values are plain Python numbers.
    avg_conf_correct = float(np.mean(correct_scores)) if correct_scores else 0.0
    avg_conf_incorrect = float(np.mean(incorrect_scores)) if incorrect_scores else 0.0
    calibration_gap = avg_conf_correct - avg_conf_incorrect

    # 5. Log & Print
    print(f"{'Metric':<25} | {'Value':<10}")
    print("-" * 40)
    print(f"{'Recall@1':<25} | {score_r1:.2f}")
    print(f"{'Recall@5':<25} | {score_r5:.2f}")
    print(f"{'MRR':<25} | {score_mrr:.4f}")
    print(f"{'Avg Latency (sec)':<25} | {avg_latency:.4f}")
    print(f"{'Avg Score (Correct)':<25} | {avg_conf_correct:.4f}")
    print(f"{'Avg Score (Wrong)':<25} | {avg_conf_incorrect:.4f}")
    print("-" * 40)

    mlflow.log_metric("recall_at_1", score_r1)
    mlflow.log_metric("recall_at_5", score_r5)
    mlflow.log_metric("mrr", score_mrr)
    mlflow.log_metric("avg_latency", avg_latency)
    mlflow.log_metric("calibration_correct_score", avg_conf_correct)
    # Previously computed but never recorded: the "wrong" average and the gap itself
    mlflow.log_metric("calibration_incorrect_score", avg_conf_incorrect)
    mlflow.log_metric("calibration_gap", calibration_gap)

    # Save Artifacts
    mlflow.log_artifact(f"{ARTIFACTS_DIR}/faiss_index.bin")
    mlflow.log_artifact(f"{ARTIFACTS_DIR}/products.pkl")

    print("✅ All metrics logged successfully!")

Starting Final Evaluation Run...
Metric                    | Value     
----------------------------------------
Recall@1                  | 1.00
Recall@5                  | 1.00
MRR                       | 1.0000
Avg Latency (sec)         | 0.0031
Avg Score (Correct)       | 0.6674
Avg Score (Wrong)         | 0.2363
----------------------------------------
✅ All metrics logged successfully!
