In [1]:
pip install sentence-transformers datasets faiss-cpu pandas tabulate

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m117.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [2]:
import time
import pandas as pd
import numpy as np
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# ==========================================
# 1. PRÉPARATION DES DONNÉES (Dataset STSB)
# ==========================================
print("Chargement du dataset STSB...")
# On utilise la version anglaise par défaut, remplacez "en" par "fr" si besoin
dataset = load_dataset("stsb_multi_mt", "en", split="test")

# Pour simuler un moteur de recherche :
# queries = les phrases d'entrée
# corpus = la base de données dans laquelle on cherche
queries = dataset["sentence1"]
corpus = dataset["sentence2"]

# ==========================================
# 2. CONFIGURATION DES MODÈLES À TESTER
# ==========================================
# On teste 3 tailles de modèles pour comparer Précision vs Vitesse
models_to_test = [
    "all-MiniLM-L6-v2",             # Très rapide, léger
    "paraphrase-multilingual-MiniLM-L12-v2", # Supporte le français
    "all-mpnet-base-v2"             # Très précis, plus lent
]

all_results = []

# ==========================================
# 3. BOUCLE DE BENCHMARK
# ==========================================
for model_name in models_to_test:
    print(f"\n--- Évaluation du modèle : {model_name} ---")
    model = SentenceTransformer(model_name)

    # Mesure du temps d'encodage (Latence)
    start_time = time.time()
    corpus_embeddings = model.encode(corpus, convert_to_numpy=True, show_progress_bar=True)
    query_embeddings = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)
    end_time = time.time()

    total_time = end_time - start_time
    avg_latency = (total_time / (len(queries) + len(corpus))) * 1000 # en millisecondes

    # Normalisation pour la similarité cosinus (obligatoire pour FAISS IndexFlatIP)
    faiss.normalize_L2(corpus_embeddings)
    faiss.normalize_L2(query_embeddings)

    # ==========================================
    # 4. INDEXATION FAISS
    # ==========================================
    dimension = corpus_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension) # Recherche exacte par produit scalaire
    index.add(corpus_embeddings)

    # Recherche des 5 meilleurs résultats (Top-k)
    k = 5
    distances, indices = index.search(query_embeddings, k)

    # ==========================================
    # 5. CALCUL DES MÉTRIQUES (Recall & MRR)
    # ==========================================
    recall_at_1 = 0
    recall_at_5 = 0
    mrr = 0

    for i in range(len(queries)):
        # La vérité terrain : pour la requête i, on veut retrouver l'index i du corpus
        target_idx = i
        retrieved_indices = indices[i]

        # Recall@1
        if target_idx == retrieved_indices[0]:
            recall_at_1 += 1

        # Recall@5
        if target_idx in retrieved_indices:
            recall_at_5 += 1
            # MRR : 1 / position du bon résultat (1-indexed)
            rank = np.where(retrieved_indices == target_idx)[0][0] + 1
            mrr += 1 / rank

    # Moyennes finales
    num_queries = len(queries)
    all_results.append({
        "Modèle": model_name,
        "Recall@1": round(recall_at_1 / num_queries, 4),
        "Recall@5": round(recall_at_5 / num_queries, 4),
        "MRR": round(mrr / num_queries, 4),
        "Latence (ms/emb)": round(avg_latency, 2),
        "Dimensions": dimension
    })

# ==========================================
# 6. AFFICHAGE DES RÉSULTATS MLOPS
# ==========================================
df_results = pd.DataFrame(all_results)
print("\n" + "="*50)
print("TABLEAU COMPARATIF FINAL")
print("="*50)
print(df_results.to_markdown(index=False))

Chargement du dataset STSB...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

en/train-00000-of-00001.parquet:   0%|          | 0.00/470k [00:00<?, ?B/s]

en/test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

en/dev-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]


--- Évaluation du modèle : all-MiniLM-L6-v2 ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]


--- Évaluation du modèle : paraphrase-multilingual-MiniLM-L12-v2 ---


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]


--- Évaluation du modèle : all-mpnet-base-v2 ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]


TABLEAU COMPARATIF FINAL
| Modèle                                |   Recall@1 |   Recall@5 |    MRR |   Latence (ms/emb) |   Dimensions |
|:--------------------------------------|-----------:|-----------:|-------:|-------------------:|-------------:|
| all-MiniLM-L6-v2                      |     0.5627 |     0.7128 | 0.6211 |               0.71 |          384 |
| paraphrase-multilingual-MiniLM-L12-v2 |     0.5504 |     0.7186 | 0.6138 |               0.64 |          384 |
| all-mpnet-base-v2                     |     0.5555 |     0.702  | 0.6119 |               1.4  |          768 |
