In [27]:
!pip install pandas requests tqdm lxml

Collecting lxml
  Downloading lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (6.6 kB)
Downloading lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-6.0.0


## Download the dataset

In [48]:
import os
import zipfile
import requests
import pandas as pd
from tqdm import tqdm

#  Working OPUS download link
url = "https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/en-fr.txt.zip"
filename = "en-fr.txt.zip"
extracted_folder = "en-fr-data"

# Step 1: Download the zip file
if not os.path.exists(filename):
    print(" Downloading en-fr TED2020 from OPUS...")
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in tqdm(r.iter_content(chunk_size=8192), desc="Downloading"):
                f.write(chunk)
else:
    print(" Already downloaded.")

# Step 2: Extract it
if not os.path.exists(extracted_folder):
    os.makedirs(extracted_folder)
    print(" Extracting...")
    with zipfile.ZipFile(filename, "r") as zip_ref:
        zip_ref.extractall(extracted_folder)
else:
    print(" Already extracted.")

# Step 3: Load the English and French files
en_path = os.path.join(extracted_folder, "TED2020.en-fr.en")
fr_path = os.path.join(extracted_folder, "TED2020.en-fr.fr")

if not os.path.exists(en_path) or not os.path.exists(fr_path):
    raise FileNotFoundError(" Missing extracted files.")

# Load and clean
with open(en_path, "r", encoding="utf-8") as f:
    en_lines = [line.strip() for line in f.readlines()]
with open(fr_path, "r", encoding="utf-8") as f:
    fr_lines = [line.strip() for line in f.readlines()]

df = pd.DataFrame({"en": en_lines, "fr": fr_lines}).dropna()
df = df.sample(20, random_state=42).reset_index(drop=True)

# Fake gold keywords
def fake_gold(text):
    return [w.lower() for w in text.split()[:4] if len(w) > 3]

df["gold_fr"] = df["fr"].apply(fake_gold)

print(" Sample data ready:")
print(df[["fr", "gold_fr"]].head(3))

 Downloading en-fr TED2020 from OPUS...


Downloading: 3663it [00:04, 841.45it/s]


 Already extracted.
 Sample data ready:
                                                  fr  \
0                  Ma famille vivait dans une hutte.   
1  Pourquoi, en tant qu'architecte, vous intéress...   
2  Et certains d'entre nous pourraient même chanter.   

                             gold_fr  
0            [famille, vivait, dans]  
1  [pourquoi,, tant, qu'architecte,]  
2          [certains, d'entre, nous]  


In [49]:

# ---------------------- 🔍 Print 3 Example Predictions ----------------------
print("\n Samples of dataset:\n")
for i in range(3):
    print(f"Sentence   : {df['fr'][i]}")
    print(f"Gold       : {df['gold_fr'][i]}")


 Samples of dataset:

Sentence   : Ma famille vivait dans une hutte.
Gold       : ['famille', 'vivait', 'dans']
Sentence   : Pourquoi, en tant qu'architecte, vous intéresseriez-vous à l'espace ?
Gold       : ['pourquoi,', 'tant', "qu'architecte,"]
Sentence   : Et certains d'entre nous pourraient même chanter.
Gold       : ['certains', "d'entre", 'nous']


## Evaluation

In [40]:
# Evaluation function: counts both exact and partial matches
def evaluate_results(results_post, gold_keywords):
    precisions, recalls, f1s = [], [], []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact_matches = set([p for p in pred_set if p in gold_set])
        partial_matches = set([
            p for p in pred_set
            if any((p in g or g in p) for g in gold_set) and p not in exact_matches
        ])
        total_matches = len(exact_matches) + len(partial_matches)
        precision = total_matches / len(pred) if pred else 0
        recall = total_matches / len(gold) if gold else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)
    return avg_precision, avg_recall, avg_f1

## Baseline

In [None]:
import time
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from tqdm import tqdm

results_default = {}

print(f"\n Testing model: {'all-MiniLM-L12-v2'}")
start_time = time.time()

model = SentenceTransformer('all-MiniLM-L12-v2')
kw_model = KeyBERT(model)

predictions = []
for sentence in tqdm(df["fr"], desc="Extracting keywords"):
    keywords = kw_model.extract_keywords(sentence, stop_words=None, top_n=5)
    extracted = [kw[0].lower() for kw in keywords]
    predictions.append(extracted)

precision, recall, f1 = evaluate_results(predictions, df["gold_fr"].tolist())
runtime = time.time() - start_time

results_default['all-MiniLM-L12-v2'] = {
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "time_sec": runtime
}

print(results_default)


🔍 Testing model: all-MiniLM-L12-v2


Extracting keywords: 100%|██████████| 20/20 [00:02<00:00,  7.60it/s]

{'all-MiniLM-L12-v2': {'precision': 0.33, 'recall': 0.675, 'f1': 0.42400793650793644, 'time_sec': 5.289292812347412}}





## MultiLingual Embedding models

In [38]:
# ------------------ Embedding Models ------------------
multilingual_embedding_models = [
    "paraphrase-multilingual-MiniLM-L12-v2", 
    "distiluse-base-multilingual-cased-v1", 
    "distiluse-base-multilingual-cased-v2" 
]


In [50]:
# ------------------ Run Benchmark ------------------
results = {}

for model_name in multilingual_embedding_models:
    print(f"\n Testing model: {model_name}")
    start_time = time.time()

    model = SentenceTransformer(model_name)
    kw_model = KeyBERT(model)

    predictions = []
    for sentence in tqdm(df["fr"], desc="Extracting keywords"):
        keywords = kw_model.extract_keywords(sentence, stop_words=None, top_n=5)
        extracted = [kw[0].lower() for kw in keywords]
        predictions.append(extracted)

    precision, recall, f1 = evaluate_results(predictions, df["gold_fr"].tolist())
    runtime = time.time() - start_time

    results[model_name] = {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "time_sec": runtime
    }

# ------------------ Summary ------------------
print("\n Summary of Model Performance:\n")
for model, metrics in results.items():
    print(f"{model:50} | Precision: {metrics['precision']:.3f} | Recall: {metrics['recall']:.3f} | F1: {metrics['f1']:.3f} | Time: {metrics['time_sec']:.1f}s")


 Testing model: paraphrase-multilingual-MiniLM-L12-v2


Extracting keywords: 100%|██████████| 20/20 [00:03<00:00,  6.43it/s]



 Testing model: distiluse-base-multilingual-cased-v1


Extracting keywords: 100%|██████████| 20/20 [00:01<00:00, 11.25it/s]



 Testing model: distiluse-base-multilingual-cased-v2


Extracting keywords: 100%|██████████| 20/20 [00:01<00:00, 10.91it/s]


 Summary of Model Performance:

paraphrase-multilingual-MiniLM-L12-v2              | Precision: 0.330 | Recall: 0.637 | F1: 0.419 | Time: 16.0s
distiluse-base-multilingual-cased-v1               | Precision: 0.380 | Recall: 0.729 | F1: 0.483 | Time: 5.6s
distiluse-base-multilingual-cased-v2               | Precision: 0.330 | Recall: 0.638 | F1: 0.419 | Time: 6.0s



