In [None]:
!pip install nbimporter

In [1]:
import os
import time
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# List of embedding models to compare
embedding_models = [
    'paraphrase-MiniLM-L6-v2',
    'paraphrase-MiniLM-L12-v2',
    'all-MiniLM-L6-v2',
    'all-MiniLM-L12-v2',
    'paraphrase-albert-small-v2'
]

# Your best configurations
best_config_mmr = {
    'ngram_range': (1, 2),
    'nr_candidates': 18,
    'diversity': 0.4
}

best_config_maxsum = {
    'ngram_range': (1, 2),
    'nr_candidates': 18
}


In [3]:
# Evaluation function: counts both exact and partial matches
def evaluate_results(results_post, gold_keywords):
    precisions, recalls, f1s = [], [], []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact_matches = set([p for p in pred_set if p in gold_set])
        partial_matches = set([
            p for p in pred_set
            if any((p in g or g in p) for g in gold_set) and p not in exact_matches
        ])
        total_matches = len(exact_matches) + len(partial_matches)
        precision = total_matches / len(pred) if pred else 0
        recall = total_matches / len(gold) if gold else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)
    return avg_precision, avg_recall, avg_f1

## K500N

In [23]:
import os

# read docs and gold keywords   

docs_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/docsutf8")
keys_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))
docs = []
gold_keywords = [] 
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

In [10]:
for model_name in embedding_models:
    print(f"\nEvaluating model: {model_name}")
    sentence_model = SentenceTransformer(model_name)
    kw_model = KeyBERT(model=sentence_model)
    
    
    results = []
    for doc in docs:
        keywords = kw_model.extract_keywords(
            doc,
        )
        keywords = [k[0] for k in keywords]
        results.append(keywords)
    
    precision, recall, f1 = evaluate_results(results, gold_keywords)
    print(f"Precision_500N: {precision:.3f}")
    print(f"Recall_500N: {recall:.3f}")
    print(f"F1_500N: {f1:.3f}")



Evaluating model: paraphrase-MiniLM-L6-v2
Precision_500N: 0.763
Recall_500N: 0.100
F1_500N: 0.170

Evaluating model: paraphrase-MiniLM-L12-v2
Precision_500N: 0.748
Recall_500N: 0.097
F1_500N: 0.166

Evaluating model: all-MiniLM-L6-v2
Precision_500N: 0.750
Recall_500N: 0.098
F1_500N: 0.168

Evaluating model: all-MiniLM-L12-v2
Precision_500N: 0.767
Recall_500N: 0.100
F1_500N: 0.171

Evaluating model: paraphrase-albert-small-v2
Precision_500N: 0.760
Recall_500N: 0.100
F1_500N: 0.170


### Best Final Config

In [15]:
kw_model = KeyBERT(model='all-MiniLM-L12-v2')
    
# Choose configuration: mmr_config or maxsum_config
config = best_config_mmr
    
results = []
for doc in docs:
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=config['ngram_range'],
        stop_words='english',
        nr_candidates=config['nr_candidates'],
        diversity=config['diversity'],
        top_n=5
    )
    keywords = [k[0] for k in keywords]
    results.append(keywords)
    
precision, recall, f1 = evaluate_results(results, gold_keywords)
print(f"Precision_500N_mmr: {precision:.3f}")
print(f"Recall_500N_mmr: {recall:.3f}")
print(f"F1_500N_mmr: {f1:.3f}")


Precision_500N_mmr: 0.830
Recall_500N_mmr: 0.106
F1_500N_mmr: 0.182


In [24]:
kw_model = KeyBERT(model='all-MiniLM-L12-v2')
    
# Choose configuration: mmr_config or maxsum_config
config = best_config_maxsum
    
results = []
for doc in docs:
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=config['ngram_range'],
        stop_words='english',
        nr_candidates=config['nr_candidates'],
        top_n=5
    )
    keywords = [k[0] for k in keywords]
    results.append(keywords)
    
precision, recall, f1 = evaluate_results(results, gold_keywords)
print(f"Precision_500N_maxsum: {precision:.3f}")
print(f"Recall_500N_maxsum: {recall:.3f}")
print(f"F1_500N_maxsum: {f1:.3f}")


Precision_500N_maxsum: 0.830
Recall_500N_maxsum: 0.106
F1_500N_maxsum: 0.182


## SemEval2017

In [17]:
import os

# read docs and gold keywords   

docs_dir = os.path.join("SemEval2017", "docsutf8")
keys_dir = os.path.join("SemEval2017", "keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))
docs = []
gold_keywords = [] 
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

In [18]:
for model_name in embedding_models:
    print(f"\nEvaluating model: {model_name}")
    sentence_model = SentenceTransformer(model_name)
    kw_model = KeyBERT(model=sentence_model)
    
    
    results = []
    for doc in docs:
        keywords = kw_model.extract_keywords(
            doc,
        )
        keywords = [k[0] for k in keywords]
        results.append(keywords)
    
    precision, recall, f1 = evaluate_results(results, gold_keywords)
    print(f"Precision_SemEval2017: {precision:.3f}")
    print(f"Recall_SemEval2017: {recall:.3f}")
    print(f"F1_SemEval2017: {f1:.3f}")



Evaluating model: paraphrase-MiniLM-L6-v2
Precision_SemEval2017: 0.832
Recall_SemEval2017: 0.277
F1_SemEval2017: 0.401

Evaluating model: paraphrase-MiniLM-L12-v2
Precision_SemEval2017: 0.849
Recall_SemEval2017: 0.280
F1_SemEval2017: 0.407

Evaluating model: all-MiniLM-L6-v2
Precision_SemEval2017: 0.865
Recall_SemEval2017: 0.285
F1_SemEval2017: 0.415

Evaluating model: all-MiniLM-L12-v2
Precision_SemEval2017: 0.873
Recall_SemEval2017: 0.290
F1_SemEval2017: 0.420

Evaluating model: paraphrase-albert-small-v2
Precision_SemEval2017: 0.840
Recall_SemEval2017: 0.278
F1_SemEval2017: 0.403


In [19]:
# Your best configurations
best_config_mmr = {
    'ngram_range': (1, 1),
    'nr_candidates': 18,
    'diversity': 0.4
}

best_config_maxsum = {
    'ngram_range': (1, 1),
    'nr_candidates': 18
}

### Best Final Config

In [20]:
kw_model = KeyBERT(model='all-MiniLM-L12-v2')
    
# Choose configuration: mmr_config or maxsum_config
config = best_config_mmr
    
results = []
for doc in docs:
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=config['ngram_range'],
        stop_words='english',
        nr_candidates=config['nr_candidates'],
        diversity=config['diversity'],
        top_n=5
    )
    keywords = [k[0] for k in keywords]
    results.append(keywords)
    
precision, recall, f1 = evaluate_results(results, gold_keywords)
print(f"Precision_SemEval2017_mmr: {precision:.3f}")
print(f"Recall_SemEval2017_mmr: {recall:.3f}")
print(f"F1_SemEval2017_mmr: {f1:.3f}")


Precision_SemEval2017_mmr: 0.873
Recall_SemEval2017_mmr: 0.290
F1_SemEval2017_mmr: 0.420


In [21]:
kw_model = KeyBERT(model='all-MiniLM-L12-v2')
    
# Choose configuration: mmr_config or maxsum_config
config = best_config_maxsum
    
results = []
for doc in docs:
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=config['ngram_range'],
        stop_words='english',
        nr_candidates=config['nr_candidates'],
        top_n=5
    )
    keywords = [k[0] for k in keywords]
    results.append(keywords)
    
precision, recall, f1 = evaluate_results(results, gold_keywords)
print(f"Precision_SemEval2017_maxsum: {precision:.3f}")
print(f"Recall_SemEval2017_maxsum: {recall:.3f}")
print(f"F1_SemEval2017_maxsum: {f1:.3f}")


Precision_SemEval2017_maxsum: 0.873
Recall_SemEval2017_maxsum: 0.290
F1_SemEval2017_maxsum: 0.420
