In [2]:
import itertools
from keybert import KeyBERT
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hyper-parameter grid explored by the post-processing tuning cells.

# Score increments added to a keyword that matches a named entity.
entity_boost_values = [0.1, 0.2, 0.3, 0.4]

# Part-of-speech whitelists, listed from the most to the least permissive.
allowed_pos_sets = [
    set(tags)
    for tags in (
        ("NOUN", "PROPN", "ADJ"),
        ("NOUN", "PROPN"),
        ("NOUN",),
    )
]

In [4]:
# Load the spaCy English pipeline. The resulting `nlp` object is handed to the
# post-processing helpers below, which read token POS tags (`token.pos_`) and
# named entities (`doc.ents`) from it.
import spacy
nlp = spacy.load("en_core_web_sm")

In [9]:
def pos_filter(keywords, allowed_pos, nlp):
    """Keep only the keywords whose tokens all carry an allowed POS tag.

    Args:
        keywords: iterable of ``(keyword, score)`` pairs.
        allowed_pos: collection of POS tag strings compared against ``token.pos_``.
        nlp: callable that turns a keyword string into an iterable of tokens
            exposing a ``pos_`` attribute.

    Returns:
        A list of the ``(keyword, score)`` pairs that passed, in input order.
    """
    def _every_token_allowed(phrase):
        # Each keyword is parsed on its own, without the surrounding document.
        return all(tok.pos_ in allowed_pos for tok in nlp(phrase))

    return [
        (phrase, weight)
        for phrase, weight in keywords
        if _every_token_allowed(phrase)
    ]

def entity_boost_func(keywords, doc_text, nlp, boost_value):
    """Raise the score of every keyword that matches a named entity of the document.

    Args:
        keywords: iterable of ``(keyword, score)`` pairs.
        doc_text: full document text; it is parsed with ``nlp`` to obtain entities.
        nlp: callable returning an object whose ``ents`` items expose ``.text``.
        boost_value: amount added to the score of a matching keyword.

    Returns:
        A list of ``(keyword, score)`` pairs in the same order as the input, with
        ``boost_value`` added where the keyword equals an entity's text.
        The list is NOT re-sorted after boosting.
    """
    # Entity text keeps the document's original casing, so an exact string
    # comparison misses keywords that differ only by case (e.g. a lowercased
    # keyword against a capitalised entity). Compare on a normalised key instead.
    entity_keys = {ent.text.strip().casefold() for ent in nlp(doc_text).ents}

    return [
        (kw, score + boost_value if kw.strip().casefold() in entity_keys else score)
        for kw, score in keywords
    ]

def advanced_postprocess(keywords, doc_text, nlp, allowed_pos, boost_value):
    """Apply the entity boost first, then drop keywords with disallowed POS tags.

    Args:
        keywords: iterable of ``(keyword, score)`` pairs.
        doc_text: document text used to find named entities.
        nlp: spaCy-style callable shared by both steps.
        allowed_pos: POS tags a keyword's tokens must all belong to.
        boost_value: score increment for keywords matching an entity.

    Returns:
        The boosted and POS-filtered list of ``(keyword, score)`` pairs.
    """
    boosted = entity_boost_func(keywords, doc_text, nlp, boost_value)
    return pos_filter(boosted, allowed_pos, nlp)


In [10]:
def evaluate_results(results_post, gold_keywords):
    """Compute average exact-match and partial-match precision over documents.

    Predictions and gold lists are paired positionally with ``zip``; surplus
    entries on either side are ignored.

    Args:
        results_post: one list of predicted keyword strings per document.
        gold_keywords: one list of gold keyword strings per document.

    Returns:
        ``(avg_exact, avg_partial)``. For each document, *exact* is the share of
        distinct predictions that appear verbatim in the gold list, and *partial*
        is the share of predictions that contain, or are contained in, some gold
        keyword. A document with no predictions scores 0 on both. When there are
        no documents to score, ``(0.0, 0.0)`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    exact_scores = []
    partial_scores = []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact = len(pred_set & gold_set) / len(pred_set) if pred_set else 0
        if pred:
            # Substring match in either direction counts as a partial hit.
            hits = sum(1 for p in pred if any(p in g or g in p for g in gold))
            partial = hits / len(pred)
        else:
            partial = 0
        exact_scores.append(exact)
        partial_scores.append(partial)

    n_docs = len(exact_scores)
    if n_docs == 0:
        # Nothing to average: avoid dividing by zero.
        return 0.0, 0.0
    return sum(exact_scores) / n_docs, sum(partial_scores) / n_docs

In [11]:
# Embedding model identifier passed to KeyBERT(...) in the tuning cells below.
model_name = "all-MiniLM-L12-v2"

# 500N-KPCrowd Dataset

In [7]:
import os

# read docs and gold keywords   

# Locations of the 500N-KPCrowd documents and their gold keyword files.
docs_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/docsutf8")
keys_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/keys")

# Both listings are sorted and then paired by position.
# NOTE(review): this assumes the two directories hold matching files in the same
# sort order; a stray or missing file would silently misalign docs and keys.
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))

docs, gold_keywords = [], []
for doc_file, key_file in zip(doc_files, key_files):
    doc_path = os.path.join(docs_dir, doc_file)
    key_path = os.path.join(keys_dir, key_file)

    with open(doc_path, encoding='utf-8') as doc_handle:
        docs.append(doc_handle.read())

    with open(key_path, encoding='utf-8') as key_handle:
        # One gold keyword per non-blank line, trimmed and lowercased.
        stripped = (line.strip() for line in key_handle)
        gold_keywords.append([entry.lower() for entry in stripped if entry])

In [13]:
# Grid search over (entity_boost, allowed_pos) for MMR-based keyword extraction.
# The best exact and partial precision are tracked independently; a combination
# only replaces the incumbent when it is strictly better, so ties keep the first.
best_exact_mmr  = 0
best_partial_mmr = 0
best_params_exact_mmr = None
best_params_partial_mmr = None

# Build the KeyBERT extractor once: it depends only on `model_name`, not on the
# grid parameters, so re-creating it for every combination just repeats the load.
kw_model = KeyBERT(model_name)

for entity_boost, allowed_pos in itertools.product(entity_boost_values, allowed_pos_sets):
    results_post = []
    for doc in tqdm(docs, desc=f"Postprocess tuning using mmr eb={entity_boost} pos={allowed_pos}"):
        # NOTE(review): `postprocess` is not part of upstream KeyBERT's documented
        # extract_keywords signature -- presumably a patched build is installed; confirm.
        # NOTE(review): upstream documents `nr_candidates` as applying to use_maxsum
        # only, so it is probably ignored in this MMR call -- confirm.
        kws_post = kw_model.extract_keywords(
            doc, top_n=5,
            keyphrase_ngram_range=(1, 3),
            use_mmr=True,
            nr_candidates=5,
            diversity=0.3,
            # `doc` is bound as a default so the callback sees this iteration's text.
            postprocess=lambda kws, doc=doc: advanced_postprocess(kws, doc, nlp, allowed_pos, entity_boost)
        )
        # Scores are dropped here; only the keyword strings are evaluated.
        kws_post = [kw for kw, _ in kws_post]
        results_post.append(kws_post)
    exact_mmr, partial_mmr = evaluate_results(results_post, gold_keywords)
    if exact_mmr > best_exact_mmr:
        best_exact_mmr = exact_mmr
        best_params_exact_mmr = (entity_boost, allowed_pos)
    if partial_mmr > best_partial_mmr:
        best_partial_mmr = partial_mmr
        best_params_partial_mmr = (entity_boost, allowed_pos)

print("Best Postprocess using mmr Exact Precision:", best_exact_mmr, "Params:", best_params_exact_mmr)
print("Best Postprocess using mmr Partial Precision:", best_partial_mmr, "Params:", best_params_partial_mmr)




Postprocess tuning using mmr eb=0.1 pos={'ADJ', 'PROPN', 'NOUN'}: 100%|██████████| 500/500 [07:02<00:00,  1.18it/s]
Postprocess tuning using mmr eb=0.1 pos={'PROPN', 'NOUN'}: 100%|██████████| 500/500 [06:36<00:00,  1.26it/s]
Postprocess tuning using mmr eb=0.1 pos={'NOUN'}: 100%|██████████| 500/500 [05:21<00:00,  1.56it/s]
Postprocess tuning using mmr eb=0.2 pos={'ADJ', 'PROPN', 'NOUN'}: 100%|██████████| 500/500 [05:23<00:00,  1.55it/s]
Postprocess tuning using mmr eb=0.2 pos={'PROPN', 'NOUN'}: 100%|██████████| 500/500 [05:09<00:00,  1.62it/s]
Postprocess tuning using mmr eb=0.2 pos={'NOUN'}: 100%|██████████| 500/500 [05:09<00:00,  1.61it/s]
Postprocess tuning using mmr eb=0.3 pos={'ADJ', 'PROPN', 'NOUN'}: 100%|██████████| 500/500 [05:09<00:00,  1.62it/s]
Postprocess tuning using mmr eb=0.3 pos={'PROPN', 'NOUN'}: 100%|██████████| 500/500 [05:09<00:00,  1.61it/s]
Postprocess tuning using mmr eb=0.3 pos={'NOUN'}: 100%|██████████| 500/500 [06:51<00:00,  1.22it/s]
Postprocess tuning using 

Best Postprocess using mmr Exact Precision: 0.05439999999999997 Params: (0.1, {'ADJ', 'PROPN', 'NOUN'})
Best Postprocess using mmr Partial Precision: 0.8235333333333336 Params: (0.1, {'ADJ', 'PROPN', 'NOUN'})





In [12]:
# Grid search over (entity_boost, allowed_pos) for max-sum keyword extraction.
# The best exact and partial precision are tracked independently; a combination
# only replaces the incumbent when it is strictly better, so ties keep the first.
best_exact_maxsum  = 0
best_partial_maxsum = 0
best_params_exact_maxsum = None
best_params_partial_maxsum = None

# Build the KeyBERT extractor once: it depends only on `model_name`, not on the
# grid parameters, so re-creating it for every combination just repeats the load.
kw_model = KeyBERT(model_name)

for entity_boost, allowed_pos in itertools.product(entity_boost_values, allowed_pos_sets):
    results_post = []
    for doc in tqdm(docs, desc=f"Postprocess tuning using maxsum eb={entity_boost} pos={allowed_pos}"):
        # NOTE(review): `postprocess` is not part of upstream KeyBERT's documented
        # extract_keywords signature -- presumably a patched build is installed; confirm.
        # NOTE(review): nr_candidates equals top_n, which leaves max-sum no room to
        # choose among candidates -- confirm this is intended.
        kws_post = kw_model.extract_keywords(
            doc, top_n=5,
            keyphrase_ngram_range=(1, 2),
            use_maxsum=True,
            nr_candidates=5,
            # `doc` is bound as a default so the callback sees this iteration's text.
            postprocess=lambda kws, doc=doc: advanced_postprocess(kws, doc, nlp, allowed_pos, entity_boost)
        )
        # Scores are dropped here; only the keyword strings are evaluated.
        kws_post = [kw for kw, _ in kws_post]
        results_post.append(kws_post)
    exact_maxsum, partial_maxsum = evaluate_results(results_post, gold_keywords)
    if exact_maxsum > best_exact_maxsum:
        best_exact_maxsum = exact_maxsum
        best_params_exact_maxsum = (entity_boost, allowed_pos)
    if partial_maxsum > best_partial_maxsum:
        best_partial_maxsum = partial_maxsum
        best_params_partial_maxsum = (entity_boost, allowed_pos)

print("Best Postprocess using maxsum Exact Precision:", best_exact_maxsum, "Params:", best_params_exact_maxsum)
print("Best Postprocess using maxsum Partial Precision:", best_partial_maxsum, "Params:", best_params_partial_maxsum)




Postprocess tuning using maxsum eb=0.1 pos={'ADJ', 'PROPN', 'NOUN'}: 100%|██████████| 500/500 [04:04<00:00,  2.05it/s]
Postprocess tuning using maxsum eb=0.1 pos={'PROPN', 'NOUN'}: 100%|██████████| 500/500 [04:14<00:00,  1.96it/s]
Postprocess tuning using maxsum eb=0.1 pos={'NOUN'}: 100%|██████████| 500/500 [04:14<00:00,  1.97it/s]
Postprocess tuning using maxsum eb=0.2 pos={'ADJ', 'PROPN', 'NOUN'}: 100%|██████████| 500/500 [04:59<00:00,  1.67it/s]
Postprocess tuning using maxsum eb=0.2 pos={'PROPN', 'NOUN'}: 100%|██████████| 500/500 [04:38<00:00,  1.79it/s]
Postprocess tuning using maxsum eb=0.2 pos={'NOUN'}: 100%|██████████| 500/500 [03:54<00:00,  2.14it/s]
Postprocess tuning using maxsum eb=0.3 pos={'ADJ', 'PROPN', 'NOUN'}: 100%|██████████| 500/500 [03:59<00:00,  2.09it/s]
Postprocess tuning using maxsum eb=0.3 pos={'PROPN', 'NOUN'}: 100%|██████████| 500/500 [05:28<00:00,  1.52it/s]
Postprocess tuning using maxsum eb=0.3 pos={'NOUN'}: 100%|██████████| 500/500 [03:56<00:00,  2.11it/s

Best Postprocess using maxsum Exact Precision: 0.16536666666666686 Params: (0.1, {'ADJ', 'PROPN', 'NOUN'})
Best Postprocess using maxsum Partial Precision: 0.8241666666666669 Params: (0.1, {'ADJ', 'PROPN', 'NOUN'})





# SemEval 2017 Dataset

In [1]:
import os

# read docs and gold keywords   

# Locations of the SemEval 2017 documents and their gold keyword files.
docs_dir = os.path.join("SemEval2017", "docsutf8")
keys_dir = os.path.join("SemEval2017", "keys")

# Both listings are sorted and then paired by position.
# NOTE(review): this assumes the two directories hold matching files in the same
# sort order; a stray or missing file would silently misalign docs and keys.
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))

docs, gold_keywords = [], []
for doc_file, key_file in zip(doc_files, key_files):
    doc_path = os.path.join(docs_dir, doc_file)
    key_path = os.path.join(keys_dir, key_file)

    with open(doc_path, encoding='utf-8') as doc_handle:
        docs.append(doc_handle.read())

    with open(key_path, encoding='utf-8') as key_handle:
        # One gold keyword per non-blank line, trimmed and lowercased.
        stripped = (line.strip() for line in key_handle)
        gold_keywords.append([entry.lower() for entry in stripped if entry])

In [12]:
# Grid search over (entity_boost, allowed_pos) for MMR-based keyword extraction.
# The best exact and partial precision are tracked independently; a combination
# only replaces the incumbent when it is strictly better, so ties keep the first.
best_exact_mmr  = 0
best_partial_mmr = 0
best_params_exact_mmr = None
best_params_partial_mmr = None

# Build the KeyBERT extractor once: it depends only on `model_name`, not on the
# grid parameters, so re-creating it for every combination just repeats the load.
kw_model = KeyBERT(model_name)

for entity_boost, allowed_pos in itertools.product(entity_boost_values, allowed_pos_sets):
    results_post = []
    for doc in tqdm(docs, desc=f"Postprocess tuning using mmr eb={entity_boost} pos={allowed_pos}"):
        # NOTE(review): `postprocess` is not part of upstream KeyBERT's documented
        # extract_keywords signature -- presumably a patched build is installed; confirm.
        # NOTE(review): upstream documents `nr_candidates` as applying to use_maxsum
        # only, so it is probably ignored in this MMR call -- confirm.
        kws_post = kw_model.extract_keywords(
            doc, top_n=5,
            keyphrase_ngram_range=(1, 3),
            use_mmr=True,
            nr_candidates=5,
            diversity=0.3,
            # `doc` is bound as a default so the callback sees this iteration's text.
            postprocess=lambda kws, doc=doc: advanced_postprocess(kws, doc, nlp, allowed_pos, entity_boost)
        )
        # Scores are dropped here; only the keyword strings are evaluated.
        kws_post = [kw for kw, _ in kws_post]
        results_post.append(kws_post)
    exact_mmr, partial_mmr = evaluate_results(results_post, gold_keywords)
    if exact_mmr > best_exact_mmr:
        best_exact_mmr = exact_mmr
        best_params_exact_mmr = (entity_boost, allowed_pos)
    if partial_mmr > best_partial_mmr:
        best_partial_mmr = partial_mmr
        best_params_partial_mmr = (entity_boost, allowed_pos)

print("Best Postprocess using mmr Exact Precision:", best_exact_mmr, "Params:", best_params_exact_mmr)
print("Best Postprocess using mmr Partial Precision:", best_partial_mmr, "Params:", best_params_partial_mmr)




Postprocess tuning using mmr eb=0.1 pos={'PROPN', 'NOUN', 'ADJ'}: 100%|██████████| 493/493 [03:39<00:00,  2.24it/s]
Postprocess tuning using mmr eb=0.1 pos={'PROPN', 'NOUN'}: 100%|██████████| 493/493 [02:49<00:00,  2.90it/s]
Postprocess tuning using mmr eb=0.1 pos={'NOUN'}: 100%|██████████| 493/493 [02:50<00:00,  2.89it/s]
Postprocess tuning using mmr eb=0.2 pos={'PROPN', 'NOUN', 'ADJ'}: 100%|██████████| 493/493 [03:20<00:00,  2.45it/s]
Postprocess tuning using mmr eb=0.2 pos={'PROPN', 'NOUN'}: 100%|██████████| 493/493 [03:03<00:00,  2.69it/s]
Postprocess tuning using mmr eb=0.2 pos={'NOUN'}: 100%|██████████| 493/493 [02:50<00:00,  2.89it/s]
Postprocess tuning using mmr eb=0.3 pos={'PROPN', 'NOUN', 'ADJ'}: 100%|██████████| 493/493 [02:48<00:00,  2.92it/s]
Postprocess tuning using mmr eb=0.3 pos={'PROPN', 'NOUN'}: 100%|██████████| 493/493 [02:57<00:00,  2.79it/s]
Postprocess tuning using mmr eb=0.3 pos={'NOUN'}: 100%|██████████| 493/493 [02:57<00:00,  2.77it/s]
Postprocess tuning using 

Best Postprocess using mmr Exact Precision: 0.1504394861392834 Params: (0.1, {'PROPN', 'NOUN', 'ADJ'})
Best Postprocess using mmr Partial Precision: 0.6712305611899937 Params: (0.1, {'PROPN', 'NOUN', 'ADJ'})





In [14]:
# Grid search over (entity_boost, allowed_pos) for max-sum keyword extraction.
# The best exact and partial precision are tracked independently; a combination
# only replaces the incumbent when it is strictly better, so ties keep the first.
best_exact_maxsum  = 0
best_partial_maxsum = 0
best_params_exact_maxsum = None
best_params_partial_maxsum = None

# Build the KeyBERT extractor once: it depends only on `model_name`, not on the
# grid parameters, so re-creating it for every combination just repeats the load.
kw_model = KeyBERT(model_name)

for entity_boost, allowed_pos in itertools.product(entity_boost_values, allowed_pos_sets):
    results_post = []
    for doc in tqdm(docs, desc=f"Postprocess tuning using maxsum eb={entity_boost} pos={allowed_pos}"):
        # NOTE(review): `postprocess` is not part of upstream KeyBERT's documented
        # extract_keywords signature -- presumably a patched build is installed; confirm.
        # NOTE(review): nr_candidates equals top_n, which leaves max-sum no room to
        # choose among candidates -- confirm this is intended.
        kws_post = kw_model.extract_keywords(
            doc, top_n=5,
            keyphrase_ngram_range=(1, 2),
            use_maxsum=True,
            nr_candidates=5,
            # `doc` is bound as a default so the callback sees this iteration's text.
            postprocess=lambda kws, doc=doc: advanced_postprocess(kws, doc, nlp, allowed_pos, entity_boost)
        )
        # Scores are dropped here; only the keyword strings are evaluated.
        kws_post = [kw for kw, _ in kws_post]
        results_post.append(kws_post)
    exact_maxsum, partial_maxsum = evaluate_results(results_post, gold_keywords)
    if exact_maxsum > best_exact_maxsum:
        best_exact_maxsum = exact_maxsum
        best_params_exact_maxsum = (entity_boost, allowed_pos)
    if partial_maxsum > best_partial_maxsum:
        best_partial_maxsum = partial_maxsum
        best_params_partial_maxsum = (entity_boost, allowed_pos)

print("Best Postprocess using maxsum Exact Precision:", best_exact_maxsum, "Params:", best_params_exact_maxsum)
print("Best Postprocess using maxsum Partial Precision:", best_partial_maxsum, "Params:", best_params_partial_maxsum)

Postprocess tuning using maxsum eb=0.1 pos={'PROPN', 'NOUN', 'ADJ'}: 100%|██████████| 493/493 [02:05<00:00,  3.94it/s]
Postprocess tuning using maxsum eb=0.1 pos={'PROPN', 'NOUN'}: 100%|██████████| 493/493 [02:02<00:00,  4.02it/s]
Postprocess tuning using maxsum eb=0.1 pos={'NOUN'}: 100%|██████████| 493/493 [02:01<00:00,  4.07it/s]
Postprocess tuning using maxsum eb=0.2 pos={'PROPN', 'NOUN', 'ADJ'}: 100%|██████████| 493/493 [02:01<00:00,  4.05it/s]
Postprocess tuning using maxsum eb=0.2 pos={'PROPN', 'NOUN'}: 100%|██████████| 493/493 [01:58<00:00,  4.15it/s]
Postprocess tuning using maxsum eb=0.2 pos={'NOUN'}: 100%|██████████| 493/493 [02:04<00:00,  3.95it/s]
Postprocess tuning using maxsum eb=0.3 pos={'PROPN', 'NOUN', 'ADJ'}: 100%|██████████| 493/493 [02:05<00:00,  3.92it/s]
Postprocess tuning using maxsum eb=0.3 pos={'PROPN', 'NOUN'}: 100%|██████████| 493/493 [02:05<00:00,  3.92it/s]
Postprocess tuning using maxsum eb=0.3 pos={'NOUN'}: 100%|██████████| 493/493 [02:06<00:00,  3.90it/s

Best Postprocess using maxsum Exact Precision: 0.2392832995267073 Params: (0.1, {'PROPN', 'NOUN', 'ADJ'})
Best Postprocess using maxsum Partial Precision: 0.7591615956727529 Params: (0.1, {'PROPN', 'NOUN', 'ADJ'})



