In [13]:
from keybert import KeyBERT
import os
import zipfile
import urllib.request
from tqdm import tqdm
import spacy

import os

In [14]:
# Load spaCy and define post-processing functions
nlp = spacy.load("en_core_web_sm")

## Post-Process Filters

In [15]:
def pos_filter(keywords):
    filtered = []
    for kw, score in keywords:
        doc = nlp(kw)
        if all(token.pos_ in {"NOUN", "PROPN", "ADJ"} for token in doc):
            filtered.append((kw, score))
    return filtered

def entity_boost(keywords, text):
    doc = nlp(text)
    entities = set(ent.text for ent in doc.ents)
    boosted = []
    for kw, score in keywords:
        if kw in entities:
            boosted.append((kw, score + 0.2))
        else:
            boosted.append((kw, score))
    return boosted


def advanced_postprocess(keywords, doc_text, nlp):
    keywords = entity_boost(keywords, doc_text)
    keywords = pos_filter(keywords)

    return keywords

## Evaluation metrics

In [20]:
# Evaluation function: counts both exact and partial matches
def evaluate_results(results_post, gold_keywords):
    # Exact match
    precisions_exact, recalls_exact, f1s_exact = [], [], []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact_matches = set([p for p in pred_set if p in gold_set])
        precision = len(exact_matches) / len(pred) if pred else 0
        recall = len(exact_matches) / len(gold) if gold else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        precisions_exact.append(precision)
        recalls_exact.append(recall)
        f1s_exact.append(f1)
    avg_precision_exact = sum(precisions_exact) / len(precisions_exact)
    avg_recall_exact = sum(recalls_exact) / len(recalls_exact)
    avg_f1_exact = sum(f1s_exact) / len(f1s_exact)


    # Exact and Partial match
    precisions, recalls, f1s = [], [], []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact_matches = set([p for p in pred_set if p in gold_set])
        partial_matches = set([
            p for p in pred_set
            if any((p in g or g in p) for g in gold_set) and p not in exact_matches
        ])
        total_matches = len(exact_matches) + len(partial_matches)
        precision = total_matches / len(pred) if pred else 0
        recall = total_matches / len(gold) if gold else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    avg_precision_both = sum(precisions) / len(precisions)
    avg_recall_both = sum(recalls) / len(recalls)
    avg_f1_both = sum(f1s) / len(f1s)

    return avg_precision_exact, avg_recall_exact, avg_f1_exact, avg_precision_both, avg_recall_both, avg_f1_both

## keybert

In [21]:
# Initialize KeyBERT model
kw_model = KeyBERT()

## 500N Dataset

In [22]:
# read docs and gold keywords   

docs_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/docsutf8")
keys_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))
docs = []
gold_keywords = [] 
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

### which post-processing function works best?

In [23]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_post = []
results_entityboost = []
results_posfilter = []
results_advanced = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        # Entity boost postprocessing
        kws_no_post = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_entityboost = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: entity_boost(kws, doc)
        )
        kws_post_entityboost = [kw for kw, _ in kws_post_entityboost]
        results_no_post.append(kws_no_post)
        results_entityboost.append(kws_post_entityboost)

        # POS filter postprocessing
        kws_post_posfilter = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws)
        )
        kws_post_posfilter = [kw for kw, _ in kws_post_posfilter]
        results_posfilter.append(kws_post_posfilter)

        # Advanced postprocessing
        kws_post_advanced = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: advanced_postprocess(kws, doc, nlp)
        )
        kws_post_advanced = [kw for kw, _ in kws_post_advanced]
        results_advanced.append(kws_post_advanced)

        
    exact_precision_no_post, exact_recall_no_post, exact_f1_no_post, both_precision_no_post, both_recall_no_post, both_f1_no_post = evaluate_results(results_no_post, gold_keywords)
    exact_precision_entityboost, exact_recall_entityboost, exact_f1_entityboost, both_precision_entityboost, both_recall_entityboost, both_f1_entityboost = evaluate_results(results_entityboost, gold_keywords)
    exact_precision_posfilter, exact_recall_posfilter, exact_f1_posfilter, both_precision_posfilter, both_recall_posfilter, both_f1_posfilter = evaluate_results(results_posfilter, gold_keywords)
    exact_precision_advanced, exact_recall_advanced, exact_f1_advanced, both_precision_advanced, both_recall_advanced, both_f1_advanced = evaluate_results(results_advanced, gold_keywords)


    print("No Post-processing: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_no_post, exact_recall_no_post, exact_f1_no_post, both_precision_no_post, both_recall_no_post, both_f1_no_post))
    
    print("Entity Boost: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_entityboost, exact_recall_entityboost, exact_f1_entityboost, both_precision_entityboost, both_recall_entityboost, both_f1_entityboost))
    
    print("POS Filter: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_posfilter, exact_recall_posfilter, exact_f1_posfilter, both_precision_posfilter, both_recall_posfilter, both_f1_posfilter))
    
    print("Advanced: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_advanced, exact_recall_advanced, exact_f1_advanced, both_precision_advanced, both_recall_advanced, both_f1_advanced))

100%|██████████| 500/500 [05:22<00:00,  1.55it/s]

No Post-processing: Exact Precision 0.449, Exact Recall 0.055, Exact F1 0.096, Both Precision 0.750, Both Recall 0.098, Both F1 0.168
Entity Boost: Exact Precision 0.449, Exact Recall 0.055, Exact F1 0.096, Both Precision 0.750, Both Recall 0.098, Both F1 0.168
POS Filter: Exact Precision 0.454, Exact Recall 0.048, Exact F1 0.084, Both Precision 0.759, Both Recall 0.086, Both F1 0.149
Advanced: Exact Precision 0.454, Exact Recall 0.048, Exact F1 0.084, Both Precision 0.759, Both Recall 0.086, Both F1 0.149





### Introducing MMR and Maxsum

In [25]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_post_mmr = []
results_post_mmr = []
results_no_post_maxsum = []
results_post_maxsum = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        kws_no_post_mmr = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_mmr=True)]
        kws_post_mmr = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_mmr=True
        )
        kws_post_mmr = [kw for kw, _ in kws_post_mmr]
        results_no_post_mmr.append(kws_no_post_mmr)
        results_post_mmr.append(kws_post_mmr)
        kws_no_post_maxsum = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_maxsum=True)]
        kws_post_maxsum = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_maxsum=True
        )
        kws_post_maxsum = [kw for kw, _ in kws_post_maxsum]
        results_no_post_maxsum.append(kws_no_post_maxsum)
        results_post_maxsum.append(kws_post_maxsum)

    # Evaluate and print results
    exact_precision_no_post_mmr, exact_recall_no_post_mmr, exact_f1_no_post_mmr, both_precision_no_post_mmr, both_recall_no_post_mmr, both_f1_no_post_mmr = evaluate_results(results_no_post_mmr, gold_keywords)
    exact_precision_post_mmr, exact_recall_post_mmr, exact_f1_post_mmr, both_precision_post_mmr, both_recall_post_mmr, both_f1_post_mmr = evaluate_results(results_post_mmr, gold_keywords)

    print("No Post-processing_MMR: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_no_post_mmr, exact_recall_no_post_mmr, exact_f1_no_post_mmr, both_precision_no_post_mmr, both_recall_no_post_mmr, both_f1_no_post_mmr))
    print("With Post-processing_MMR: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_post_mmr, exact_recall_post_mmr, exact_f1_post_mmr, both_precision_post_mmr, both_recall_post_mmr, both_f1_post_mmr))

    
    exact_precision_no_post_maxsum, exact_recall_no_post_maxsum, exact_f1_no_post_maxsum, both_precision_no_post_maxsum, both_recall_no_post_maxsum, both_f1_no_post_maxsum = evaluate_results(results_no_post_maxsum, gold_keywords)
    exact_precision_post_maxsum, exact_recall_post_maxsum, exact_f1_post_maxsum, both_precision_post_maxsum, both_recall_post_maxsum, both_f1_post_maxsum = evaluate_results(results_post_maxsum, gold_keywords)

    print("No Post-processing_MaxSum: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both  F1 {:.3f}".format(
        exact_precision_no_post_maxsum, exact_recall_no_post_maxsum, exact_f1_no_post_maxsum, both_precision_no_post_maxsum, both_recall_no_post_maxsum, both_f1_no_post_maxsum))
    print("With Post-processing_MaxSum: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_post_maxsum, exact_recall_post_maxsum, exact_f1_post_maxsum, both_precision_post_maxsum, both_recall_post_maxsum, both_f1_post_maxsum))

100%|██████████| 500/500 [04:48<00:00,  1.73it/s]

No Post-processing_MMR: Exact Precision 0.441, Exact Recall 0.053, Exact F1 0.092, Both Precision 0.712, Both Recall 0.093, Both F1 0.159
With Post-processing_MMR: Exact Precision 0.453, Exact Recall 0.044, Exact F1 0.078, Both Precision 0.733, Both Recall 0.078, Both F1 0.136
No Post-processing_MaxSum: Exact Precision 0.346, Exact Recall 0.042, Exact F1 0.073, Both Precision 0.629, Both Recall 0.084, Both  F1 0.143
With Post-processing_MaxSum: Exact Precision 0.353, Exact Recall 0.034, Exact F1 0.061, Both Precision 0.640, Both Recall 0.067, Both F1 0.118





## SemEval Dataset

In [26]:
import os

# read docs and gold keywords   

docs_dir = os.path.join("SemEval2017", "docsutf8")
keys_dir = os.path.join("SemEval2017", "keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))
docs = []
gold_keywords = [] 
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

### which post-processing function works best?

In [27]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_post = []
results_entityboost = []
results_posfilter = []
results_advanced = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        # Entity boost postprocessing
        kws_no_post = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_entityboost = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: entity_boost(kws, doc)
        )
        kws_post_entityboost = [kw for kw, _ in kws_post_entityboost]
        results_no_post.append(kws_no_post)
        results_entityboost.append(kws_post_entityboost)

        # POS filter postprocessing
        kws_post_posfilter = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws)
        )
        kws_post_posfilter = [kw for kw, _ in kws_post_posfilter]
        results_posfilter.append(kws_post_posfilter)

        # Advanced postprocessing
        kws_post_advanced = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: advanced_postprocess(kws, doc, nlp)
        )
        kws_post_advanced = [kw for kw, _ in kws_post_advanced]
        results_advanced.append(kws_post_advanced)

        
    exact_precision_no_post, exact_recall_no_post, exact_f1_no_post, both_precision_no_post, both_recall_no_post, both_f1_no_post = evaluate_results(results_no_post, gold_keywords)
    exact_precision_entityboost, exact_recall_entityboost, exact_f1_entityboost, both_precision_entityboost, both_recall_entityboost, both_f1_entityboost = evaluate_results(results_entityboost, gold_keywords)
    exact_precision_posfilter, exact_recall_posfilter, exact_f1_posfilter, both_precision_posfilter, both_recall_posfilter, both_f1_posfilter = evaluate_results(results_posfilter, gold_keywords)
    exact_precision_advanced, exact_recall_advanced, exact_f1_advanced, both_precision_advanced, both_recall_advanced, both_f1_advanced = evaluate_results(results_advanced, gold_keywords)


    print("No Post-processing: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_no_post, exact_recall_no_post, exact_f1_no_post, both_precision_no_post, both_recall_no_post, both_f1_no_post))
    
    print("Entity Boost: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_entityboost, exact_recall_entityboost, exact_f1_entityboost, both_precision_entityboost, both_recall_entityboost, both_f1_entityboost))
    
    print("POS Filter: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_posfilter, exact_recall_posfilter, exact_f1_posfilter, both_precision_posfilter, both_recall_posfilter, both_f1_posfilter))
    
    print("Advanced: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_advanced, exact_recall_advanced, exact_f1_advanced, both_precision_advanced, both_recall_advanced, both_f1_advanced))

100%|██████████| 493/493 [03:05<00:00,  2.66it/s]

No Post-processing: Exact Precision 0.200, Exact Recall 0.059, Exact F1 0.088, Both Precision 0.865, Both Recall 0.285, Both F1 0.415
Entity Boost: Exact Precision 0.200, Exact Recall 0.059, Exact F1 0.088, Both Precision 0.865, Both Recall 0.285, Both F1 0.415
POS Filter: Exact Precision 0.202, Exact Recall 0.049, Exact F1 0.075, Both Precision 0.872, Both Recall 0.246, Both F1 0.368
Advanced: Exact Precision 0.202, Exact Recall 0.049, Exact F1 0.075, Both Precision 0.872, Both Recall 0.246, Both F1 0.368





### Introducing MMR and Maxsum

In [28]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_post_mmr = []
results_post_mmr = []
results_no_post_maxsum = []
results_post_maxsum = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        kws_no_post_mmr = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_mmr=True)]
        kws_post_mmr = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_mmr=True
        )
        kws_post_mmr = [kw for kw, _ in kws_post_mmr]
        results_no_post_mmr.append(kws_no_post_mmr)
        results_post_mmr.append(kws_post_mmr)
        kws_no_post_maxsum = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_maxsum=True)]
        kws_post_maxsum = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_maxsum=True
        )
        kws_post_maxsum = [kw for kw, _ in kws_post_maxsum]
        results_no_post_maxsum.append(kws_no_post_maxsum)
        results_post_maxsum.append(kws_post_maxsum)

    # Evaluate and print results
    exact_precision_no_post_mmr, exact_recall_no_post_mmr, exact_f1_no_post_mmr, both_precision_no_post_mmr, both_recall_no_post_mmr, both_f1_no_post_mmr = evaluate_results(results_no_post_mmr, gold_keywords)
    exact_precision_post_mmr, exact_recall_post_mmr, exact_f1_post_mmr, both_precision_post_mmr, both_recall_post_mmr, both_f1_post_mmr = evaluate_results(results_post_mmr, gold_keywords)

    print("No Post-processing_MMR: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_no_post_mmr, exact_recall_no_post_mmr, exact_f1_no_post_mmr, both_precision_no_post_mmr, both_recall_no_post_mmr, both_f1_no_post_mmr))
    print("With Post-processing_MMR: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_post_mmr, exact_recall_post_mmr, exact_f1_post_mmr, both_precision_post_mmr, both_recall_post_mmr, both_f1_post_mmr))

    
    exact_precision_no_post_maxsum, exact_recall_no_post_maxsum, exact_f1_no_post_maxsum, both_precision_no_post_maxsum, both_recall_no_post_maxsum, both_f1_no_post_maxsum = evaluate_results(results_no_post_maxsum, gold_keywords)
    exact_precision_post_maxsum, exact_recall_post_maxsum, exact_f1_post_maxsum, both_precision_post_maxsum, both_recall_post_maxsum, both_f1_post_maxsum = evaluate_results(results_post_maxsum, gold_keywords)

    print("No Post-processing_MaxSum: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both  F1 {:.3f}".format(
        exact_precision_no_post_maxsum, exact_recall_no_post_maxsum, exact_f1_no_post_maxsum, both_precision_no_post_maxsum, both_recall_no_post_maxsum, both_f1_no_post_maxsum))
    print("With Post-processing_MaxSum: Exact Precision {:.3f}, Exact Recall {:.3f}, Exact F1 {:.3f}, Both Precision {:.3f}, Both Recall {:.3f}, Both F1 {:.3f}".format(
        exact_precision_post_maxsum, exact_recall_post_maxsum, exact_f1_post_maxsum, both_precision_post_maxsum, both_recall_post_maxsum, both_f1_post_maxsum))

100%|██████████| 493/493 [03:33<00:00,  2.31it/s]

No Post-processing_MMR: Exact Precision 0.183, Exact Recall 0.054, Exact F1 0.081, Both Precision 0.830, Both Recall 0.273, Both F1 0.398
With Post-processing_MMR: Exact Precision 0.189, Exact Recall 0.045, Exact F1 0.070, Both Precision 0.841, Both Recall 0.233, Both F1 0.350
No Post-processing_MaxSum: Exact Precision 0.154, Exact Recall 0.045, Exact F1 0.068, Both Precision 0.754, Both Recall 0.245, Both  F1 0.357
With Post-processing_MaxSum: Exact Precision 0.151, Exact Recall 0.035, Exact F1 0.055, Both Precision 0.766, Both Recall 0.200, Both F1 0.304



