In [1]:
from keybert import KeyBERT
import os
import zipfile
import urllib.request
from tqdm import tqdm
import spacy

import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load spaCy and define post-processing functions
nlp = spacy.load("en_core_web_sm")

## Post-Process Filters

In [3]:
def pos_filter(keywords):
    filtered = []
    for kw, score in keywords:
        doc = nlp(kw)
        if all(token.pos_ in {"NOUN", "PROPN", "ADJ"} for token in doc):
            filtered.append((kw, score))
    return filtered

def entity_boost(keywords, text):
    doc = nlp(text)
    entities = set(ent.text for ent in doc.ents)
    boosted = []
    for kw, score in keywords:
        if kw in entities:
            boosted.append((kw, score + 0.2))
        else:
            boosted.append((kw, score))
    return boosted


def advanced_postprocess(keywords, doc_text, nlp):
    keywords = entity_boost(keywords, doc_text)
    keywords = pos_filter(keywords)

    return keywords

## Evaluation metric

In [5]:
# Evaluation function: counts both exact and partial matches
def evaluate_results(results_post, gold_keywords):
    precisions, recalls, f1s = [], [], []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact_matches = set([p for p in pred_set if p in gold_set])
        partial_matches = set([
            p for p in pred_set
            if any((p in g or g in p) for g in gold_set) and p not in exact_matches
        ])
        total_matches = len(exact_matches) + len(partial_matches)
        precision = total_matches / len(pred) if pred else 0
        recall = total_matches / len(gold) if gold else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)
    return avg_precision, avg_recall, avg_f1

## keybert

In [6]:
# Initialize KeyBERT model
kw_model = KeyBERT()

## 500N Dataset

In [7]:
# read docs and gold keywords   

docs_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/docsutf8")
keys_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))
docs = []
gold_keywords = [] 
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

### which post-processing function works best?

In [8]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_entityboost = []
results_entityboost = []
results_no_posfilter = []
results_posfilter = []
results_no_advanced = []
results_advanced = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        # Entity boost postprocessing
        kws_no_post_entityboost = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_entityboost = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: entity_boost(kws, doc)
        )
        kws_post_entityboost = [kw for kw, _ in kws_post_entityboost]
        results_no_entityboost.append(kws_no_post_entityboost)
        results_entityboost.append(kws_post_entityboost)

        # POS filter postprocessing
        kws_no_post_posfilter = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_posfilter = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws)
        )
        kws_post_posfilter = [kw for kw, _ in kws_post_posfilter]
        results_no_posfilter.append(kws_no_post_posfilter)
        results_posfilter.append(kws_post_posfilter)

        # Advanced postprocessing
        kws_no_post_advanced = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_advanced = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: advanced_postprocess(kws, doc, nlp)
        )
        kws_post_advanced = [kw for kw, _ in kws_post_advanced]
        results_no_advanced.append(kws_no_post_advanced)
        results_advanced.append(kws_post_advanced)

    precision_entityboost, recall_entityboost, f1_entityboost = evaluate_results(results_entityboost, gold_keywords)
    precision_no_entityboost, recall_no_entityboost, f1_no_entityboost = evaluate_results(results_no_entityboost, gold_keywords)
    precision_posfilter, recall_posfilter, f1_posfilter = evaluate_results(results_posfilter, gold_keywords)
    precision_no_posfilter, recall_no_posfilter, f1_no_posfilter = evaluate_results(results_no_posfilter, gold_keywords)
    precision_advanced, recall_advanced, f1_advanced = evaluate_results(results_advanced, gold_keywords)
    precision_no_advanced, recall_no_advanced, f1_no_advanced = evaluate_results(results_no_advanced, gold_keywords)


    print("No Post-processing Entity Boost: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_entityboost, recall_no_entityboost, f1_no_entityboost))
    print("With Post-processing Entity Boost: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
    precision_entityboost, recall_entityboost, f1_entityboost))


    precision_no_posfilter, recall_no_posfilter, f1_no_posfilter = evaluate_results(results_no_posfilter, gold_keywords)
    precision_posfilter, recall_posfilter, f1_posfilter = evaluate_results(results_posfilter, gold_keywords)

    print("No Post-processing Pos Filter: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_posfilter, recall_no_posfilter, f1_no_posfilter))
    print("With Post-processing Pos Filter: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_posfilter, recall_posfilter, f1_posfilter))
    
    precision_no_advanced, recall_no_advanced, f1_no_advanced = evaluate_results(results_no_advanced, gold_keywords)
    precision_advanced, recall_advanced, f1_advanced = evaluate_results(results_advanced, gold_keywords)
   

    print("No Post-processing Advanced: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_advanced, recall_no_advanced, f1_no_advanced))
    print("With Post-processing Advanced: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_advanced, recall_advanced, f1_advanced))



  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [07:46<00:00,  1.07it/s]

No Post-processing Entity Boost: Precision 0.750, Recall 0.098, F1 0.168
With Post-processing Entity Boost: Precision 0.750, Recall 0.098, F1 0.168
No Post-processing Pos Filter: Precision 0.750, Recall 0.098, F1 0.168
With Post-processing Pos Filter: Precision 0.759, Recall 0.086, F1 0.149
No Post-processing Advanced: Precision 0.750, Recall 0.098, F1 0.168
With Post-processing Advanced: Precision 0.759, Recall 0.086, F1 0.149





### Introducing MMR and Maxsum

In [9]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_post_mmr = []
results_post_mmr = []
results_no_post_maxsum = []
results_post_maxsum = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        kws_no_post_mmr = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_mmr=True)]
        kws_post_mmr = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_mmr=True
        )
        kws_post_mmr = [kw for kw, _ in kws_post_mmr]
        results_no_post_mmr.append(kws_no_post_mmr)
        results_post_mmr.append(kws_post_mmr)
        kws_no_post_maxsum = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_maxsum=True)]
        kws_post_maxsum = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_maxsum=True
        )
        kws_post_maxsum = [kw for kw, _ in kws_post_maxsum]
        results_no_post_maxsum.append(kws_no_post_maxsum)
        results_post_maxsum.append(kws_post_maxsum)

    # Evaluate and print results
    precision_no_post_mmr, recall_no_post_mmr, f1_no_post_mmr = evaluate_results(results_no_post_mmr, gold_keywords)
    precision_post_mmr, recall_post_mmr, f1_post_mmr = evaluate_results(results_post_mmr, gold_keywords)

    print("No Post-processing_MMR: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_post_mmr, recall_no_post_mmr, f1_no_post_mmr))
    print("With Post-processing_MMR: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_post_mmr, recall_post_mmr, f1_post_mmr))

    
    precision_no_post_maxsum, recall_no_post_maxsum, f1_no_post_maxsum = evaluate_results(results_no_post_maxsum, gold_keywords)
    precision_post_maxsum, recall_post_maxsum, f1_post_maxsum = evaluate_results(results_post_maxsum, gold_keywords)

    print("No Post-processing_MaxSum: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_post_maxsum, recall_no_post_maxsum, f1_no_post_maxsum))
    print("With Post-processing_MaxSum: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_post_maxsum, recall_post_maxsum, f1_post_maxsum))

100%|██████████| 500/500 [05:53<00:00,  1.42it/s]

No Post-processing_MMR: Precision 0.712, Recall 0.093, F1 0.159
With Post-processing_MMR: Precision 0.733, Recall 0.078, F1 0.136
No Post-processing_MaxSum: Precision 0.629, Recall 0.084, F1 0.143
With Post-processing_MaxSum: Precision 0.640, Recall 0.067, F1 0.118





## SemEval Dataset

In [10]:
import os

# read docs and gold keywords   

docs_dir = os.path.join("SemEval2017", "docsutf8")
keys_dir = os.path.join("SemEval2017", "keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))
docs = []
gold_keywords = [] 
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

### which post-processing function works best?

In [11]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_entityboost = []
results_entityboost = []
results_no_posfilter = []
results_posfilter = []
results_no_advanced = []
results_advanced = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        # Entity boost postprocessing
        kws_no_post_entityboost = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_entityboost = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: entity_boost(kws, doc)
        )
        kws_post_entityboost = [kw for kw, _ in kws_post_entityboost]
        results_no_entityboost.append(kws_no_post_entityboost)
        results_entityboost.append(kws_post_entityboost)

        # POS filter postprocessing
        kws_no_post_posfilter = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_posfilter = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws)
        )
        kws_post_posfilter = [kw for kw, _ in kws_post_posfilter]
        results_no_posfilter.append(kws_no_post_posfilter)
        results_posfilter.append(kws_post_posfilter)

        # Advanced postprocessing
        kws_no_post_advanced = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N)]
        kws_post_advanced = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: advanced_postprocess(kws, doc, nlp)
        )
        kws_post_advanced = [kw for kw, _ in kws_post_advanced]
        results_no_advanced.append(kws_no_post_advanced)
        results_advanced.append(kws_post_advanced)

    precision_entityboost, recall_entityboost, f1_entityboost = evaluate_results(results_entityboost, gold_keywords)
    precision_no_entityboost, recall_no_entityboost, f1_no_entityboost = evaluate_results(results_no_entityboost, gold_keywords)
    precision_posfilter, recall_posfilter, f1_posfilter = evaluate_results(results_posfilter, gold_keywords)
    precision_no_posfilter, recall_no_posfilter, f1_no_posfilter = evaluate_results(results_no_posfilter, gold_keywords)
    precision_advanced, recall_advanced, f1_advanced = evaluate_results(results_advanced, gold_keywords)
    precision_no_advanced, recall_no_advanced, f1_no_advanced = evaluate_results(results_no_advanced, gold_keywords)


    print("No Post-processing Entity Boost: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_entityboost, recall_no_entityboost, f1_no_entityboost))
    print("With Post-processing Entity Boost: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
    precision_entityboost, recall_entityboost, f1_entityboost))


    precision_no_posfilter, recall_no_posfilter, f1_no_posfilter = evaluate_results(results_no_posfilter, gold_keywords)
    precision_posfilter, recall_posfilter, f1_posfilter = evaluate_results(results_posfilter, gold_keywords)

    print("No Post-processing Pos Filter: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_posfilter, recall_no_posfilter, f1_no_posfilter))
    print("With Post-processing Pos Filter: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_posfilter, recall_posfilter, f1_posfilter))
    
    precision_no_advanced, recall_no_advanced, f1_no_advanced = evaluate_results(results_no_advanced, gold_keywords)
    precision_advanced, recall_advanced, f1_advanced = evaluate_results(results_advanced, gold_keywords)
   

    print("No Post-processing Advanced: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_advanced, recall_no_advanced, f1_no_advanced))
    print("With Post-processing Advanced: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_advanced, recall_advanced, f1_advanced))



100%|██████████| 493/493 [05:10<00:00,  1.59it/s]

No Post-processing Entity Boost: Precision 0.865, Recall 0.285, F1 0.415
With Post-processing Entity Boost: Precision 0.865, Recall 0.285, F1 0.415
No Post-processing Pos Filter: Precision 0.865, Recall 0.285, F1 0.415
With Post-processing Pos Filter: Precision 0.872, Recall 0.246, F1 0.368
No Post-processing Advanced: Precision 0.865, Recall 0.285, F1 0.415
With Post-processing Advanced: Precision 0.872, Recall 0.246, F1 0.368





### Introducing MMR and Maxsum

In [12]:
# Extract keywords with and without post-processing for the full dataset
N = 5  # Number of keywords to extract
results_no_post_mmr = []
results_post_mmr = []
results_no_post_maxsum = []
results_post_maxsum = []

if __name__ == "__main__":
    for doc in tqdm(docs):
        kws_no_post_mmr = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_mmr=True)]
        kws_post_mmr = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_mmr=True
        )
        kws_post_mmr = [kw for kw, _ in kws_post_mmr]
        results_no_post_mmr.append(kws_no_post_mmr)
        results_post_mmr.append(kws_post_mmr)
        kws_no_post_maxsum = [kw for kw, _ in kw_model.extract_keywords(doc, top_n=N, use_maxsum=True)]
        kws_post_maxsum = kw_model.extract_keywords(
            doc, top_n=N,
            postprocess=lambda kws: pos_filter(kws),
            use_maxsum=True
        )
        kws_post_maxsum = [kw for kw, _ in kws_post_maxsum]
        results_no_post_maxsum.append(kws_no_post_maxsum)
        results_post_maxsum.append(kws_post_maxsum)

    # Evaluate and print results
    precision_no_post_mmr, recall_no_post_mmr, f1_no_post_mmr = evaluate_results(results_no_post_mmr, gold_keywords)
    precision_post_mmr, recall_post_mmr, f1_post_mmr = evaluate_results(results_post_mmr, gold_keywords)

    print("No Post-processing_MMR: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_post_mmr, recall_no_post_mmr, f1_no_post_mmr))
    print("With Post-processing_MMR: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_post_mmr, recall_post_mmr, f1_post_mmr))

    
    precision_no_post_maxsum, recall_no_post_maxsum, f1_no_post_maxsum = evaluate_results(results_no_post_maxsum, gold_keywords)
    precision_post_maxsum, recall_post_maxsum, f1_post_maxsum = evaluate_results(results_post_maxsum, gold_keywords)

    print("No Post-processing_MaxSum: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_no_post_maxsum, recall_no_post_maxsum, f1_no_post_maxsum))
    print("With Post-processing_MaxSum: Precision {:.3f}, Recall {:.3f}, F1 {:.3f}".format(
        precision_post_maxsum, recall_post_maxsum, f1_post_maxsum))

100%|██████████| 493/493 [04:52<00:00,  1.69it/s]

No Post-processing_MMR: Precision 0.830, Recall 0.273, F1 0.398
With Post-processing_MMR: Precision 0.841, Recall 0.233, F1 0.350
No Post-processing_MaxSum: Precision 0.754, Recall 0.245, F1 0.357
With Post-processing_MaxSum: Precision 0.766, Recall 0.200, F1 0.304



