In [1]:
import itertools
from test_extension import advanced_postprocess
from keybert import KeyBERT
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyperparameter search space for the KeyBERT tuning runs
ngram_ranges = [
    (1, 2),
    (1, 3),
    (2, 2),
    (2, 3),
    (3, 3),
]  # values tried for keyphrase_ngram_range
nr_candidates_list = list(range(5, 9))  # values tried for nr_candidates: 5, 6, 7, 8
diversity_params = [0.2, 0.3, 0.4, 0.5, 0.6]  # values tried for diversity (MMR runs)
model_name = "all-MiniLM-L12-v2"  # model identifier handed to KeyBERT(...)
N = 5  # top_n: number of keywords to extract per document

In [3]:
# Load the spaCy "en_core_web_sm" pipeline once; the resulting `nlp` object is
# passed to advanced_postprocess for every document inside run_experiment.
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
def evaluate_results(results_post, gold_keywords):
    """Compute corpus-average exact and partial-match precision.

    Args:
        results_post: one list of predicted keyword strings per document.
        gold_keywords: one list of gold keyword strings per document, in the
            same document order as ``results_post``. The two sequences are
            paired with ``zip``, so any surplus entries in the longer one are
            ignored.

    Returns:
        A ``(mean_exact, mean_partial)`` tuple.

        * exact precision of a document: number of distinct predictions that
          appear verbatim in the gold list, divided by the number of distinct
          predictions.
        * partial precision of a document: share of predictions that are a
          substring of some gold keyword, or contain some gold keyword as a
          substring.

        A document without predictions scores 0 on both metrics. When there
        is no document pair to score at all, ``(0.0, 0.0)`` is returned
        instead of dividing by zero.
    """
    exact_scores = []
    partial_scores = []
    for pred, gold in zip(results_post, gold_keywords):
        pred_set = set(pred)
        gold_set = set(gold)
        exact = len(pred_set & gold_set) / len(pred_set) if pred_set else 0
        # Substring test in both directions: "learning" matches "machine learning"
        # and vice versa.
        partial = sum(1 for p in pred if any(p in g or g in p for g in gold)) / len(pred) if pred else 0
        exact_scores.append(exact)
        partial_scores.append(partial)

    # Nothing was scored (empty corpus): avoid ZeroDivisionError on the averages.
    if not exact_scores:
        return 0.0, 0.0
    return sum(exact_scores) / len(exact_scores), sum(partial_scores) / len(partial_scores)

In [6]:
def run_experiment(use_mmr=False, use_maxsum=False):
    """Grid-search KeyBERT extraction settings and report the best scores.

    Reads the module-level ``docs``, ``gold_keywords``, ``ngram_ranges``,
    ``nr_candidates_list``, ``diversity_params``, ``model_name``, ``N`` and
    ``nlp``. Every combination of n-gram range and candidate count is tried;
    with ``use_mmr`` each combination is additionally crossed with every value
    in ``diversity_params``. All documents are extracted for each setting and
    scored with ``evaluate_results``.

    Args:
        use_mmr: tune extraction with ``use_mmr=True``. Takes precedence over
            ``use_maxsum`` when both are set.
        use_maxsum: tune extraction with ``use_maxsum=True``.

    Returns:
        ``(best_exact, best_params_exact, best_partial, best_params_partial)``.
        The parameter tuples are ``(ngram_range, nr_candidates, diversity)``
        for MMR and ``(ngram_range, nr_candidates)`` for MaxSum. A setting only
        replaces the current best when it is strictly better, so the first
        setting wins ties. When neither flag is set, or no setting scores
        above 0, the corresponding params stay ``None`` and the score stays 0.
    """
    best_exact = 0
    best_partial = 0
    best_params_exact = None
    best_params_partial = None

    # Each variant: (extra extract_keywords kwargs, extra entries for the
    # reported params tuple, suffix for the progress-bar description).
    if use_mmr:
        strategy = "MMR"
        variants = [
            ({"use_mmr": True, "diversity": diversity}, (diversity,), f" div={diversity}")
            for diversity in diversity_params
        ]
    elif use_maxsum:
        strategy = "MaxSum"
        variants = [({"use_maxsum": True}, (), "")]
    else:
        # No strategy selected: nothing to evaluate.
        return best_exact, best_params_exact, best_partial, best_params_partial

    # The model does not depend on any tuned parameter, so build it once
    # instead of once per grid point.
    kw_model = KeyBERT(model_name)

    # NOTE(review): upstream KeyBERT documents nr_candidates as applying to
    # use_maxsum only - confirm whether sweeping it under MMR changes results.
    for ngram_range, nr_candidates in itertools.product(ngram_ranges, nr_candidates_list):
        for strategy_kwargs, extra_params, desc_suffix in variants:
            results_post = []
            progress_desc = f"{strategy} ngram={ngram_range} nc={nr_candidates}{desc_suffix}"
            for doc in tqdm(docs, desc=progress_desc):
                # NOTE(review): `postprocess` is presumably a hook added by
                # test_extension rather than stock KeyBERT - confirm.
                kws_post = kw_model.extract_keywords(
                    doc, top_n=N,
                    keyphrase_ngram_range=ngram_range,
                    nr_candidates=nr_candidates,
                    postprocess=lambda kws: advanced_postprocess(kws, doc, nlp),
                    **strategy_kwargs,
                )
                # Keep only the keyword strings, dropping the scores.
                results_post.append([kw for kw, _ in kws_post])

            # Evaluate this setting against the gold keywords.
            exact, partial = evaluate_results(results_post, gold_keywords)
            params = (ngram_range, nr_candidates) + extra_params
            if exact > best_exact:
                best_exact = exact
                best_params_exact = params
            if partial > best_partial:
                best_partial = partial
                best_params_partial = params
    return best_exact, best_params_exact, best_partial, best_params_partial

## 500N dataset

In [4]:
import os

# Read the 500N-KPCrowd documents and their gold keywords.
docs_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/docsutf8")
keys_dir = os.path.join("500N-KPCrowd-v1.1", "500N-KPCrowd-v1.1/keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))

# Documents and key files are paired by position, so make sure both listings
# describe the same items (same filename stems, same order). Otherwise a stray
# or missing file would silently shift every gold label onto the wrong document.
doc_stems = [os.path.splitext(name)[0] for name in doc_files]
key_stems = [os.path.splitext(name)[0] for name in key_files]
if doc_stems != key_stems:
    unmatched = sorted(set(doc_stems) ^ set(key_stems))
    raise ValueError(
        f"Document/key files do not line up in {docs_dir!r} and {keys_dir!r}; "
        f"unmatched stems (first 10): {unmatched[:10]}"
    )

docs = []
gold_keywords = []
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        # One gold keyword per non-blank line, lower-cased.
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

In [11]:
# Run MMR tuning: sweeps every (ngram_range, nr_candidates, diversity) combination
(
    best_exact_mmr,
    best_params_exact_mmr,
    best_partial_mmr,
    best_params_partial_mmr,
) = run_experiment(use_mmr=True)

MMR ngram=(1, 2) nc=5 div=0.2: 100%|██████████| 500/500 [04:18<00:00,  1.93it/s]
MMR ngram=(1, 2) nc=5 div=0.3: 100%|██████████| 500/500 [04:14<00:00,  1.97it/s]
MMR ngram=(1, 2) nc=5 div=0.4: 100%|██████████| 500/500 [03:57<00:00,  2.11it/s]
MMR ngram=(1, 2) nc=5 div=0.5: 100%|██████████| 500/500 [03:59<00:00,  2.09it/s]
MMR ngram=(1, 2) nc=5 div=0.6: 100%|██████████| 500/500 [04:10<00:00,  2.00it/s]
MMR ngram=(1, 2) nc=6 div=0.2: 100%|██████████| 500/500 [04:10<00:00,  1.99it/s]
MMR ngram=(1, 2) nc=6 div=0.3: 100%|██████████| 500/500 [04:18<00:00,  1.94it/s]
MMR ngram=(1, 2) nc=6 div=0.4: 100%|██████████| 500/500 [04:21<00:00,  1.91it/s]
MMR ngram=(1, 2) nc=6 div=0.5: 100%|██████████| 500/500 [03:58<00:00,  2.10it/s]
MMR ngram=(1, 2) nc=6 div=0.6: 100%|██████████| 500/500 [04:02<00:00,  2.06it/s]
MMR ngram=(1, 2) nc=7 div=0.2: 100%|██████████| 500/500 [04:01<00:00,  2.07it/s]
MMR ngram=(1, 2) nc=7 div=0.3: 100%|██████████| 500/500 [04:08<00:00,  2.01it/s]
MMR ngram=(1, 2) nc=7 div=0.

In [12]:
# Report the best exact / partial precision found by the MMR sweep and the
# parameter tuples that produced them.
print(
    f"best_exact_mmr, best_params_exact_mmr,: {best_exact_mmr}, {best_params_exact_mmr}, "
    f"best_partial_mmr, best_params_partial_mmr: {best_partial_mmr}, {best_params_partial_mmr}"
)

best_exact_mmr, best_params_exact_mmr,: 0.14620000000000016, ((1, 2), 5, 0.4), best_partial_mmr, best_params_partial_mmr: 0.8235333333333336, ((1, 3), 5, 0.3)


In [7]:
# Run MaxSum tuning: sweeps every (ngram_range, nr_candidates) combination
(
    best_exact_maxsum,
    best_params_exact_maxsum,
    best_partial_maxsum,
    best_params_partial_maxsum,
) = run_experiment(use_maxsum=True)

MaxSum ngram=(1, 2) nc=5: 100%|██████████| 500/500 [05:14<00:00,  1.59it/s]
MaxSum ngram=(1, 2) nc=6: 100%|██████████| 500/500 [04:37<00:00,  1.80it/s]
MaxSum ngram=(1, 2) nc=7: 100%|██████████| 500/500 [04:34<00:00,  1.82it/s]
MaxSum ngram=(1, 2) nc=8: 100%|██████████| 500/500 [04:20<00:00,  1.92it/s]
MaxSum ngram=(1, 3) nc=5: 100%|██████████| 500/500 [06:22<00:00,  1.31it/s]
MaxSum ngram=(1, 3) nc=6: 100%|██████████| 500/500 [06:47<00:00,  1.23it/s]
MaxSum ngram=(1, 3) nc=7: 100%|██████████| 500/500 [06:59<00:00,  1.19it/s]
MaxSum ngram=(1, 3) nc=8: 100%|██████████| 500/500 [06:22<00:00,  1.31it/s]
MaxSum ngram=(2, 2) nc=5: 100%|██████████| 500/500 [03:02<00:00,  2.74it/s]
MaxSum ngram=(2, 2) nc=6: 100%|██████████| 500/500 [02:47<00:00,  2.98it/s]
MaxSum ngram=(2, 2) nc=7: 100%|██████████| 500/500 [02:45<00:00,  3.03it/s]
MaxSum ngram=(2, 2) nc=8: 100%|██████████| 500/500 [02:57<00:00,  2.82it/s]
MaxSum ngram=(2, 3) nc=5: 100%|██████████| 500/500 [04:38<00:00,  1.79it/s]
MaxSum ngram

In [8]:
# Report the best exact / partial precision found by the MaxSum sweep and the
# parameter tuples that produced them.
print(
    f"best_exact_maxsum, best_params_exact_maxsum,: {best_exact_maxsum}, {best_params_exact_maxsum}, "
    f"best_partial_maxsum, best_params_partial_maxsum: {best_partial_maxsum}, {best_params_partial_maxsum}"
)

best_exact_maxsum, best_params_exact_maxsum,: 0.16536666666666686, ((1, 2), 5), best_partial_maxsum, best_params_partial_maxsum: 0.8241666666666669, ((1, 2), 5)


## SemEval2017 dataset

In [7]:
import os

# Read the SemEval2017 documents and their gold keywords.
docs_dir = os.path.join("SemEval2017", "docsutf8")
keys_dir = os.path.join("SemEval2017", "keys")
doc_files = sorted(os.listdir(docs_dir))
key_files = sorted(os.listdir(keys_dir))

# Documents and key files are paired by position, so make sure both listings
# describe the same items (same filename stems, same order). Otherwise a stray
# or missing file would silently shift every gold label onto the wrong document.
doc_stems = [os.path.splitext(name)[0] for name in doc_files]
key_stems = [os.path.splitext(name)[0] for name in key_files]
if doc_stems != key_stems:
    unmatched = sorted(set(doc_stems) ^ set(key_stems))
    raise ValueError(
        f"Document/key files do not line up in {docs_dir!r} and {keys_dir!r}; "
        f"unmatched stems (first 10): {unmatched[:10]}"
    )

docs = []
gold_keywords = []
for doc_file, key_file in zip(doc_files, key_files):
    with open(os.path.join(docs_dir, doc_file), encoding='utf-8') as f:
        docs.append(f.read())
    with open(os.path.join(keys_dir, key_file), encoding='utf-8') as f:
        # One gold keyword per non-blank line, lower-cased.
        gold_keywords.append([line.strip().lower() for line in f if line.strip()])

In [8]:
# Spot-check one loaded sample: the document text alongside its gold keywords.
docs[3], gold_keywords[3]

('This work shows how our approach based on the combination of Statistical Mechanics and nonlinear PDEs theory provides us with a novel and powerful tool to tackle phase transitions. This method leads to solution of perhaps the most known test-case that exhibits a first order phase transition (semi-heuristically described) such as the van der Waals model. In particular we have obtained the first global mean field partition function (Eq. (9)), for a system of finite number of particles. The partition function is a solution to the Klein–Gordon equation, reproduces the van der Waals isotherms away from the critical region and, in the thermodynamic limit N→∞ automatically encodes the Maxwell equal areas rule. The approach hereby presented is of remarkable simplicity, has been successfully applied to spin\xa0 [17–19,14,16] and macroscopic thermodynamic systems\xa0 [20,15] and can be further extended to include the larger class of models admitting partition functions of the form (4) to be us

In [9]:
# Run MMR tuning: sweeps every (ngram_range, nr_candidates, diversity) combination
(
    best_exact_mmr,
    best_params_exact_mmr,
    best_partial_mmr,
    best_params_partial_mmr,
) = run_experiment(use_mmr=True)

MMR ngram=(1, 2) nc=5 div=0.2: 100%|██████████| 493/493 [03:17<00:00,  2.50it/s]
MMR ngram=(1, 2) nc=5 div=0.3: 100%|██████████| 493/493 [02:27<00:00,  3.35it/s]
MMR ngram=(1, 2) nc=5 div=0.4: 100%|██████████| 493/493 [02:24<00:00,  3.41it/s]
MMR ngram=(1, 2) nc=5 div=0.5: 100%|██████████| 493/493 [02:25<00:00,  3.38it/s]
MMR ngram=(1, 2) nc=5 div=0.6: 100%|██████████| 493/493 [02:35<00:00,  3.18it/s]
MMR ngram=(1, 2) nc=6 div=0.2: 100%|██████████| 493/493 [02:33<00:00,  3.21it/s]
MMR ngram=(1, 2) nc=6 div=0.3: 100%|██████████| 493/493 [02:53<00:00,  2.85it/s]
MMR ngram=(1, 2) nc=6 div=0.4: 100%|██████████| 493/493 [02:25<00:00,  3.39it/s]
MMR ngram=(1, 2) nc=6 div=0.5: 100%|██████████| 493/493 [02:25<00:00,  3.40it/s]
MMR ngram=(1, 2) nc=6 div=0.6: 100%|██████████| 493/493 [02:29<00:00,  3.31it/s]
MMR ngram=(1, 2) nc=7 div=0.2: 100%|██████████| 493/493 [02:24<00:00,  3.41it/s]
MMR ngram=(1, 2) nc=7 div=0.3: 100%|██████████| 493/493 [02:25<00:00,  3.39it/s]
MMR ngram=(1, 2) nc=7 div=0.

In [12]:
# Report the best exact / partial precision found by the MMR sweep and the
# parameter tuples that produced them.
print(
    f"best_exact_mmr, best_params_exact_mmr,: {best_exact_mmr}, {best_params_exact_mmr}, "
    f"best_partial_mmr, best_params_partial_mmr: {best_partial_mmr}, {best_params_partial_mmr}"
)

best_exact_mmr, best_params_exact_mmr,: 0.2406693711967547, ((1, 2), 5, 0.2), best_partial_mmr, best_params_partial_mmr: 0.7376267748478705, ((1, 2), 5, 0.2)


In [10]:
# Run MaxSum tuning: sweeps every (ngram_range, nr_candidates) combination
(
    best_exact_maxsum,
    best_params_exact_maxsum,
    best_partial_maxsum,
    best_params_partial_maxsum,
) = run_experiment(use_maxsum=True)

MaxSum ngram=(1, 2) nc=5: 100%|██████████| 493/493 [03:57<00:00,  2.07it/s]
MaxSum ngram=(1, 2) nc=6: 100%|██████████| 493/493 [02:46<00:00,  2.96it/s]
MaxSum ngram=(1, 2) nc=7: 100%|██████████| 493/493 [02:29<00:00,  3.29it/s]
MaxSum ngram=(1, 2) nc=8: 100%|██████████| 493/493 [02:36<00:00,  3.15it/s]
MaxSum ngram=(1, 3) nc=5: 100%|██████████| 493/493 [03:49<00:00,  2.15it/s]
MaxSum ngram=(1, 3) nc=6: 100%|██████████| 493/493 [04:02<00:00,  2.03it/s]
MaxSum ngram=(1, 3) nc=7: 100%|██████████| 493/493 [04:25<00:00,  1.86it/s]
MaxSum ngram=(1, 3) nc=8: 100%|██████████| 493/493 [05:58<00:00,  1.37it/s]
MaxSum ngram=(2, 2) nc=5: 100%|██████████| 493/493 [02:17<00:00,  3.59it/s]
MaxSum ngram=(2, 2) nc=6: 100%|██████████| 493/493 [02:13<00:00,  3.69it/s]
MaxSum ngram=(2, 2) nc=7: 100%|██████████| 493/493 [02:12<00:00,  3.73it/s]
MaxSum ngram=(2, 2) nc=8: 100%|██████████| 493/493 [02:21<00:00,  3.48it/s]
MaxSum ngram=(2, 3) nc=5: 100%|██████████| 493/493 [04:03<00:00,  2.03it/s]
MaxSum ngram

In [13]:
# Report the best exact / partial precision found by the MaxSum sweep and the
# parameter tuples that produced them.
print(
    f"best_exact_maxsum, best_params_exact_maxsum,: {best_exact_maxsum}, {best_params_exact_maxsum}, "
    f"best_partial_maxsum, best_params_partial_maxsum: {best_partial_maxsum}, {best_params_partial_maxsum}"
)

best_exact_maxsum, best_params_exact_maxsum,: 0.2392832995267073, ((1, 2), 5), best_partial_maxsum, best_params_partial_maxsum: 0.7591615956727529, ((1, 2), 5)
