In [1]:
!pip install datasets wordfreq

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

#  Lexical Substitution Generation (SG) - Project Description

This notebook implements **Substitution Generation (SG)** for the MLSP 2024 LS test dataset.  
The goal is to suggest simpler alternatives for complex words while preserving the original meaning.

Two approaches are developed:

- **BERT Masked Language Model (MLM):**
  - The target word is replaced with a [MASK] token.
  - Predictions are generated using a pre-trained `bert-base-uncased` model.
  - Top-5 substitution candidates are collected for each instance.

- **Zipf Frequency-based Dictionary Approach:**
  - Substitutions are extracted from DictionaryAPI.dev, Wiktionary, and WordNet.
  - Candidates are filtered based on Part-of-Speech (POS) and ranked by **Zipf frequency** (word simplicity measure).
  - Simpler, high-frequency words are prioritized.

**Evaluation Metrics:**
- **Potential**: % of examples with at least one correct substitution.
- **Precision / Recall / F1-score**: Comparing predicted candidates to gold-standard substitutions.
- **Potential@1**: Whether the top-1 substitution matches a gold candidate.

Both methods are compared based on quantitative metrics and qualitative error analysis.

---



In [2]:
# ✅ Imports
import nltk
import pandas as pd
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from wordfreq import zipf_frequency

In [3]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `zaid1609` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authentic


## **Lexical Substitution using BERT MLM (Masked Language Modeling)**

This script loads the MLSP2024 test set, selects complex words, and uses
BERT (bert-base-uncased) to generate top-5 context-aware substitute suggestions
via fill-mask prediction. Outputs the target word, its sentence, and substitutes.

In [4]:
# ✅ Imports
import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset
from wordfreq import zipf_frequency

# ✅ Downloads
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# ✅ Load MLSP2024 Lexical Substitution test set
mlsp = load_dataset("MLSP2024/MLSP2024", "english_ls_labels", split="test")

# ✅ Convert to DataFrame
df_gold = pd.DataFrame(mlsp)

# ✅ Extract gold substitutions
sub_cols = [col for col in df_gold.columns if col.startswith("substitution_")]
df_gold["substitutions"] = df_gold[sub_cols].values.tolist()
df_gold["substitutions"] = df_gold["substitutions"].apply(
    lambda subs: list({str(s).strip().lower() for s in subs if s and s != "None"})
)
gold_data = df_gold["substitutions"].apply(lambda subs: {"substitutions": subs}).tolist()

# ✅ MLM pipeline setup
mlm_pipeline = pipeline("fill-mask", model="bert-base-uncased", top_k=5)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# ✅ Substitution generator
def get_bert_predictions(context, target):
    masked_sentence = context.replace(target, "[MASK]")
    if "[MASK]" not in masked_sentence:
        return []
    try:
        results = mlm_pipeline(masked_sentence)
        return [res["token_str"] for res in results]
    except:
        return []

# ✅ Apply model to all instances
results = []
for ex in mlsp:
    word = ex["target"]
    sentence = ex["context"]
    bert_subs = get_bert_predictions(sentence, word)
    results.append({
        "original": word,
        "context": sentence,
        "bert_subs": bert_subs
    })

# ✅ Convert to DataFrame
df_results = pd.DataFrame(results)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/50.2k [00:00<?, ?B/s]

multils_test_english_ls_labels.tsv:   0%|          | 0.00/129k [00:00<?, ?B/s]

multils_trial_english_ls_labels.tsv:   0%|          | 0.00/6.28k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating trial split: 0 examples [00:00, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


In [5]:
df_bert_subs = pd.DataFrame(df_results)
df_bert_subs.to_csv("mlsp2024_bert_substitutions.csv", index=False)
df_bert_subs.head(10)


Unnamed: 0,original,context,bert_subs
0,distraught,After Ron nearly dies drinking poisoned mead t...,"[angry, enraged, upset, furious, jealous]"
1,drinking,After Ron nearly dies drinking poisoned mead t...,"[from, of, by, using, with]"
2,oratory,"After the war, Hitler remained in the army and...","[military, combat, practical, police, political]"
3,reporting,"After the war, Hitler remained in the army and...","[reporting, reported, reports, writing, report]"
4,infiltrating,"After the war, Hitler remained in the army and...","[monitoring, investigating, identifying, advis..."
5,relative,"All other things being equal, nucleophiles are...","[their, chemical, the, relative, nuclear]"
6,nucleophiles,"All other things being equal, nucleophiles are...","[they, these, we, and, metals]"
7,compared,"All other things being equal, nucleophiles are...","[related, relative, equal, similar, compared]"
8,frictional,"Also, the frictional coefficient varies greatl...","[diffusion, friction, contact, absorption, sep..."
9,smoothness,"Also, the frictional coefficient varies greatl...","[pressure, pressures, density, humidity, frict..."


In [6]:
df_bert_subs

Unnamed: 0,original,context,bert_subs
0,distraught,After Ron nearly dies drinking poisoned mead t...,"[angry, enraged, upset, furious, jealous]"
1,drinking,After Ron nearly dies drinking poisoned mead t...,"[from, of, by, using, with]"
2,oratory,"After the war, Hitler remained in the army and...","[military, combat, practical, police, political]"
3,reporting,"After the war, Hitler remained in the army and...","[reporting, reported, reports, writing, report]"
4,infiltrating,"After the war, Hitler remained in the army and...","[monitoring, investigating, identifying, advis..."
...,...,...,...
565,drafting,Your legal agreements should go through many i...,"[legal, negotiation, translation, drafting, wr..."
566,verb,because the specific conjugation of a verb usu...,"[verb, sentence, word, noun, subject]"
567,conjugation,because the specific conjugation of a verb usu...,"[form, tense, position, aspect, subject]"
568,indicates,because the specific conjugation of a verb usu...,"[determines, indicates, specifies, decides, de..."


In [7]:
from wordfreq import zipf_frequency

# ✅ Simplification success check (Zipf frequency)
def is_simpler(original, substitute):
    return zipf_frequency(substitute, "en") > zipf_frequency(original, "en")

df_results["top1"] = df_results["bert_subs"].apply(lambda x: x[0] if x else "")
df_results["is_top1_simpler"] = df_results.apply(
    lambda row: is_simpler(row["original"], row["top1"]) if row["top1"] else False,
    axis=1
)


In [8]:
# ✅ Print simplification success rate
success_rate = df_results["is_top1_simpler"].mean()
print(f"🔍 Simplification Success Rate (Top-1): {success_rate:.2%}")


🔍 Simplification Success Rate (Top-1): 69.12%


In [9]:
# ✅ Show failed substitutions
failed = df_results[~df_results["is_top1_simpler"]]
print("❌ Failed Simplifications:")
print(failed[["original", "context", "top1"]].head(10))


❌ Failed Simplifications:
        original                                            context  \
3      reporting  After the war, Hitler remained in the army and...   
10        varies  Also, the frictional coefficient varies greatl...   
14         holes  Although some frames are provided with three h...   
19       talking  Although the Jews were the favored targets and...   
21  technologies  Among these technologies was nuclear fission, ...   
22       nuclear  Among these technologies was nuclear fission, ...   
28        handle  Another separate book covers Cascading Style S...   
29        father  As Governor, he followed such a strict policy ...   
32     according  As adjectives, their endings will vary accordi...   
37      focusing  As the single parabolic reflector achieves a g...   

           top1  
3     reporting  
10       varies  
14               
19      talking  
21  experiments  
22      nuclear  
28      improve  
29       father  
32    according  
37     focus

In [10]:
failed.to_csv("failed_simplifications.csv", index=False)

In [11]:
# ✅ Evaluation metrics using gold substitutions
def normalize(word):
    return word.replace("##", "").strip().lower()

total_gold = 0
total_pred = 0
correct = 0
potential_hits = 0

for i, row in df_results.iterrows():
    predicted = {normalize(w) for w in row["bert_subs"]}
    gold = {normalize(w) for w in gold_data[i]["substitutions"]}

    if not predicted or not gold:
        continue

    total_gold += len(gold)
    total_pred += len(predicted)
    correct += len(predicted & gold)

    if predicted & gold:
        potential_hits += 1

# ✅ Compute metrics
potential = potential_hits / len(df_results)
precision = correct / total_pred if total_pred > 0 else 0
recall = correct / total_gold if total_gold > 0 else 0
f_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# ✅ Print evaluation metrics
print(f"📊 Potential: {potential:.2%}")
print(f"📈 Precision: {precision:.2%}")
print(f"📉 Recall: {recall:.2%}")
print(f"⭐ F-score: {f_score:.2%}")


📊 Potential: 42.81%
📈 Precision: 13.36%
📉 Recall: 12.42%
⭐ F-score: 12.87%


In [12]:
# ✅ Calculate Potential@1
def normalize(word):
    return word.replace("##", "").strip().lower()

top1_matches = 0
for i, row in df_results.iterrows():
    top1 = normalize(row["top1"])
    gold = {normalize(w) for w in gold_data[i]["substitutions"]}

    if top1 and top1 in gold:
        top1_matches += 1

potential_at_1 = top1_matches / len(df_results)
print(f"📊 Potential@1: {potential_at_1:.2%}")


📊 Potential@1: 18.77%


Lexical Substitution Pipeline using Sentence-BERT and multi-source candidate generation.
This script identifies complex words in context, generates candidate substitutes from
DictionaryAPI, Wiktionary, and WordNet, and ranks them using semantic similarity
from Sentence-BERT (all-MiniLM-L6-v2). Outputs ranked substitutions for each target word.

In [13]:
"""
🔄 Lexical Substitution using Zipf Frequency Ranking
For MLSP2024 LS Dataset
Generates simpler substitutes using DictionaryAPI, Wiktionary, and WordNet,
ranked by word frequency (Zipf scale), and evaluates them using gold labels.
"""

# ✅ Imports
import requests
import nltk
import pandas as pd
import re
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from wordfreq import zipf_frequency
from datasets import load_dataset

# ✅ Download NLP resources
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download('punkt_tab')
nltk.download("averaged_perceptron_tagger_eng")

# ✅ POS tag conversion
def nltk_to_wordnet_pos(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    return None

# ✅ DictionaryAPI.dev
def get_dictapi_candidates(word, target_pos):
    try:
        url = f"https://api.dictionaryapi.dev/api/v2/entries/en/{word}"
        response = requests.get(url)
        if response.status_code != 200:
            return []
        data = response.json()
        candidates = set()
        for meaning in data[0].get("meanings", []):
            for d in meaning.get("definitions", []):
                definition = d.get("definition", "")
                for token, tag in pos_tag(word_tokenize(definition)):
                    t = token.lower()
                    wn_pos = nltk_to_wordnet_pos(tag)
                    if wn_pos == target_pos and t.isalpha() and zipf_frequency(t, "en") > 2.0 and t != word:
                        candidates.add(t)
        return list(candidates)
    except:
        return []

# ✅ Wiktionary fallback
def get_wiktionary_candidates(word, target_pos):
    try:
        url = "https://en.wiktionary.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "titles": word,
            "prop": "extracts",
            "explaintext": 1,
            "redirects": 1
        }
        response = requests.get(url, params=params)
        data = response.json()
        page = next(iter(data["query"]["pages"].values()))
        if "extract" not in page:
            return []
        text = page["extract"]
        candidates = set()
        for line in text.split("\n"):
            if len(line.strip()) > 10:
                for token, tag in pos_tag(word_tokenize(line)):
                    t = token.lower()
                    wn_pos = nltk_to_wordnet_pos(tag)
                    if wn_pos == target_pos and t.isalpha() and zipf_frequency(t, "en") > 2.0 and t != word:
                        candidates.add(t)
        return list(candidates)
    except:
        return []

# ✅ WordNet fallback
def get_wordnet_candidates(word, target_pos):
    candidates = set()
    for syn in wordnet.synsets(word, pos=target_pos):
        for lemma in syn.lemmas():
            w = lemma.name().replace("_", " ").lower()
            if w != word and w.isalpha() and zipf_frequency(w, "en") > 2.0:
                candidates.add(w)
    return list(candidates)

# ✅ Rank candidates by Zipf frequency
def rank_candidates_by_zipf(candidates):
    return sorted(candidates, key=lambda w: zipf_frequency(w, "en"), reverse=True)

# ✅ Load MLSP LS test dataset
mlsp_test = load_dataset("MLSP2024/MLSP2024", "english_ls_labels", split="test")
df = pd.DataFrame(mlsp_test).dropna(subset=["context", "target"])

# ✅ Extract gold substitutions
sub_cols = [col for col in df.columns if col.startswith("substitution_")]
df["gold_subs"] = df[sub_cols].values.tolist()
df["gold_subs"] = df["gold_subs"].apply(lambda subs: list({str(s).strip().lower() for s in subs if s and s != "None"}))
gold_data = df["gold_subs"].apply(lambda subs: {"substitutions": subs}).tolist()

# ✅ Generate substitutions
results = []

for idx, row in df.iterrows():
    target = row["target"]
    context = row["context"]

    # Get POS from context
    tokens = word_tokenize(context)
    tagged = pos_tag(tokens)
    pos_tag_for_target = next((tag for word, tag in tagged if word.lower() == target.lower()), None)
    wn_pos = nltk_to_wordnet_pos(pos_tag_for_target) if pos_tag_for_target else None

    candidates = get_dictapi_candidates(target, wn_pos)
    if not candidates:
        candidates = get_wiktionary_candidates(target, wn_pos)
    if not candidates:
        candidates = get_wordnet_candidates(target, wn_pos)

    ranked = rank_candidates_by_zipf(candidates) if candidates else []

    results.append({
        "target": target,
        "context": context,
        "substitutions": ranked
    })

# ✅ Convert to DataFrame
subs_df = pd.DataFrame(results)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [14]:


# ✅ Evaluation
def normalize(word):
    return word.replace("##", "").strip().lower()

total_gold = 0
total_pred = 0
correct = 0
potential_hits = 0
top1_hits = 0

for i, row in subs_df.iterrows():
    predicted = {normalize(w) for w in row["substitutions"]}
    gold = {normalize(w) for w in gold_data[i]["substitutions"]}

    if not predicted or not gold:
        continue

    total_gold += len(gold)
    total_pred += len(predicted)
    correct += len(predicted & gold)

    if predicted & gold:
        potential_hits += 1

    # ✅ Potential@1: Check if top-1 exists and matches
    top1 = normalize(row["substitutions"][0]) if row["substitutions"] else ""
    if top1 and top1 in gold:
        top1_hits += 1

# ✅ Compute metrics
potential = potential_hits / len(subs_df)
precision = correct / total_pred if total_pred > 0 else 0
recall = correct / total_gold if total_gold > 0 else 0
f_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
potential_at_1 = top1_hits / len(subs_df)

# ✅ Output
print(f"📊 Potential: {potential:.2%}")
print(f"📈 Precision: {precision:.2%}")
print(f"📉 Recall: {recall:.2%}")
print(f"⭐ F-score: {f_score:.2%}")
print(f"🎯 Potential@1: {potential_at_1:.2%}")


📊 Potential: 29.82%
📈 Precision: 5.96%
📉 Recall: 9.72%
⭐ F-score: 7.39%
🎯 Potential@1: 12.98%


In [15]:
# ✅ Identify failed examples
failed_rows = []

for i, row in subs_df.iterrows():
    predicted = {normalize(w) for w in row["substitutions"]}
    gold = {normalize(w) for w in gold_data[i]["substitutions"]}

    if not predicted or not gold:
        continue

    # If there is no overlap between predicted and gold
    if len(predicted & gold) == 0:
        failed_rows.append({
            "target": row["target"],
            "context": row["context"],
            "top1_prediction": row["substitutions"][0] if row["substitutions"] else "",
            "gold_substitutions": list(gold)
        })

# ✅ Save failed examples to CSV
failed_df = pd.DataFrame(failed_rows)
failed_df.to_csv("zipf_failed_substitutions.csv", index=False)

print("✅ Failed examples saved to 'zipf_failed_substitutions.csv'")
print(failed_df.head(10))  # Show top 10 failed cases


✅ Failed examples saved to 'zipf_failed_substitutions.csv'
         target                                            context  \
0      drinking  After Ron nearly dies drinking poisoned mead t...   
1       oratory  After the war, Hitler remained in the army and...   
2     reporting  After the war, Hitler remained in the army and...   
3  infiltrating  After the war, Hitler remained in the army and...   
4      relative  All other things being equal, nucleophiles are...   
5  nucleophiles  All other things being equal, nucleophiles are...   
6      compared  All other things being equal, nucleophiles are...   
7    frictional  Also, the frictional coefficient varies greatl...   
8    smoothness  Also, the frictional coefficient varies greatl...   
9        varies  Also, the frictional coefficient varies greatl...   

  top1_prediction                                 gold_substitutions  
0             act           [intaking, ingesting, taking, consuming]  
1           large           