In [1]:
!pip install datasets
from datasets import Dataset



In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Suhas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch

def get_synset(lemma, pos, model, tokenizer, known_synset_embeddings, collect_only=False, unique_synsets=None):
    """
    Map a lemma and POS to its corresponding WordNet synset.
    Use embedding-based similarity fallback if necessary.

    Args:
        lemma (str): Lemma of the word.
        pos (str): Part of speech of the word.
        model: Fine-tuned BERT model.
        tokenizer: Tokenizer used with the model.
        known_synset_embeddings: Precomputed embeddings for known synsets.
        collect_only (bool): If True, only collect synsets without fallback.
        unique_synsets (set): A set to collect unique synsets.

    Returns:
        str: Synset name or "UNK" if not found.
    """
    pos_map = {"NOUN": wn.NOUN, "VERB": wn.VERB, "ADJ": wn.ADJ, "ADV": wn.ADV}
    wn_pos = pos_map.get(pos)

    # Attempt to retrieve synsets from WordNet
    if wn_pos:
        synsets = wn.synsets(lemma, pos=wn_pos)
        if synsets:
            synset = synsets[0].name()  # First synset
            if collect_only and unique_synsets is not None:
                unique_synsets.add(synset)  # Collect unique synsets
            return synset

    # Skip fallback during collection
    if collect_only:
        return "UNK"

    # Fallback: Find closest synset embedding for OOV words
    try:
        word_embedding = model.get_input_embeddings()(
            torch.tensor([tokenizer.convert_tokens_to_ids(lemma)])
        ).detach().numpy()
        closest_synset = get_closest_synset(word_embedding, known_synset_embeddings)
        return closest_synset
    except Exception as e:
        print(f"Error finding closest synset for OOV word '{lemma}': {e}")
        return "UNK"

def get_closest_synset(embedding, known_synset_embeddings):
    """
    Find the closest synset based on cosine similarity.

    Args:
        embedding (np.ndarray): Embedding for the OOV word.
        known_synset_embeddings (dict): Precomputed embeddings for known synsets.

    Returns:
        str: Synset with the highest similarity.
    """
    similarities = {
        synset: cosine_similarity(embedding.reshape(1, -1), known_embedding.reshape(1, -1))[0][0]
        for synset, known_embedding in known_synset_embeddings.items()
    }
    return max(similarities, key=similarities.get)

def parse_xml_dataset(xml_path, model=None, tokenizer=None, known_synset_embeddings=None, collect_only=False):
    """
    Parse XML dataset, collect unique synsets, and preprocess sentences in one pass.

    Args:
        xml_path (str): Path to the XML file.
        model: Fine-tuned BERT model (optional).
        tokenizer: Tokenizer used with the model (optional).
        known_synset_embeddings: Precomputed embeddings for known synsets (optional).
        collect_only (bool): If True, only collect unique synsets without preprocessing.

    Returns:
        pd.DataFrame: DataFrame containing texts and labels.
        set: Unique synsets collected from the dataset.
    """
    texts, labels = [], []
    unique_synsets = set()

    for event, elem in ET.iterparse(xml_path, events=("start", "end")):
        if event == "end" and elem.tag == "sentence":
            sentence_text = []
            sentence_labels = []

            for child in elem:
                if child.tag == "wf":
                    sentence_text.append(child.text)
                elif child.tag == "instance":
                    sentence_text.append(child.text)
                    synset = get_synset(
                        child.attrib.get("lemma", ""),
                        child.attrib.get("pos", ""),
                        model,
                        tokenizer,
                        known_synset_embeddings,
                        collect_only=collect_only,
                        unique_synsets=unique_synsets
                    )
                    sentence_labels.append({
                        "id": child.attrib.get("id", ""),
                        "lemma": child.attrib.get("lemma", ""),
                        "pos": child.attrib.get("pos", ""),
                        "synset": synset
                    })

            texts.append(" ".join(sentence_text))
            labels.append(sentence_labels)
            elem.clear()  # Free memory

    return pd.DataFrame({"text": texts, "labels": labels}), unique_synsets


In [4]:
xml_path = "./WSD_Training_Corpora/SemCor/semcor.data.xml"

# Step 1: Collect synsets and preprocess dataset
parsed_data, unique_synsets = parse_xml_dataset(xml_path, collect_only=True)

# Step 2: Map synsets to IDs
synset_to_id = {synset: idx for idx, synset in enumerate(unique_synsets)}
id_to_synset = {idx: synset for synset, idx in synset_to_id.items()}


In [5]:
from transformers import BertForTokenClassification, BertTokenizerFast

# Initialize model and tokenizer
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(synset_to_id))
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Precompute embeddings for known synsets
known_synset_embeddings = {
    synset: model.get_input_embeddings()(
        torch.tensor([tokenizer.convert_tokens_to_ids(synset)])
    ).detach().numpy()
    for synset in synset_to_id.keys()
}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Parse dataset with embedding-based fallback
final_data, _ = parse_xml_dataset(xml_path, model, tokenizer, known_synset_embeddings, collect_only=False)


In [7]:
def preprocess_dataset(data_df, tokenizer, synset_to_id, max_length=128):
    """
    Preprocess the dataset by tokenizing the text and aligning labels.

    Args:
        data_df (pd.DataFrame): Parsed dataset with 'text' and 'labels'.
        tokenizer: BERT tokenizer.
        synset_to_id (dict): Mapping of synsets to numeric IDs.
        max_length (int): Maximum sequence length for tokenized inputs.

    Returns:
        List[dict]: Preprocessed data with tokenized inputs and aligned labels.
    """
    tokenized_data = []

    for _, row in data_df.iterrows():
        text = row["text"]
        labels = row["labels"]

        # Tokenize the text
        tokenized_inputs = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        word_ids = tokenized_inputs.word_ids(batch_index=0)

        # Map synsets to numeric IDs
        label_dict = {
            int(label["id"].split(".")[-1].lstrip("t")): synset_to_id.get(label["synset"], -100)
            for label in labels
        }

        # Align labels with tokens
        token_labels = []
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                token_labels.append(-100)
            elif word_id in label_dict:
                token_labels.append(label_dict[word_id])
            else:
                token_labels.append(-100)

        tokenized_inputs["labels"] = token_labels
        tokenized_data.append({
            "input_ids": tokenized_inputs["input_ids"].squeeze().tolist(),
            "attention_mask": tokenized_inputs["attention_mask"].squeeze().tolist(),
            "labels": token_labels
        })

    return tokenized_data


In [8]:
def convert_to_hf_dataset(tokenized_data):
    return Dataset.from_dict({
        "input_ids": [data["input_ids"] for data in tokenized_data],
        "attention_mask": [data["attention_mask"] for data in tokenized_data],
        "labels": [data["labels"] for data in tokenized_data],
    })

# Preprocess the parsed data
tokenized_data = preprocess_dataset(final_data, tokenizer, synset_to_id)

# Convert to Hugging Face Dataset
hf_dataset = convert_to_hf_dataset(tokenized_data)


In [9]:
print(hf_dataset[0])

{'input_ids': [101, 2129, 2146, 2038, 2009, 2042, 2144, 2017, 8182, 1996, 11100, 1997, 2115, 5770, 1998, 2326, 2565, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 9799, 5964, 2381, 17965, 2440, 17885, 7227, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import torch
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """
    Compute evaluation metrics: Precision, Recall, F1-Score, and Perplexity.

    Args:
        eval_pred: A tuple of (logits, labels) from the Trainer evaluation.

    Returns:
        dict: Dictionary of computed metrics.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Mask out padding and special tokens (-100)
    true_labels = labels[labels != -100]
    true_predictions = predictions[labels != -100]

    # Calculate Precision, Recall, and F1-Score
    precision = precision_score(true_labels, true_predictions, average="weighted")
    recall = recall_score(true_labels, true_predictions, average="weighted")
    f1 = f1_score(true_labels, true_predictions, average="weighted")

    # Calculate Perplexity
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    log_probs = np.log(np.max(probs, axis=-1) + 1e-9)  # Adding epsilon for numerical stability
    perplexity = np.exp(-np.mean(log_probs[labels != -100]))

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "perplexity": perplexity,
    }

In [12]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-win_amd64.whl.metadata (3.6 kB)
Collecting typing-extensions>=4.8.0 (from torch->bitsandbytes)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading bitsandbytes-0.44.1-py3-none-win_amd64.whl (121.5 MB)
   ---------------------------------------- 0.0/121.5 MB ? eta -:--:--
   -- ------------------------------------- 7.3/121.5 MB 41.2 MB/s eta 0:00:03
   ----- ---------------------------------- 16.3/121.5 MB 40.9 MB/s eta 0:00:03
   ------- -------------------------------- 24.1/121.5 MB 39.1 MB/s eta 0:00:03
   --------- ------------------------------ 27.8/121.5 MB 33.2 MB/s eta 0:00:03
   ----------- ---------------------------- 34.3/121.5 MB 34.6 MB/s eta 0:00:03
   --------------- ------------------------ 45.6/121.5 MB 35.8 MB/s eta 0:00:03
   ----------------- ---------------------- 54.5/121.5 MB 36.2 MB/s eta 0:00:02
   -------------------- ------------------- 63.2/121.5 MB 36.6 MB/s

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyppeteer 2.0.0 requires urllib3<2.0.0,>=1.25.8, but you have urllib3 2.2.3 which is incompatible.
tensorflow-gpu 2.10.1 requires keras<2.11,>=2.10.0, but you have keras 2.13.1 which is incompatible.
tensorflow-gpu 2.10.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.
tensorflow-gpu 2.10.1 requires tensorboard<2.11,>=2.10, but you have tensorboard 2.13.0 which is incompatible.
tensorflow-gpu 2.10.1 requires tensorflow-estimator<2.11,>=2.10.0, but you have tensorflow-estimator 2.13.0 which is incompatible.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible.


In [11]:
def clear_memory():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [12]:
from transformers import TrainingArguments, Trainer
import torch
from datetime import datetime

# Memory management utility
def clear_memory():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Set up GPU for training
torch.backends.cuda.matmul.allow_tf32 = True

# Enable gradient checkpointing for memory optimization
model.gradient_checkpointing_enable()

# Move model to GPU for training
model.to(torch.device("cuda"))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable automatic evaluation
    learning_rate=5e-5,
    per_device_train_batch_size=8,  # Reduce if memory issues persist
    per_device_eval_batch_size=1,  # Smaller batch size for evaluation
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    optim="adamw_bnb_8bit",  # Optimizer for low-memory scenarios
    gradient_accumulation_steps=4,  # Simulate larger batch sizes
    fp16=True,  # Mixed precision training
    load_best_model_at_end=False,  # Disable automatic loading
    eval_accumulation_steps = 50,
)

# Split dataset into train and validation
train_val_split = hf_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Move validation dataset to CPU
val_dataset = val_dataset.map(lambda x: {k: torch.tensor(v).to("cpu") for k, v in x.items()})

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,  # No automatic evaluation during training
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add custom metrics
)

# Custom training and evaluation loop
# best_model_path = None
best_metric = float("inf")  # Assuming lower metric is better (e.g., loss)

# for epoch in range(int(training_args.num_train_epochs)):
#     print(f"Starting Epoch {epoch + 1}/{training_args.num_train_epochs}")
    
    # Train on GPU
model.to(torch.device("cuda"))
clear_memory()
# trainer.train()
trainer.train(resume_from_checkpoint="./results/checkpoint-3135")
    
    # Evaluate on CPU
    # print(f"Evaluating after Epoch {epoch + 1}")
    # model.to(torch.device("cpu"))
    # clear_memory()
    # trainer.eval_dataset = val_dataset  # Update evaluation dataset
    # metrics = trainer.evaluate()
    # print(f"Metrics for Epoch {epoch + 1}: {metrics}")
    
    # Save the best model
    # current_metric = metrics["eval_loss"]  # Replace with your preferred metric
    # if current_metric < best_metric:
    #     best_metric = current_metric

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
best_model_path = f"./best_model_{current_timestamp}"
print(f"New best model found! Saving to {best_model_path}")
trainer.save_model(best_model_path)

# Final best model path
print(f"Best model saved at: {best_model_path}")




Map:   0%|          | 0/3718 [00:00<?, ? examples/s]

  trainer = Trainer(
You are resuming training from a checkpoint trained with 4.46.3 of Transformers but your current version is 4.46.2. This is not recommended and could yield to errors or unwanted behaviors.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Step,Training Loss


New best model found! Saving to ./best_model_20241127_222422
Best model saved at: ./best_model_20241127_222422


In [19]:
# eval_results = trainer.evaluate(val_dataset)
# print(f"Evaluation Results: {eval_results}")

clear_memory()

training_args.per_device_eval_batch_size = 1
training_args.eval_accumulation_steps = 10
torch.cuda.empty_cache()  # Clear memory

# Use only a subset of the validation dataset
val_dataset_small = val_dataset.select(range(1000))  # First 1000 samples
print("val_dataset",val_dataset)
print("val_dataset_small",val_dataset_small)

eval_results = trainer.evaluate(val_dataset_small)
print(f"Evaluation Results: {eval_results}")


# eval_results = trainer.evaluate(val_dataset_small)
# print(f"Evaluation Results: {eval_results}")



val_dataset Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3718
})
val_dataset_small Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})


KeyboardInterrupt: 

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

# Updated function for processing and mapping synsets with OOV handling
def process_corpus_sections_incrementally_with_synsets(
    xml_path, model, tokenizer, known_synset_embeddings, chunk_size=10000
):
    """
    Process large XML datasets incrementally with OOV handling for synset mapping.

    Args:
        xml_path (str): Path to the XML file.
        model: Fine-tuned BERT model.
        tokenizer: Tokenizer used with the model.
        known_synset_embeddings (dict): Precomputed embeddings for known synsets.
        chunk_size (int): Number of sentences per chunk.

    Returns:
        List[pd.DataFrame]: List of DataFrames, one per processed chunk.
    """
    print(f"Processing <corpus> sections from: {xml_path}")

    corpus_dfs = []  # Store data frames for each <corpus> section
    texts, labels = [], []
    inside_corpus = False
    inside_text = False
    corpus_count = 0
    chunk_counter = 0

    # Open the file and read it line-by-line
    with open(xml_path, "r") as file:
        for line in file:
            # Check for the start of a <corpus> section
            if "<corpus" in line:
                inside_corpus = True
                texts, labels = [], []  # Reset lists for a new <corpus>
                corpus_count += 1
                print(f"\nProcessing <corpus> section {corpus_count}...")

            # Process each line only if we're inside a <corpus> section
            if inside_corpus:
                if "<text" in line:
                    inside_text = True  # Start of a new <text> element
                    text_buffer = [line]  # Reset the buffer

                elif inside_text:
                    text_buffer.append(line)  # Accumulate lines within <text>

                    if "</text>" in line:  # End of <text> element
                        inside_text = False
                        # Parse the accumulated <text> element
                        text_xml = "".join(text_buffer)
                        text_elem = ET.fromstring("<root>" + text_xml + "</root>")  # Wrap for valid XML

                        # Process each sentence in the <text> element
                        for sentence in text_elem.findall(".//sentence"):
                            sentence_text = []
                            sentence_labels = []

                            # Extract words from <wf> and <instance> elements
                            for word_elem in sentence:
                                if word_elem.tag == "wf":
                                    sentence_text.append(word_elem.text)
                                elif word_elem.tag == "instance":
                                    sentence_text.append(word_elem.text)
                                    # Reuse get_synset with OOV handling
                                    synset = get_synset(
                                        word_elem.attrib.get("lemma", ""),
                                        word_elem.attrib.get("pos", ""),
                                        model,
                                        tokenizer,
                                        known_synset_embeddings
                                    )
                                    sentence_labels.append({
                                        "id": word_elem.attrib.get("id", ""),
                                        "lemma": word_elem.attrib.get("lemma", ""),
                                        "pos": word_elem.attrib.get("pos", ""),
                                        "synset": synset
                                    })

                            # Append extracted sentence data to texts and labels
                            if sentence_text:
                                texts.append(" ".join(sentence_text))
                                labels.append(sentence_labels)

                        # Check if we've reached the chunk size limit
                        if len(texts) >= chunk_size:
                            # Save the chunk to a DataFrame and clear memory
                            corpus_df = pd.DataFrame({"text": texts, "labels": labels})
                            corpus_dfs.append(corpus_df)
                            print(f"Processed {chunk_counter + 1} chunks of {chunk_size} records.")
                            chunk_counter += 1
                            texts, labels = [], []  # Reset lists for the next chunk

            # Check for the end of a <corpus> section
            if "</corpus>" in line and inside_corpus:
                inside_corpus = False
                # Save any remaining data after the last chunk
                if texts and labels:
                    corpus_df = pd.DataFrame({"text": texts, "labels": labels})
                    corpus_dfs.append(corpus_df)
                    print(f"Final chunk for <corpus> section {corpus_count}.")
                    texts, labels = [], []  # Correctly reset lists for the next corpus section

    return corpus_dfs


In [None]:
xml_path = "/content/data/semcor+omsti.data.xml"
test_corpus_dfs = process_corpus_sections_incrementally_with_synsets(
    xml_path, model, tokenizer, known_synset_embeddings, chunk_size=10000
)

# Combine all chunks into a single DataFrame (if memory permits)
combined_test_df = pd.concat(test_corpus_dfs, ignore_index=True)
print("Test dataset preview:")
print(combined_test_df.head())


Processing <corpus> sections from: /content/data/semcor+omsti.data.xml

Processing <corpus> section 1...
Processed 1 chunks of 10000 records.
Processed 2 chunks of 10000 records.
Processed 3 chunks of 10000 records.
Final chunk for <corpus> section 1.

Processing <corpus> section 2...
Test dataset preview:
                                                text  \
0  How long has it been since you reviewed the ob...   
1  Have you permitted it to become a giveaway pro...   
2  What effort do you make to assess results of y...   
3  Do you measure its relation to reduced absente...   
4  Have you set specific objectives for your empl...   

                                              labels  
0  [{'id': 'd000.s000.t000', 'lemma': 'long', 'po...  
1  [{'id': 'd000.s001.t000', 'lemma': 'permit', '...  
2  [{'id': 'd000.s002.t000', 'lemma': 'effort', '...  
3  [{'id': 'd000.s003.t000', 'lemma': 'measure', ...  
4  [{'id': 'd000.s004.t000', 'lemma': 'set', 'pos...  


In [None]:
tokenized_test_data = preprocess_dataset(combined_test_df, tokenizer, synset_to_id)
hf_test_dataset = convert_to_hf_dataset(tokenized_test_data)


In [None]:
print("Preprocessed Testing Dataset Sample:")
print(hf_test_dataset[0])

Preprocessed Testing Dataset Sample:
{'input_ids': [101, 2129, 2146, 2038, 2009, 2042, 2144, 2017, 8182, 1996, 11100, 1997, 2115, 5770, 1998, 2326, 2565, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 17472, 6776, 1142, 15255, 18262, 18629, 2944, -100, -100, -100, -100, -100, -100, -100

In [None]:
fine_tuned_model = BertForTokenClassification.from_pretrained("./results")
test_results = trainer.evaluate(test_hf_dataset)
print(f"Test Results: {test_results}")

INFERENCE

In [None]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

def predict_synsets(sentence, model, tokenizer, id_to_synset, max_length=128):
    """
    Predict WordNet synsets for each word in a given sentence using the fine-tuned model.

    Args:
        sentence (str): Input sentence.
        model: Fine-tuned BERT model.
        tokenizer: Tokenizer used for the model.
        id_to_synset (dict): Mapping from numeric label IDs to WordNet synsets.
        max_length (int): Maximum length for tokenized sequences.

    Returns:
        List[Tuple[str, str]]: List of (word, synset) tuples.
    """
    # Tokenize the input sentence
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

    # Get the word IDs for alignment
    word_ids = inputs.word_ids(batch_index=0)

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predictions (logits -> argmax)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1).squeeze().tolist()

    # Map predictions to WordNet synsets
    tokens = tokenizer.tokenize(sentence)
    predicted_synsets = []
    for word_id, pred in zip(word_ids, predictions):
        if word_id is not None and pred in id_to_synset:
            predicted_synsets.append(id_to_synset[pred])
        else:
            predicted_synsets.append("UNK")  # Unknown or ignored token

    # Align tokens with synsets
    result = []
    for token, synset in zip(tokens, predicted_synsets):
        result.append((token, synset))

    return result


In [None]:
# Prompt user for input
sentence = input("Enter a sentence: Everything happens for its own good.")

# Predict synsets
predictions = predict_synsets(sentence, model, tokenizer, id_to_synset)

# Display results
print("\nPredicted Synsets:")
for token, synset in predictions:
    print(f"{token}: {synset}")
