In [1]:
# %pip install datasets
# %pip install transformers
# %pip install torch torchvision
# %pip install bitsandbytes
# %pip install transformers[torch]
# %pip install 'accelerate>={ACCELERATE_MIN_VERSION}'
# %pip install transformers torch accelerate
# %pip install -U accelerate
# %pip install --upgrade pip
# %pip uninstall transformers accelerate -y
# %pip install transformers[torch] accelerate
# %pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
# %pip install --upgrade accelerate
# %pip uninstall transformers accelerate torch torchvision -y
# %pip install transformers[torch] torch accelerate
# %pip install transformers==4.26.0 accelerate==0.26.0 torch==1.12.0 torchvision
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install torch torchvision torchaudio
# %pip install transformers
# %pip uninstall torch torchvision torchaudio -y
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [2]:
from transformers import BertForTokenClassification, BertTokenizerFast

In [3]:
from datasets import Dataset

In [4]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/svaidya4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/svaidya4/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch

def get_synset(lemma, pos, model, tokenizer, known_synset_embeddings, collect_only=False, unique_synsets=None):
    """
    Map a lemma and POS to its corresponding WordNet synset.
    Use embedding-based similarity fallback if necessary.

    Args:
        lemma (str): Lemma of the word.
        pos (str): Part of speech of the word.
        model: Fine-tuned BERT model.
        tokenizer: Tokenizer used with the model.
        known_synset_embeddings: Precomputed embeddings for known synsets.
        collect_only (bool): If True, only collect synsets without fallback.
        unique_synsets (set): A set to collect unique synsets.

    Returns:
        str: Synset name or "UNK" if not found.
    """
    pos_map = {"NOUN": wn.NOUN, "VERB": wn.VERB, "ADJ": wn.ADJ, "ADV": wn.ADV}
    wn_pos = pos_map.get(pos)

    # Attempt to retrieve synsets from WordNet
    if wn_pos:
        synsets = wn.synsets(lemma, pos=wn_pos)
        if synsets:
            synset = synsets[0].name()  # First synset
            if collect_only and unique_synsets is not None:
                unique_synsets.add(synset)  # Collect unique synsets
            return synset

    # Skip fallback during collection
    if collect_only:
        return "UNK"

    # Fallback: Find closest synset embedding for OOV words
    try:
        word_embedding = model.get_input_embeddings()(
            torch.tensor([tokenizer.convert_tokens_to_ids(lemma)])
        ).detach().numpy()
        closest_synset = get_closest_synset(word_embedding, known_synset_embeddings)
        return closest_synset
    except Exception as e:
        print(f"Error finding closest synset for OOV word '{lemma}': {e}")
        return "UNK"

def get_closest_synset(embedding, known_synset_embeddings):
    """
    Find the closest synset based on cosine similarity.

    Args:
        embedding (np.ndarray): Embedding for the OOV word.
        known_synset_embeddings (dict): Precomputed embeddings for known synsets.

    Returns:
        str: Synset with the highest similarity.
    """
    similarities = {
        synset: cosine_similarity(embedding.reshape(1, -1), known_embedding.reshape(1, -1))[0][0]
        for synset, known_embedding in known_synset_embeddings.items()
    }
    return max(similarities, key=similarities.get)

def parse_xml_dataset(xml_path, model=None, tokenizer=None, known_synset_embeddings=None, collect_only=False):
    """
    Parse XML dataset, collect unique synsets, and preprocess sentences in one pass.

    Args:
        xml_path (str): Path to the XML file.
        model: Fine-tuned BERT model (optional).
        tokenizer: Tokenizer used with the model (optional).
        known_synset_embeddings: Precomputed embeddings for known synsets (optional).
        collect_only (bool): If True, only collect unique synsets without preprocessing.

    Returns:
        pd.DataFrame: DataFrame containing texts and labels.
        set: Unique synsets collected from the dataset.
    """
    texts, labels = [], []
    unique_synsets = set()

    for event, elem in ET.iterparse(xml_path, events=("start", "end")):
        if event == "end" and elem.tag == "sentence":
            sentence_text = []
            sentence_labels = []

            for child in elem:
                if child.tag == "wf":
                    sentence_text.append(child.text)
                elif child.tag == "instance":
                    sentence_text.append(child.text)
                    synset = get_synset(
                        child.attrib.get("lemma", ""),
                        child.attrib.get("pos", ""),
                        model,
                        tokenizer,
                        known_synset_embeddings,
                        collect_only=collect_only,
                        unique_synsets=unique_synsets
                    )
                    sentence_labels.append({
                        "id": child.attrib.get("id", ""),
                        "lemma": child.attrib.get("lemma", ""),
                        "pos": child.attrib.get("pos", ""),
                        "synset": synset
                    })

            texts.append(" ".join(sentence_text))
            labels.append(sentence_labels)
            elem.clear()  # Free memory

    return pd.DataFrame({"text": texts, "labels": labels}), unique_synsets


In [6]:
xml_path = "./WSD_Training_Corpora/SemCor/semcor.data.xml"

# Step 1: Collect synsets and preprocess dataset
parsed_data, unique_synsets = parse_xml_dataset(xml_path, collect_only=True)


# Step 2: Map synsets to IDs
synset_to_id = {synset: idx for idx, synset in enumerate(unique_synsets)}
id_to_synset = {idx: synset for synset, idx in synset_to_id.items()}


In [7]:
from transformers import BertForTokenClassification, BertTokenizerFast, AutoModel

# Initialize model and tokenizer
model = BertForTokenClassification.from_pretrained("bert-large-uncased", num_labels=len(synset_to_id))
tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")

# Load model directly
model = AutoModel.from_pretrained("kanishka/GlossBERT")

# Precompute embeddings for known synsets
known_synset_embeddings = {
    synset: model.get_input_embeddings()(
        torch.tensor([tokenizer.convert_tokens_to_ids(synset)])
    ).detach().numpy()
    for synset in synset_to_id.keys()
}


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized

In [8]:
# Parse dataset with embedding-based fallback
final_data, _ = parse_xml_dataset(xml_path, model, tokenizer, known_synset_embeddings, collect_only=False)


In [9]:
def preprocess_dataset(data_df, tokenizer, synset_to_id, max_length=128):
    """
    Preprocess the dataset by tokenizing the text and aligning labels.

    Args:
        data_df (pd.DataFrame): Parsed dataset with 'text' and 'labels'.
        tokenizer: BERT tokenizer.
        synset_to_id (dict): Mapping of synsets to numeric IDs.
        max_length (int): Maximum sequence length for tokenized inputs.

    Returns:
        List[dict]: Preprocessed data with tokenized inputs and aligned labels.
    """
    tokenized_data = []

    for _, row in data_df.iterrows():
        text = row["text"]
        labels = row["labels"]

        # Tokenize the text
        tokenized_inputs = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        word_ids = tokenized_inputs.word_ids(batch_index=0)

        # Map synsets to numeric IDs
        label_dict = {
            int(label["id"].split(".")[-1].lstrip("t")): synset_to_id.get(label["synset"], -100)
            for label in labels
        }

        # Align labels with tokens
        token_labels = []
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                token_labels.append(-100)
            elif word_id in label_dict:
                token_labels.append(label_dict[word_id])
            else:
                token_labels.append(-100)

        tokenized_inputs["labels"] = token_labels
        tokenized_data.append({
            "input_ids": tokenized_inputs["input_ids"].squeeze().tolist(),
            "attention_mask": tokenized_inputs["attention_mask"].squeeze().tolist(),
            "labels": token_labels
        })

    return tokenized_data


In [10]:
def convert_to_hf_dataset(tokenized_data):
    return Dataset.from_dict({
        "input_ids": [data["input_ids"] for data in tokenized_data],
        "attention_mask": [data["attention_mask"] for data in tokenized_data],
        "labels": [data["labels"] for data in tokenized_data],
    })

# Preprocess the parsed data
tokenized_data = preprocess_dataset(final_data, tokenizer, synset_to_id)

# Convert to Hugging Face Dataset
hf_dataset = convert_to_hf_dataset(tokenized_data)


In [11]:
print(hf_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 37176
})


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import torch
from transformers import Trainer, TrainingArguments

# def compute_metrics(eval_pred):
#     """
#     Compute evaluation metrics: Precision, Recall, F1-Score, and Perplexity.

#     Args:
#         eval_pred: A tuple of (logits, labels) from the Trainer evaluation.

#     Returns:
#         dict: Dictionary of computed metrics.
#     """
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)

#     # Mask out padding and special tokens (-100)
#     true_labels = labels[labels != -100]
#     true_predictions = predictions[labels != -100]

#     # Calculate Precision, Recall, and F1-Score
#     precision = precision_score(true_labels, true_predictions, average="weighted")
#     recall = recall_score(true_labels, true_predictions, average="weighted")
#     f1 = f1_score(true_labels, true_predictions, average="weighted")

#     # Calculate Perplexity
#     probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
#     log_probs = np.log(np.max(probs, axis=-1) + 1e-9)  # Adding epsilon for numerical stability
#     perplexity = np.exp(-np.mean(log_probs[labels != -100]))

#     return {
#         "precision": precision,
#         "recall": recall,
#         "f1": f1,
#         "perplexity": perplexity,
#     }

def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    import numpy as np
    import torch

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Mask ignored labels
    labels = np.array(labels)
    valid_labels = labels != -100
    true_labels = labels[valid_labels]
    true_predictions = predictions[valid_labels]

    # Calculate Metrics
    accuracy = accuracy_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions, average="weighted", zero_division=0)
    recall = recall_score(true_labels, true_predictions, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, true_predictions, average="weighted")

    # Calculate Perplexity
    logits_tensor = torch.tensor(logits).float()  # Ensure logits are float32
    with torch.cuda.amp.autocast(enabled=False):  # Disable mixed precision for softmax
        probs = torch.softmax(logits_tensor, dim=-1).numpy()
    log_probs = np.log(np.max(probs, axis=-1) + 1e-9)  # Adding epsilon for numerical stability
    perplexity = np.exp(-np.mean(log_probs[valid_labels]))

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "perplexity": perplexity,
    }

In [13]:
def clear_memory():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [14]:
import torch
print(torch.cuda.is_available())


True


In [15]:
from transformers import TrainingArguments, Trainer
import torch
from datetime import datetime

# Memory management utility
def clear_memory():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Set up GPU for training
torch.backends.cuda.matmul.allow_tf32 = True

# Enable gradient checkpointing for memory optimization
model.gradient_checkpointing_enable()

# Move model to GPU for training
model.to(torch.device("cuda"))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Disable automatic evaluation
    learning_rate=3e-5,
    per_device_train_batch_size=8,  # Reduce if memory issues persist
    per_device_eval_batch_size=4,  # Smaller batch size for evaluation
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    optim="adamw_torch",  # Optimizer for low-memory scenarios
    gradient_accumulation_steps=4,  # Simulate larger batch sizes
    fp16=True,  # Mixed precision training
    load_best_model_at_end=True,  # Enable automatic loading
    # load_best_model_at_end=False,  # Disable automatic loading
    eval_accumulation_steps = 50,
)

# Split dataset into train and validation
train_val_split = hf_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Move validation dataset to CPU
val_dataset = val_dataset.map(lambda x: {k: torch.tensor(v).to("cpu") for k, v in x.items()})

from transformers import BertForTokenClassification, BertTokenizerFast

# Load the model
model_path = "./best_model_20241202_220941"
model = BertForTokenClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=None,  # No automatic evaluation during training
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add custom metrics
)

# best_model_path = None
best_metric = float("inf")  # Assuming lower metric is better (e.g., loss)

model.to(torch.device("cuda"))
clear_memory()
trainer.train()
# trainer.train(resume_from_checkpoint="./results/checkpoint-11030")

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
best_model_path = f"./best_model_{current_timestamp}"
print(f"New best model found! Saving to {best_model_path}")
trainer.save_model(best_model_path)

# Final best model path
print(f"Best model saved at: {best_model_path}")


Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 35317
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 1103
  Number of trainable parameters = 353550888
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Perplexity
0,3.4997,5.492875,0.43212,0.367997,0.43212,0.385717,7.721401


***** Running Evaluation *****
  Num examples = 1859
  Batch size = 4
Saving model checkpoint to ./results/checkpoint-1103
Configuration saved in ./results/checkpoint-1103/config.json
Model weights saved in ./results/checkpoint-1103/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1103/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1103/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-1103 (score: 5.492874622344971).
Saving model checkpoint to ./best_model_20241207_000429
Configuration saved in ./best_model_20241207_000429/config.json


New best model found! Saving to ./best_model_20241207_000429


Model weights saved in ./best_model_20241207_000429/pytorch_model.bin
tokenizer config file saved in ./best_model_20241207_000429/tokenizer_config.json
Special tokens file saved in ./best_model_20241207_000429/special_tokens_map.json


Best model saved at: ./best_model_20241207_000429


In [None]:
from transformers import TrainingArguments, Trainer
import torch
from datetime import datetime

# Memory management utility
def clear_memory():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Set up GPU for training
torch.backends.cuda.matmul.allow_tf32 = True

# Enable gradient checkpointing for memory optimization
model.gradient_checkpointing_enable()

# Move model to GPU for training
model.to(torch.device("cuda"))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Disable automatic evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduce if memory issues persist
    per_device_eval_batch_size=4,  # Smaller batch size for evaluation
    num_train_epochs=15,
    weight_decay=0.01,
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    optim="adamw_torch",  # Optimizer for low-memory scenarios
    gradient_accumulation_steps=4,  # Simulate larger batch sizes
    fp16=True,  # Mixed precision training
    load_best_model_at_end=True,  # Enable automatic loading
    # load_best_model_at_end=False,  # Disable automatic loading
    eval_accumulation_steps = 50,
)

# Split dataset into train and validation
train_val_split = hf_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Move validation dataset to CPU
val_dataset = val_dataset.map(lambda x: {k: torch.tensor(v).to("cpu") for k, v in x.items()})

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=None,  # No automatic evaluation during training
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add custom metrics
)

# best_model_path = None
best_metric = float("inf")  # Assuming lower metric is better (e.g., loss)

model.to(torch.device("cuda"))
clear_memory()
trainer.train()
# trainer.train(resume_from_checkpoint="./results/checkpoint-11030")

# current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# best_model_path = f"./best_model_{current_timestamp}"
# print(f"New best model found! Saving to {best_model_path}")
# trainer.save_model(best_model_path)

# # Final best model path
# print(f"Best model saved at: {best_model_path}")


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

Using cuda_amp half precision backend
  self.scaler = torch.cuda.amp.GradScaler()
***** Running training *****
  Num examples = 35317
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 16545
  Number of trainable parameters = 353550888
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Perplexity
0,4.1224,4.865142,0.24826,0.35727,0.277423,4.695312
1,3.7659,4.610922,0.27663,0.384134,0.306743,4.472656
2,3.3685,4.515942,0.300286,0.401538,0.328368,3.800781
3,3.1027,4.342058,0.322309,0.420123,0.350135,3.525391
4,2.8865,4.252907,0.347693,0.435583,0.371648,3.337891
5,2.5862,4.221997,0.358819,0.441159,0.380455,3.0
6,2.3808,4.222005,0.375421,0.454169,0.395762,2.929688
7,2.2836,4.102341,0.396843,0.469629,0.41396,2.853516
8,2.0894,4.11548,0.396284,0.46946,0.414239,2.650391
9,1.9728,4.11438,0.423233,0.489651,0.43811,2.574219


***** Running Evaluation *****
  Num examples = 1859
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-1103
Configuration saved in ./results/checkpoint-1103/config.json
Model weights saved in ./results/checkpoint-1103/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1103/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1103/special_tokens_map.json
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
  return fn(*args, **kwargs)
***** Running Evaluation *****
  Num examples = 1859
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-2206
Configuration saved in ./results/checkpoint-2206/config.json
Model weights saved in ./results/checkpoint-2206/pytorch_model.bin
tokenizer

TrainOutput(global_step=16545, training_loss=2.5204320230308204, metrics={'train_runtime': 8451.9391, 'train_samples_per_second': 62.679, 'train_steps_per_second': 1.958, 'total_flos': 1.3090753681429709e+17, 'train_loss': 2.5204320230308204, 'epoch': 15.0})

In [None]:
eval_results = trainer.evaluate(val_dataset)
print(f"Evaluation Results: {eval_results}")


***** Running Evaluation *****
  Num examples = 1859
  Batch size = 4
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Results: {'eval_loss': 14.762930870056152, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_perplexity': 2.052776336669922, 'eval_runtime': 35.2721, 'eval_samples_per_second': 52.705, 'eval_steps_per_second': 13.183, 'epoch': 10.0}


In [None]:
############################

import torch
from sklearn.metrics import precision_recall_fscore_support, classification_report
import numpy as np

def clear_memory():
    """Clear CUDA memory to prevent memory overflow."""
    torch.cuda.empty_cache()
    print("CUDA memory cleared.")

# Set evaluation batch size and accumulation steps
training_args.per_device_eval_batch_size = 1
training_args.eval_accumulation_steps = 10
torch.cuda.empty_cache()  # Clear memory

# Define chunk size
chunk_size = 1000
num_chunks = (len(val_dataset) + chunk_size - 1) // chunk_size  # Total number of chunks

# Accumulate metrics
all_metrics = []

for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, len(val_dataset))
    
    # Select a chunk of the dataset
    val_dataset_chunk = val_dataset.select(range(start_idx, end_idx))
    
    print(f"\nEvaluating chunk {i + 1}/{num_chunks} (Samples {start_idx}-{end_idx})")
    
    # Evaluate the chunk
    eval_results = trainer.evaluate(val_dataset_chunk)
    print(f"Chunk {i + 1} Results: {eval_results}")
    
    # Save the metrics for this chunk
    all_metrics.append(eval_results)

# Aggregate results
final_metrics = {}
for key in all_metrics[0].keys():
    if isinstance(all_metrics[0][key], (int, float)):  # Aggregate numerical metrics
        final_metrics[key] = sum(d[key] for d in all_metrics) / len(all_metrics)

print("\nFinal Evaluation Metrics for the Entire Dataset:")
for key, value in final_metrics.items():
    print(f"{key}: {value:.4f}")

# Custom evaluation loop to handle alignment and detailed metrics
def custom_evaluation_loop(model, tokenizer, dataset, device="cuda"):
    """Evaluate model and compute precision, recall, and F1."""
    model.eval()  # Set model to evaluation mode
    all_true_labels = []
    all_predictions = []

    for sample in dataset:
        # Prepare input tensors
        input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device)
        true_labels = np.array(sample["labels"])  # Convert to NumPy for easier filtering

        # Filter out ignored tokens (-100)
        valid_indices = true_labels != -100
        true_labels = true_labels[valid_indices]

        # Generate predictions
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(0).cpu().numpy()
        predictions = np.argmax(logits, axis=1)[valid_indices]

        # Append results
        all_true_labels.extend(true_labels)
        all_predictions.extend(predictions)

    # Compute evaluation metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, average="weighted", zero_division=0
    )
    print("\nDetailed Evaluation Metrics:")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(all_true_labels, all_predictions))

# Evaluate using the custom loop
# clear_memory()
# print("\nRunning Custom Evaluation Loop:")
# custom_evaluation_loop(model, tokenizer, val_dataset, device="cuda")

***** Running Evaluation *****
  Num examples = 558
  Batch size = 1



Evaluating chunk 1/1 (Samples 0-558)


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
import xml.etree.ElementTree as ET
import pandas as pd

# Updated function for processing and mapping synsets with OOV handling
def process_corpus_sections_incrementally_with_synsets(
    xml_path, model, tokenizer, known_synset_embeddings, chunk_size=10000
):
    """
    Process large XML datasets incrementally with OOV handling for synset mapping.

    Args:
        xml_path (str): Path to the XML file.
        model: Fine-tuned BERT model.
        tokenizer: Tokenizer used with the model.
        known_synset_embeddings (dict): Precomputed embeddings for known synsets.
        chunk_size (int): Number of sentences per chunk.

    Returns:
        List[pd.DataFrame]: List of DataFrames, one per processed chunk.
    """
    print(f"Processing <corpus> sections from: {xml_path}")

    corpus_dfs = []  # Store data frames for each <corpus> section
    texts, labels = [], []
    inside_corpus = False
    inside_text = False
    corpus_count = 0
    chunk_counter = 0

    # Open the file and read it line-by-line
    with open(xml_path, "r") as file:
        for line in file:
            # Check for the start of a <corpus> section
            if "<corpus" in line:
                inside_corpus = True
                texts, labels = [], []  # Reset lists for a new <corpus>
                corpus_count += 1
                print(f"\nProcessing <corpus> section {corpus_count}...")

            # Process each line only if we're inside a <corpus> section
            if inside_corpus:
                if "<text" in line:
                    inside_text = True  # Start of a new <text> element
                    text_buffer = [line]  # Reset the buffer

                elif inside_text:
                    text_buffer.append(line)  # Accumulate lines within <text>

                    if "</text>" in line:  # End of <text> element
                        inside_text = False
                        # Parse the accumulated <text> element
                        text_xml = "".join(text_buffer)
                        text_elem = ET.fromstring("<root>" + text_xml + "</root>")  # Wrap for valid XML

                        # Process each sentence in the <text> element
                        for sentence in text_elem.findall(".//sentence"):
                            sentence_text = []
                            sentence_labels = []

                            # Extract words from <wf> and <instance> elements
                            for word_elem in sentence:
                                if word_elem.tag == "wf":
                                    sentence_text.append(word_elem.text)
                                elif word_elem.tag == "instance":
                                    sentence_text.append(word_elem.text)
                                    # Reuse get_synset with OOV handling
                                    synset = get_synset(
                                        word_elem.attrib.get("lemma", ""),
                                        word_elem.attrib.get("pos", ""),
                                        model,
                                        tokenizer,
                                        known_synset_embeddings
                                    )
                                    sentence_labels.append({
                                        "id": word_elem.attrib.get("id", ""),
                                        "lemma": word_elem.attrib.get("lemma", ""),
                                        "pos": word_elem.attrib.get("pos", ""),
                                        "synset": synset
                                    })

                            # Append extracted sentence data to texts and labels
                            if sentence_text:
                                texts.append(" ".join(sentence_text))
                                labels.append(sentence_labels)

                        # Check if we've reached the chunk size limit
                        if len(texts) >= chunk_size:
                            # Save the chunk to a DataFrame and clear memory
                            corpus_df = pd.DataFrame({"text": texts, "labels": labels})
                            corpus_dfs.append(corpus_df)
                            print(f"Processed {chunk_counter + 1} chunks of {chunk_size} records.")
                            chunk_counter += 1
                            texts, labels = [], []  # Reset lists for the next chunk

            # Check for the end of a <corpus> section
            if "</corpus>" in line and inside_corpus:
                inside_corpus = False
                # Save any remaining data after the last chunk
                if texts and labels:
                    corpus_df = pd.DataFrame({"text": texts, "labels": labels})
                    corpus_dfs.append(corpus_df)
                    print(f"Final chunk for <corpus> section {corpus_count}.")
                    texts, labels = [], []  # Correctly reset lists for the next corpus section

    return corpus_dfs


In [None]:
#Testing

import xml.etree.ElementTree as ET
import pandas as pd

# Updated function for processing and mapping synsets with OOV handling
def process_corpus_sections_incrementally_with_synsets(
    xml_path, model, tokenizer, known_synset_embeddings, chunk_size=10000, max_records=None
):
    """
    Process large XML datasets incrementally with OOV handling for synset mapping.

    Args:
        xml_path (str): Path to the XML file.
        model: Fine-tuned BERT model.
        tokenizer: Tokenizer used with the model.
        known_synset_embeddings (dict): Precomputed embeddings for known synsets.
        chunk_size (int): Number of sentences per chunk.

    Returns:
        List[pd.DataFrame]: List of DataFrames, one per processed chunk.
    """
    print(f"Processing <corpus> sections from: {xml_path}")

    corpus_dfs = []  # Store data frames for each <corpus> section
    texts, labels = [], []
    inside_corpus = False
    inside_text = False
    corpus_count = 0
    chunk_counter = 0
    total_records = 0  # Track the total number of records processeds

    # Open the file and read it line-by-line
    with open(xml_path, "r") as file:
        for line in file:
            
            # Stop processing if the total record limit is reached
            if max_records is not None and total_records >= max_records:
                print(f"Reached the maximum record limit: {max_records}. Stopping processing.")
                break
            
            # Check for the start of a <corpus> section
            if "<corpus" in line:
                inside_corpus = True
                texts, labels = [], []  # Reset lists for a new <corpus>
                corpus_count += 1
                print(f"\nProcessing <corpus> section {corpus_count}...")

            # Process each line only if we're inside a <corpus> section
            if inside_corpus:
                if "<text" in line:
                    inside_text = True  # Start of a new <text> element
                    text_buffer = [line]  # Reset the buffer

                elif inside_text:
                    text_buffer.append(line)  # Accumulate lines within <text>

                    if "</text>" in line:  # End of <text> element
                        inside_text = False
                        # Parse the accumulated <text> element
                        text_xml = "".join(text_buffer)
                        text_elem = ET.fromstring("<root>" + text_xml + "</root>")  # Wrap for valid XML

                        # Process each sentence in the <text> element
                        for sentence in text_elem.findall(".//sentence"):
                            sentence_text = []
                            sentence_labels = []

                            # Extract words from <wf> and <instance> elements
                            for word_elem in sentence:
                                if word_elem.tag == "wf":
                                    sentence_text.append(word_elem.text)
                                elif word_elem.tag == "instance":
                                    sentence_text.append(word_elem.text)
                                    # Reuse get_synset with OOV handling
                                    synset = get_synset(
                                        word_elem.attrib.get("lemma", ""),
                                        word_elem.attrib.get("pos", ""),
                                        model,
                                        tokenizer,
                                        known_synset_embeddings
                                    )
                                    sentence_labels.append({
                                        "id": word_elem.attrib.get("id", ""),
                                        "lemma": word_elem.attrib.get("lemma", ""),
                                        "pos": word_elem.attrib.get("pos", ""),
                                        "synset": synset
                                    })

                            # Append extracted sentence data to texts and labels
                            if sentence_text:
                                texts.append(" ".join(sentence_text))
                                labels.append(sentence_labels)

                        # Check if we've reached the chunk size limit
                        if len(texts) >= chunk_size:
                            # Save the chunk to a DataFrame and clear memory
                            corpus_df = pd.DataFrame({"text": texts, "labels": labels})
                            corpus_dfs.append(corpus_df)
                            print(f"Processed {chunk_counter + 1} chunks of {chunk_size} records.")
                            chunk_counter += 1
                            texts, labels = [], []  # Reset lists for the next chunk

            # Check for the end of a <corpus> section
            if "</corpus>" in line and inside_corpus:
                break # My Testing Please remove later
                inside_corpus = False
                # Save any remaining data after the last chunk
                if texts and labels:
                    corpus_df = pd.DataFrame({"text": texts, "labels": labels})
                    corpus_dfs.append(corpus_df)
                    print(f"Final chunk for <corpus> section {corpus_count}.")
                    texts, labels = [], []  # Correctly reset lists for the next corpus section

    return corpus_dfs


In [None]:
xml_path = "./WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.data.xml"
# xml_path = "cleaned_file.xml"


test_corpus_dfs = process_corpus_sections_incrementally_with_synsets(
    xml_path, model, tokenizer, known_synset_embeddings, chunk_size=5000, max_records=30000
)

# Combine all chunks into a single DataFrame (if memory permits)
combined_test_df = pd.concat(test_corpus_dfs, ignore_index=True)
print("Test dataset preview:")
print(combined_test_df.head())


Processing <corpus> sections from: ./WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.data.xml

Processing <corpus> section 1...
Processed 1 chunks of 5000 records.
Processed 2 chunks of 5000 records.
Processed 3 chunks of 5000 records.
Processed 4 chunks of 5000 records.
Processed 5 chunks of 5000 records.
Processed 6 chunks of 5000 records.
Processed 7 chunks of 5000 records.
Test dataset preview:
                                                text  \
0  How long has it been since you reviewed the ob...   
1  Have you permitted it to become a giveaway pro...   
2  What effort do you make to assess results of y...   
3  Do you measure its relation to reduced absente...   
4  Have you set specific objectives for your empl...   

                                              labels  
0  [{'id': 'd000.s000.t000', 'lemma': 'long', 'po...  
1  [{'id': 'd000.s001.t000', 'lemma': 'permit', '...  
2  [{'id': 'd000.s002.t000', 'lemma': 'effort', '...  
3  [{'id': 'd000.s003.t000', 'lemma': 'mea

In [None]:
tokenized_test_data = preprocess_dataset(combined_test_df, tokenizer, synset_to_id)
hf_test_dataset = convert_to_hf_dataset(tokenized_test_data)


In [None]:
print("Preprocessed Testing Dataset Sample:")
print(hf_test_dataset[10])

Preprocessed Testing Dataset Sample:
{'input_ids': [101, 2003, 2115, 13131, 4005, 5378, 2205, 2172, 2489, 9343, 2326, 2005, 5126, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 9675, 12726, 10072, 1111, 16729, 18231, 672, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [None]:
# fine_tuned_model = BertForTokenClassification.from_pretrained("./results")
clear_memory()
test_results = trainer.evaluate(hf_test_dataset.select(range(1000)))
print(f"Test Results: {test_results}")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 4
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Results: {'eval_loss': 12.825652122497559, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_perplexity': 5.02734375, 'eval_runtime': 43.0548, 'eval_samples_per_second': 23.226, 'eval_steps_per_second': 5.807, 'epoch': 15.0}


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


INFERENCE

In [16]:
# from transformers import TrainingArguments, Trainer
# import torch

# trainer = Trainer()

# trainer.save_model("./best_model_20241202_145605")

from transformers import BertForTokenClassification, BertTokenizerFast

# Load the model
model_path = "./best_model_20241202_175204"
model = BertForTokenClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)



In [17]:
from nltk.corpus import wordnet as wn

def get_synset_details(word, synset_name):
    """
    Fetch details of the synset: definition and examples.

    Args:
        word (str): The word to find the synset for.
        synset_name (str): The WordNet synset name (e.g., 'dog.n.01').

    Returns:
        dict: Synset details including definition and examples.
    """
    try:
        synset = wn.synset(synset_name)
        return {
            "word": word,
            "definition": synset.definition(),
            "examples": synset.examples()
        }
    except Exception as e:
        print(f"Error fetching synset details: {e}")
        return None


In [18]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

def predict_synsets(sentence, model, tokenizer, id_to_synset, max_length=128):
    """
    Predict WordNet synsets for each word in a given sentence using the fine-tuned model.
    """
    # Determine the device (GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to the device

    # Tokenize the input sentence
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    
    # Move the tokenized inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get the word IDs for alignment
    word_ids = inputs['input_ids'][0].tolist()  # Assuming batch size of 1

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predictions (logits -> argmax)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1).squeeze().tolist()

    # Map predictions to WordNet synsets
    tokens = tokenizer.tokenize(sentence)
    predicted_synsets = []
    for word_id, pred in zip(word_ids, predictions):
        if word_id is not None and pred in id_to_synset:
            predicted_synsets.append(id_to_synset[pred])
        else:
            predicted_synsets.append("UNK")  # Unknown or ignored token

    # Align tokens with synsets
    result = []
    for token, synset in zip(tokens, predicted_synsets):
        result.append((token, synset))

    return result


In [23]:
# Prompt user for input
# sentence = input("Enter a sentence: Everything happens for its own good.")
# sentence = "My money is in the bank"
# sentence = "We saw ducks near the bank"
sentence = "Star wars is a good movie"

word = "movie"


# Predict synsets
predictions = predict_synsets(sentence, model, tokenizer, id_to_synset)


# Display results
print("\nPredicted Synsets:")
for token, synset in predictions:
    print(f"{token}: {synset}")

print(predictions)
    
print("\nPredicted Synsets and Definitions:")
for token, synset_name in predictions:
    if(token == word):
        definition = get_synset_details(token, synset_name)
        print(definition)
    # else:
    #     print("No synset found")
    # if synset_name != "UNK":  # If the synset is valid
    #     definition = get_synset_details(token, synset_name)
    #     print(f"{token}: {synset_name} - {definition}")
    # else:
    #     print(f"{token}: {synset_name} - No synset found")


Predicted Synsets:
star: slave.v.01
wars: slave.v.01
is: slave.v.01
a: bottomless.s.01
good: neuromuscular.a.01
movie: neuromuscular.a.01
[('star', 'slave.v.01'), ('wars', 'slave.v.01'), ('is', 'slave.v.01'), ('a', 'bottomless.s.01'), ('good', 'neuromuscular.a.01'), ('movie', 'neuromuscular.a.01')]

Predicted Synsets and Definitions:
{'word': 'movie', 'definition': 'affecting or characteristic of both neural and muscular tissue', 'examples': []}
