In [1]:
from transformers import BertForTokenClassification, BertTokenizerFast

In [2]:
from datasets import Dataset

In [3]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/svaidya4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/svaidya4/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
from io import StringIO
from nltk.corpus import wordnet as wn

# Helper function to get WordNet synset
def get_synset(lemma, pos):
    """
    Map a lemma and POS to its corresponding WordNet synset.
    """
    pos_map = {
        "NOUN": wn.NOUN,
        "VERB": wn.VERB,
        "ADJ": wn.ADJ,
        "ADV": wn.ADV
    }
    wn_pos = pos_map.get(pos)
    if wn_pos:
        synsets = wn.synsets(lemma, pos=wn_pos)
        if synsets:
            return synsets[0].name()  # Return the first synset
    return "UNK"  # Unknown synset

def process_corpus_incrementally(xml_path, batch_size=5000, checkpoint_dir="checkpoints"):
    """
    Incrementally process the XML file with </corpus> tag using batches.
    """
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    corpus_count = 0
    batch_count = 0
    texts, labels = [], []
    buffer = []
    inside_corpus = False

    with open(xml_path, "r") as file:
        for line in file:
            buffer.append(line)

            # Detect the start of a corpus
            if "<corpus" in line:
                inside_corpus = True

            # Detect the end of a corpus
            if "</corpus>" in line and inside_corpus:
                inside_corpus = False
                corpus_count += 1
                print(f"Processing <corpus> section {corpus_count}...")

                # Process the accumulated corpus buffer
                corpus_data = "".join(buffer)
                buffer = []  # Clear buffer for the next corpus

                # Use StringIO to provide a file-like object for iterparse
                corpus_file = StringIO(corpus_data)
                context = ET.iterparse(corpus_file, events=("start", "end"))
                context = iter(context)

                for event, elem in context:
                    if event == "end" and elem.tag == "sentence":
                        sentence_text = []
                        sentence_labels = []

                        # Extract words from <wf> and <instance> elements
                        for word_elem in elem:
                            if word_elem.tag == "wf":
                                sentence_text.append(word_elem.text)
                            elif word_elem.tag == "instance":
                                sentence_text.append(word_elem.text)
                                synset = get_synset(word_elem.attrib.get("lemma", ""), word_elem.attrib.get("pos", ""))
                                sentence_labels.append({
                                    "id": word_elem.attrib.get("id", ""),
                                    "lemma": word_elem.attrib.get("lemma", ""),
                                    "pos": word_elem.attrib.get("pos", ""),
                                    "synset": synset
                                })

                        # Append extracted sentence data to lists
                        if sentence_text:
                            texts.append(" ".join(sentence_text))
                            labels.append(sentence_labels)

                        # Free memory for processed elements
                        elem.clear()

                    # Save a batch when the batch size is reached
                    if len(texts) >= batch_size:
                        batch_count += 1
                        checkpoint_path = os.path.join(checkpoint_dir, f"batch_{corpus_count}_{batch_count}.csv")
                        save_checkpoint(texts, labels, checkpoint_path)
                        texts, labels = [], []  # Clear batch memory

                # Save remaining data in this corpus
                if texts:
                    batch_count += 1
                    checkpoint_path = os.path.join(checkpoint_dir, f"batch_{corpus_count}_{batch_count}.csv")
                    save_checkpoint(texts, labels, checkpoint_path)
                    texts, labels = [], []

    print(f"Processed {corpus_count} <corpus> sections.")

def save_checkpoint(texts, labels, file_path):
    """
    Save a batch of processed data to a checkpoint CSV.
    """
    df = pd.DataFrame({"text": texts, "labels": labels})
    df.to_csv(file_path, index=False)
    print(f"Saved checkpoint to {file_path}")

def load_checkpoints(checkpoint_dir="checkpoints"):
    """
    Load all checkpoint files and combine them into a single DataFrame.
    """
    dfs = []
    for file_name in sorted(os.listdir(checkpoint_dir)):
        if file_name.endswith(".csv"):
            file_path = os.path.join(checkpoint_dir, file_name)
            dfs.append(pd.read_csv(file_path))
    return pd.concat(dfs, ignore_index=True)


In [5]:

# # Path to the XML file
# xml_path = "./WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.data.xml"
# # Incrementally process the XML file and save checkpoints
# process_corpus_incrementally(xml_path, batch_size=5000, checkpoint_dir="checkpoints")

#Combine all checkpoints into a single DataFrame
combined_df = load_checkpoints("checkpoints")

# Preview the combined DataFrame
print("Combined dataset preview:")
print(combined_df.head())

Combined dataset preview:
                                                text  \
0  How long has it been since you reviewed the ob...   
1  Have you permitted it to become a giveaway pro...   
2  What effort do you make to assess results of y...   
3  Do you measure its relation to reduced absente...   
4  Have you set specific objectives for your empl...   

                                              labels  
0  [{'id': 'd000.s000.t000', 'lemma': 'long', 'po...  
1  [{'id': 'd000.s001.t000', 'lemma': 'permit', '...  
2  [{'id': 'd000.s002.t000', 'lemma': 'effort', '...  
3  [{'id': 'd000.s003.t000', 'lemma': 'measure', ...  
4  [{'id': 'd000.s004.t000', 'lemma': 'set', 'pos...  


In [6]:
import ast  # To safely evaluate string representations of lists/dicts

def extract_unique_synsets(combined_df):
    """
    Extract unique synsets from the combined DataFrame.

    Args:
        combined_df (pd.DataFrame): The combined DataFrame with a 'labels' column.

    Returns:
        set: A set of unique synsets.
    """
    unique_synsets = set()

    for labels in combined_df['labels']:
        # Convert the string representation of labels back to a Python list
        try:
            label_list = ast.literal_eval(labels)
        except ValueError as e:
            print(f"Error parsing labels: {e}")
            continue

        for label in label_list:
            synset = label.get('synset', 'UNK')  # Extract synset
            if synset != 'UNK':  # Exclude unknown synsets
                unique_synsets.add(synset)

    return unique_synsets


In [7]:
# Extract unique synsets from the combined DataFrame
unique_synsets = extract_unique_synsets(combined_df)

print(len(unique_synsets))

synset_to_id = {synset: idx for idx, synset in enumerate(unique_synsets)}
id_to_synset = {idx: synset for synset, idx in synset_to_id.items()}


18985


In [8]:
# from transformers import BertForTokenClassification, BertTokenizerFast
import torch

# Initialize model and tokenizer
model = BertForTokenClassification.from_pretrained("bert-large-uncased", num_labels=len(synset_to_id))
tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")

# Precompute embeddings for known synsets
known_synset_embeddings = {
    synset: model.get_input_embeddings()(
        torch.tensor([tokenizer.convert_tokens_to_ids(synset)])
    ).detach().numpy()
    for synset in synset_to_id.keys()
}


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized

In [9]:
import ast

def preprocess_dataset(data_df, tokenizer, synset_to_id, max_length=128):
    """
    Preprocess the dataset by tokenizing the text and aligning labels.

    Args:
        data_df (pd.DataFrame): Parsed dataset with 'text' and 'labels'.
        tokenizer: BERT tokenizer.
        synset_to_id (dict): Mapping of synsets to numeric IDs.
        max_length (int): Maximum sequence length for tokenized inputs.

    Returns:
        List[dict]: Preprocessed data with tokenized inputs and aligned labels.
    """
    tokenized_data = []

    for _, row in data_df.iterrows():
        text = row["text"]
        
        # Parse the string representation of labels into a Python list
        try:
            labels = ast.literal_eval(row["labels"])
        except ValueError as e:
            print(f"Error parsing labels: {e}")
            continue

        # Tokenize the text
        tokenized_inputs = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        word_ids = tokenized_inputs.word_ids(batch_index=0)

        # Map synsets to numeric IDs
        label_dict = {
            int(label["id"].split(".")[-1].lstrip("t")): synset_to_id.get(label["synset"], -100)
            for label in labels
        }

        # Align labels with tokens
        token_labels = []
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                token_labels.append(-100)
            elif word_id in label_dict:
                token_labels.append(label_dict[word_id])
            else:
                token_labels.append(-100)

        tokenized_inputs["labels"] = token_labels
        tokenized_data.append({
            "input_ids": tokenized_inputs["input_ids"].squeeze().tolist(),
            "attention_mask": tokenized_inputs["attention_mask"].squeeze().tolist(),
            "labels": token_labels
        })

    return tokenized_data


In [None]:
# Not required, Just load the dataframe
def convert_to_hf_dataset(tokenized_data):
    return Dataset.from_dict({
        "input_ids": [data["input_ids"] for data in tokenized_data],
        "attention_mask": [data["attention_mask"] for data in tokenized_data],
        "labels": [data["labels"] for data in tokenized_data],
    })

# Preprocess the parsed data
tokenized_data = preprocess_dataset(combined_df, tokenizer, synset_to_id)

# Convert to Hugging Face Dataset
hf_dataset = convert_to_hf_dataset(tokenized_data)

In [None]:
# Not required
# Save the HF Dataset

hf_dataset.to_csv("HF_Dataframe.csv", index=False)


In [10]:
# Load the CSV file into a DataFrame

import re
import pandas as pd
from datasets import Dataset

dataset = pd.read_csv("HF_Dataframe.csv")

# Define patterns for cleaning
pattern = re.compile(r"\s+")  # Matches multiple spaces and \n
bracket_pattern = re.compile(r"^\[|\]$")  # Matches opening and closing square brackets at the start or end of a string

# Function to clean spaces, newlines, and remove brackets
def clean_and_strip(series):
    return series.map(
        lambda x: bracket_pattern.sub("", pattern.sub(" ", x)).strip() if isinstance(x, str) else x
    )

# Function to split cleaned strings into lists of integers
def split_to_int_list(series):
    return series.map(
        lambda x: [int(i) for i in x.split()] if isinstance(x, str) else x
    )

# Apply cleaning to all relevant columns
dataset["input_ids"] = clean_and_strip(dataset["input_ids"])
dataset["attention_mask"] = clean_and_strip(dataset["attention_mask"])
dataset["labels"] = clean_and_strip(dataset["labels"])

# Convert cleaned strings into lists of integers
dataset["input_ids"] = split_to_int_list(dataset["input_ids"])
dataset["attention_mask"] = split_to_int_list(dataset["attention_mask"])
dataset["labels"] = split_to_int_list(dataset["labels"])

# Convert Pandas DataFrame to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Display the first row of the Hugging Face Dataset
print(hf_dataset[0])


{'input_ids': [101, 2129, 2146, 2038, 2009, 2042, 2144, 2017, 8182, 1996, 11100, 1997, 2115, 5770, 1998, 2326, 2565, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 15904, 7188, 3700, 2869, 11771, 17452, 3663, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 

In [11]:
print(hf_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 850974
})


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import torch
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    import numpy as np
    import torch

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Mask ignored labels
    labels = np.array(labels)
    valid_labels = labels != -100
    true_labels = labels[valid_labels]
    true_predictions = predictions[valid_labels]

    # Calculate Metrics
    accuracy = accuracy_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions, average="weighted", zero_division=0)
    recall = recall_score(true_labels, true_predictions, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, true_predictions, average="weighted")

    # Calculate Perplexity
    logits_tensor = torch.tensor(logits).float()  # Ensure logits are float32
    with torch.cuda.amp.autocast(enabled=False):  # Disable mixed precision for softmax
        probs = torch.softmax(logits_tensor, dim=-1).numpy()
    log_probs = np.log(np.max(probs, axis=-1) + 1e-9)  # Adding epsilon for numerical stability
    perplexity = np.exp(-np.mean(log_probs[valid_labels]))

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "perplexity": perplexity,
    }



In [13]:
def clear_memory():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [14]:
import torch
print(torch.cuda.is_available())


True


In [15]:
# Randomly split the dataset into 70% (discarded) and 30% (used for training)
split_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

hf_dataset_split = split_dataset["test"]

In [16]:
from transformers import TrainingArguments, Trainer
import torch
from datetime import datetime

# clear_memory()

# Set up GPU for training
torch.backends.cuda.matmul.allow_tf32 = True

# Enable gradient checkpointing for memory optimization
model.gradient_checkpointing_enable()

# Move model to GPU for training
model.to(torch.device("cuda"))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results_",
    evaluation_strategy="epoch",  # Disable automatic evaluation
    # evaluation_strategy="no",  # Disable automatic evaluation
    learning_rate=5e-5,
    per_device_train_batch_size=8,  # Reduce if memory issues persist
    per_device_eval_batch_size=4,  # Smaller batch size for evaluation
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    optim="adamw_torch",  # Optimizer for low-memory scenarios
    gradient_accumulation_steps=8,  # Simulate larger batch sizes
    fp16=True,  # Mixed precision training
    load_best_model_at_end=True,  # Enable automatic loading
    # load_best_model_at_end=False,  # Disable automatic loading
    eval_accumulation_steps = 50,
    lr_scheduler_type="cosine",
)

# Split dataset into train and validation
train_val_split = hf_dataset_split.train_test_split(test_size=0.0125, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Move validation dataset to CPU
val_dataset = val_dataset.map(lambda x: {k: torch.tensor(v).to("cpu") for k, v in x.items()})

# from transformers import BertForTokenClassification, BertTokenizerFast

# Load the model
model_path = "./best_model_20241206_165542"
model = BertForTokenClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=None,  # No automatic evaluation during training
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add custom metrics
)

best_metric = float("inf")  # Assuming lower metric is better (e.g., loss)

model.to(torch.device("cuda"))
clear_memory()
trainer.train()
# trainer.train(resume_from_checkpoint="./results_/checkpoint-1323")

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
best_model_path = f"./best_model_{current_timestamp}"
print(f"New best model found! Saving to {best_model_path}")
trainer.save_model(best_model_path)

# Final best model path
print(f"Best model saved at: {best_model_path}")


Map:   0%|          | 0/2128 [00:00<?, ? examples/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 168067
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 2626
  Number of trainable parameters = 353551913
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Perplexity
0,0.6978,1.253096,0.669673,0.66543,0.669673,0.644817,1.521244


***** Running Evaluation *****
  Num examples = 2128
  Batch size = 4
Saving model checkpoint to ./results_/checkpoint-2626
Configuration saved in ./results_/checkpoint-2626/config.json
Model weights saved in ./results_/checkpoint-2626/pytorch_model.bin
tokenizer config file saved in ./results_/checkpoint-2626/tokenizer_config.json
Special tokens file saved in ./results_/checkpoint-2626/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results_/checkpoint-2626 (score: 1.2530956268310547).
Saving model checkpoint to ./best_model_20241206_185244
Configuration saved in ./best_model_20241206_185244/config.json


New best model found! Saving to ./best_model_20241206_185244


Model weights saved in ./best_model_20241206_185244/pytorch_model.bin
tokenizer config file saved in ./best_model_20241206_185244/tokenizer_config.json
Special tokens file saved in ./best_model_20241206_185244/special_tokens_map.json


Best model saved at: ./best_model_20241206_185244


In [17]:
from transformers import TrainingArguments, Trainer
import torch
from datetime import datetime

# clear_memory()

# Set up GPU for training
torch.backends.cuda.matmul.allow_tf32 = True

# Enable gradient checkpointing for memory optimization
model.gradient_checkpointing_enable()

# Move model to GPU for training
model.to(torch.device("cuda"))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Disable automatic evaluation
    # evaluation_strategy="no",  # Disable automatic evaluation
    learning_rate=1e-5,
    per_device_train_batch_size=8,  # Reduce if memory issues persist
    per_device_eval_batch_size=4,  # Smaller batch size for evaluation
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    optim="adamw_torch",  # Optimizer for low-memory scenarios
    gradient_accumulation_steps=8,  # Simulate larger batch sizes
    fp16=True,  # Mixed precision training
    load_best_model_at_end=True,  # Enable automatic loading
    # load_best_model_at_end=False,  # Disable automatic loading
    eval_accumulation_steps = 50,
    lr_scheduler_type="cosine",
)

# Split dataset into train and validation
train_val_split = hf_dataset_split.train_test_split(test_size=0.00125, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Move validation dataset to CPU
val_dataset = val_dataset.map(lambda x: {k: torch.tensor(v).to("cpu") for k, v in x.items()})

# from transformers import BertForTokenClassification, BertTokenizerFast

# # Load the model
# model_path = "./best_model_20241204_114605"
# model = BertForTokenClassification.from_pretrained(model_path)

# # Load the tokenizer
# tokenizer = BertTokenizerFast.from_pretrained(model_path)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=None,  # No automatic evaluation during training
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add custom metrics
)

best_metric = float("inf")  # Assuming lower metric is better (e.g., loss)

model.to(torch.device("cuda"))
clear_memory()
trainer.train()
# trainer.train(resume_from_checkpoint="./results/checkpoint-11030")

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
best_model_path = f"./best_model_{current_timestamp}"
print(f"New best model found! Saving to {best_model_path}")
trainer.save_model(best_model_path)

# Final best model path
print(f"Best model saved at: {best_model_path}")


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 169982
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 2656
  Number of trainable parameters = 353551913


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Perplexity
1,0.7352,1.317335,0.676806,0.645754,0.676806,0.648687,1.335419


***** Running Evaluation *****
  Num examples = 213
  Batch size = 4
Saving model checkpoint to ./results/checkpoint-2656
Configuration saved in ./results/checkpoint-2656/config.json
Model weights saved in ./results/checkpoint-2656/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2656/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2656/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-2656 (score: 1.3173346519470215).
Saving model checkpoint to ./best_model_20241206_194927
Configuration saved in ./best_model_20241206_194927/config.json


New best model found! Saving to ./best_model_20241206_194927


Model weights saved in ./best_model_20241206_194927/pytorch_model.bin
tokenizer config file saved in ./best_model_20241206_194927/tokenizer_config.json
Special tokens file saved in ./best_model_20241206_194927/special_tokens_map.json


Best model saved at: ./best_model_20241206_194927


In [None]:
##### Only run this if not training the model. Used to load it

model_path = "./best_model_20241206_194927"
model = BertForTokenClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add custom metrics
)

In [19]:
from nltk.corpus import wordnet as wn

def get_synset_details(word, synset_name):
    """
    Fetch details of the synset: definition and examples.

    Args:
        word (str): The word to find the synset for.
        synset_name (str): The WordNet synset name (e.g., 'dog.n.01').

    Returns:
        dict: Synset details including definition and examples.
    """
    try:
        synset = wn.synset(synset_name)
        return {
            "word": word,
            "definition": synset.definition(),
            "examples": synset.examples()
        }
    except Exception as e:
        print(f"Error fetching synset details: {e}")
        return None


In [20]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

def predict_synsets(sentence, model, tokenizer, id_to_synset, max_length=128):
    """
    Predict WordNet synsets for each word in a given sentence using the fine-tuned model.
    """
    # Determine the device (GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to the device

    # Tokenize the input sentence
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
        is_split_into_words=False
    )
    
    # Move inputs to the device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits and predictions
    logits = outputs.logits
    predictions = logits.argmax(dim=-1).squeeze().tolist()

    # Map predictions to WordNet synsets
    input_ids = inputs['input_ids'].squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    word_ids = tokenizer.convert_ids_to_tokens(input_ids, skip_special_tokens=True)

    # Handle wordpiece alignment
    predicted_synsets = []
    for token_id, pred in zip(word_ids, predictions):
        if pred in id_to_synset:
            predicted_synsets.append(id_to_synset[pred])
        else:
            predicted_synsets.append("UNK")

    # Aggregate results by word
    tokenized_sentence = tokenizer.tokenize(sentence)
    aggregated_results = []
    for token, synset in zip(tokenized_sentence, predicted_synsets):
        aggregated_results.append((token, synset))

    return aggregated_results


In [21]:
# sentence = input("Enter a sentence: Everything happens for its own good.")
# sentence = "I was delighted to find money in the bank"
sentence = "Everything happens for its own good."

# sentence = "We saw ducks near the bank"


word = "own"


# Predict synsets
predictions = predict_synsets(sentence, model, tokenizer, id_to_synset)


# Display results|
print("\nPredicted Synsets:")
for token, synset in predictions:
    print(f"{token}: {synset}")

print(predictions)
    
print("\nPredicted Synsets and Definitions:")
for token, synset_name in predictions:
    if(token == word):
        definition = get_synset_details(token, synset_name)
        print(definition)
    # else:
    #     print("No synset found")
    # if synset_name != "UNK":  # If the synset is valid
    #     definition = get_synset_details(token, synset_name)
    #     print(f"{token}: {synset_name} - {definition}")
    # else:
    #     print(f"{token}: {synset_name} - No synset found")


Predicted Synsets:
everything: liquid.a.01
happens: praise.n.01
for: liquid.a.01
its: liquid.a.01
own: liquid.a.01
good: liquid.a.01
.: liquid.a.01
[('everything', 'liquid.a.01'), ('happens', 'praise.n.01'), ('for', 'liquid.a.01'), ('its', 'liquid.a.01'), ('own', 'liquid.a.01'), ('good', 'liquid.a.01'), ('.', 'liquid.a.01')]

Predicted Synsets and Definitions:
{'word': 'own', 'definition': 'existing as or having characteristics of a liquid; especially tending to flow', 'examples': ['water and milk and blood are liquid substances']}
