In [1]:
from nltk.corpus import wordnet as wn

def generate_prompt_file_with_examples(input_file, output_file="prompt_file_with_examples.txt"):
    """
    Read an input file with words and their synsets, retrieve meanings and example sentences,
    and generate a new prompt file with word, synset, definition, and example sentence.
    """
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            
            word, synset_name = line.strip().split(",")
            try:
            
                synset = wn.synset(synset_name)
                # Retrieve the definition
                definition = synset.definition()
                # Retrieve an example sentence or provide a placeholder
                examples = synset.examples()
                example_sentence = examples[0] if examples else f"No example available for {word}."
                # Write the word, synset, definition, and example sentence to the output file
                outfile.write(f"{word},{synset_name},{definition},{example_sentence}\n")
            except Exception as e:
                print(f"Error processing {word}, {synset_name}: {e}")
    print(f"Enhanced prompt file generated at: {output_file}")


<h4>Generating prompt file using input file</h4>

In [2]:
input_file = "input_file.txt"
generate_prompt_file_with_examples(input_file)


Enhanced prompt file generated at: prompt_file_with_examples.txt


<h4> Retrieving more unique examples from wordnet and appending them to inpput file </h4>

In [3]:

import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn

# Generate  examples from WordNet
output_file = "prompt_file_append.txt"

unique_words = set()  # To ensure uniqueness
with open(output_file, "w") as file:
    for synset in list(wn.all_synsets('n')): 
        if len(unique_words) >= 1000: #we can change the number for the number of examples
            break
        word = synset.lemmas()[0].name()
        if word in unique_words:
            continue
        unique_words.add(word)
        definition = synset.definition()
        example_sentence = synset.examples()[0] if synset.examples() else f"No example available for {word}."
        synset_id = synset.name()
        line = f"{word},{synset_id},{definition},{example_sentence}\n"
        file.write(line)



[nltk_data] Downloading package wordnet to /home/szele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<h4>Model fine tuning below</h4>

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Initialize BART Pretrained Model and Tokenizer
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def load_prompt_file_with_examples(prompt_file):
    """
    Load the enhanced prompt file and convert it into a dataset.

    Args:
        prompt_file (str): Path to the enhanced prompt file.

    Returns:
        Dataset: Hugging Face Dataset containing input-output pairs.
    """
    prompts = []
    with open(prompt_file, "r") as file:
        for line_num, line in enumerate(file, start=1):
            try:
                # Ensure splitting works for four fields
                word, synset_name, definition, example_sentence = line.strip().split(",", maxsplit=3)
                input_text = (
                    f"Generate sentences using the word '{word}' in the sense of '{synset_name}', "
                    f"which means: {definition}."
                )
                output_text = example_sentence
                prompts.append({"input_text": input_text, "output_text": output_text})
            except ValueError as e:
                print(f"Skipping line {line_num}: {line.strip()} - {e}")
    print(f"Loaded {len(prompts)} valid prompts from the file.")
    return Dataset.from_list(prompts)



def preprocess_dataset(dataset):
    """
    Preprocess the dataset for BART fine-tuning.

    Args:
        dataset (Dataset): Hugging Face Dataset containing input-output pairs.

    Returns:
        Dataset: Tokenized dataset.
    """
    def preprocess(batch):
        inputs = tokenizer(
            batch["input_text"], truncation=True, padding="max_length", max_length=512
        )
        outputs = tokenizer(
            batch["output_text"], truncation=True, padding="max_length", max_length=512
        )
        batch["input_ids"] = inputs["input_ids"]
        batch["attention_mask"] = inputs["attention_mask"]
        batch["labels"] = outputs["input_ids"]
        return batch

    return dataset.map(preprocess, batched=True)

def fine_tune_bart(dataset, output_dir="bart_finetuned", epochs=1):
    """
    Fine-tune the BART model.

    Args:
        dataset (Dataset): Tokenized dataset for training.
        output_dir (str): Directory to save the fine-tuned model.
        epochs (int): Number of epochs for training.

    Returns:
        BartForConditionalGeneration: The fine-tuned model.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        save_steps=500,
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=100,
        learning_rate=5e-4,
        save_total_limit=2,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model


prompt_file = "prompt_file_append.txt"
output_dir = "bart_finetuned"

#Load and preprocess the dataset
dataset = load_prompt_file_with_examples(prompt_file)
print(f"Dataset size: {len(dataset)}")
print(dataset[0])  # Inspect the first input-output pair

tokenized_dataset = preprocess_dataset(dataset)
print(f"Tokenized dataset size: {len(tokenized_dataset)}")
print(tokenized_dataset[0])  # Inspect a tokenized pair

# Fine-tune the model
fine_tuned_model = fine_tune_bart(tokenized_dataset, output_dir=output_dir, epochs=1)


Loaded 1000 valid prompts from the file.
Dataset size: 1000
{'input_text': "Generate sentences using the word 'entity' in the sense of 'entity.n.01', which means: that which is perceived or known or inferred to have its own distinct existence (living or nonliving).", 'output_text': 'No example available for entity.'}


Map: 100%|██████████| 1000/1000 [00:00<00:00, 2193.98 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Tokenized dataset size: 1000
{'input_text': "Generate sentences using the word 'entity' in the sense of 'entity.n.01', which means: that which is perceived or known or inferred to have its own distinct existence (living or nonliving).", 'output_text': 'No example available for entity.', 'input_ids': [0, 40025, 877, 11305, 634, 5, 2136, 128, 46317, 108, 11, 5, 1472, 9, 128, 46317, 4, 282, 4, 2663, 3934, 61, 839, 35, 14, 61, 16, 9568, 50, 684, 50, 42870, 7, 33, 63, 308, 11693, 8066, 36, 26111, 50, 786, 26111, 322, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

Step,Training Loss,Validation Loss
100,1.6187,4.859058


<h4>Sentence generation code</h4>

In [5]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the fine-tuned model
fine_tuned_model_dir = "bart_finetuned"
tokenizer = BartTokenizer.from_pretrained(fine_tuned_model_dir)
model = BartForConditionalGeneration.from_pretrained(fine_tuned_model_dir)

def generate_sentences_from_model(word, synset_name, definition, num_sentences=3):
    """
    Generate sentences using the fine-tuned BART model.

    Args:
        word (str): The target word.
        synset_name (str): The WordNet synset name.
        definition (str): The definition of the word in the given synset.
        num_sentences (int): Number of sentences to generate.

    Returns:
        list: A list of generated sentences.
    """
    # Prepare the input prompt
    prompt = (
        f"Generate {num_sentences} distinct sentences using the word '{word}' "
        f"in the sense of '{synset_name}', which means: {definition}. "
        "Make sure each sentence is simple and unique."
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Generate sentences
    outputs = model.generate(
        inputs["input_ids"],
        max_length=200,
        num_beams=5,
        num_return_sequences=num_sentences,
        do_sample=True,  # Enable sampling
        temperature=0.7,  # Adjust randomness
        top_k=50,  # Nucleus sampling
        top_p=0.95,  # Nucleus sampling
    )

    # Decode and return sentences
    sentences = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return sentences

# Function to test the model with a single word and synset
def test_model(word, synset_name, definition, num_sentences=3):
    print(f"Testing model for word: '{word}' and synset: '{synset_name}'")
    print(f"Definition: {definition}")
    print("\nGenerated Sentences:")
    
    sentences = generate_sentences_from_model(word, synset_name, definition, num_sentences)
    for i, sentence in enumerate(sentences, 1):
        print(f"{i}. {sentence}")


openi api code

In [6]:
from nltk.corpus import wordnet as wn

def get_synset_details(word, synset_name):
    """
    Fetch details of the synset: definition and examples.

    Args:
        word (str): The word to find the synset for.
        synset_name (str): The WordNet synset name (e.g., 'dog.n.01').

    Returns:
        dict: Synset details including definition and examples.
    """
    try:
        synset = wn.synset(synset_name)
        return {
            "word": word,
            "definition": synset.definition(),
            "examples": synset.examples()
        }
    except Exception as e:
        print(f"Error fetching synset details: {e}")
        return None


In [7]:
# Test the model with the word "dog"
test_model(
    word="dog",
    synset_name="dog.n.01",
    definition="A member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds.",
    num_sentences=3
)


Testing model for word: 'dog' and synset: 'dog.n.01'
Definition: A member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds.

Generated Sentences:
1. No. partisan. Jew Jew. Jew. Fed. Jew extra. Jew host.. extra.. Jew. freeze. Jew..... externalToEVA.chieve..","...chieve.tained.Coun.tained. externalToEVACoun.Coun..roud externalToEVA..Coun. host. externalToEVA.ARGET.roud."},. innings.ention.DS.ulpt...CounCoun‐tained hostCoun"},.chieve.tained..paioroud. hostactionDate."}, inningsmbudsmanactionDatechievenesium..assed Jew.isure...","antaention.assed.
2. No. partisan. Jew Jew. Jew. Fed. Jew extra. Jew host.. extra.. Jew. freeze. Jew..... externalToEVA.chieve..","...chieve.tained.Coun.tained. externalToEVACoun.Coun..roud externalToEVA..Coun. host. externalToEVA.ARGET.roud."},. innings.ention.DS.ulpt...CounCoun‐tained hostCoun"},.chieve.tained..paioroud. hostactionDate."}, inningsmbudsmanactionDatechieve

In [None]:
import openai
from nltk.corpus import wordnet as wn


openai.api_key = ""
def get_synset_details(word, synset_name):
    """
    Fetch details of the synset: definition and examples.

    Args:
        word (str): The word to find the synset for.
        synset_name (str): The WordNet synset name (e.g., 'dog.n.01').

    Returns:
        dict: Synset details including definition and examples.
    """
    try:
        synset = wn.synset(synset_name)
        return {
            "word": word,
            "definition": synset.definition(),
            "examples": synset.examples() if synset.examples() else [f"No example available for {word}."]
        }
    except Exception as e:
        print(f"Error fetching synset details: {e}")
        return None

def generate_sentences(word, synset_name, num_sentences=3):
    """
    Generate sentences using GPT-4 based on a word's WordNet synset meaning.

    Args:
        word (str): The target word.
        synset_name (str): The WordNet synset name.
        num_sentences (int): Number of sentences to generate.

    Returns:
        list: A list of generated sentences.
    """
    # Fetch synset details
    details = get_synset_details(word, synset_name)
    if not details:
        return [f"Error: Synset details not found for {word}, {synset_name}."]

    # Create a structured prompt
    prompt = (
        f"Generate {num_sentences} distinct sentences using the word '{word}' in the sense of '{synset_name}', "
        f"which means: {details['definition']}. "
        "Here are some examples to guide you: "
        f"{' '.join(details['examples'])} "
        "Make sure each sentence is simple, unique, and reflects this meaning."
    )

    # GPT-4 API with structured prompts
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an assistant specializing in creating contextually accurate sentences."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=300,
        n=num_sentences
    )

    sentences = [choice["message"]["content"] for choice in response["choices"]]
    return sentences

# Example usage
word = "dog"
synset_name = "dog.n.01"
generated_sentences = generate_sentences(word, synset_name, num_sentences=3)

print("\n".join(generated_sentences))


1. My neighbor's dog, a large golden retriever, loves to play fetch in the park.
2. I adopted a small, stray dog that I found wandering in my neighborhood.
3. The little girl was afraid of the big, black dog that lived next door.
1. The small, fluffy dog wagged its tail excitedly as its owner approached.
2. The Labrador, a popular breed of dog, is known for its friendly and outgoing nature.
3. The dog dug a hole in the backyard, burying its favorite toy for safekeeping.
1. The small child giggled as the playful dog licked her face.
2. The loyal dog stayed by its owner's side even during the storm.
3. After a long day at work, the man was greeted by the excited barks of his dog.
