In [12]:
!pip install datasets



In [13]:
import json

#list of animals
animals = ["butterfly", "cat", "chicken", "cow", "dog", "elephant", "horse", "sheep", "spider", "squirrel"]

#data generator
def generate_sentences(animals):
    sentences = []

    #templates
    sentence_templates = [
        "There is a {} in the field.",
        "I saw a {} near the barn.",
        "A wild {} was spotted in the forest.",
        "Look at that {}!",
        "I have a {} as a pet.",
        "The {} is playing in the yard.",
        "Do you see the {} over there?",
        "A {} is running fast."
    ]

    for animal in animals:
        for template in sentence_templates:
            sentence_type = template.format(animal)
            tokens = sentence_type.split()
            ner_tags = [0] * len(tokens)
            for i, token in enumerate(tokens):
                if token.lower() == animal:
                    ner_tags[i] = 1
            sentences.append({"tokens": tokens, "ner_tags": ner_tags})

    return sentences

data = generate_sentences(animals)

#saving
with open('ner_dataset.json', 'w') as f:
    json.dump(data, f, indent=2)

print("Dataset generated and saved in 'ner_dataset.json'")

Dataset generated and saved in 'ner_dataset.json'


In [14]:
import json
import random
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

#load data
with open('ner_dataset.json') as f:
    data = json.load(f)

random.shuffle(data)

#80% train, 20% eval
split_index = int(0.8 * len(data))
train_data = data[:split_index]
eval_data = data[split_index:]

#create dataset
datasets = DatasetDict({
    "train": Dataset.from_list(train_data),
    "eval": Dataset.from_list(eval_data)
})

#load bert
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, padding='max_length', is_split_into_words=True, max_length=20)
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(labels[-1])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

#tokenization
tokenized_datasets = datasets.map(tokenize_and_align_labels)

#model init
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)

training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

#training
trainer.train()

#saving
torch.save(model.state_dict(), 'ner_model.pth')
torch.save(tokenizer, 'ner_tokenizer.pth')

print("model saved")

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.070748
2,0.228300,0.000774
3,0.011400,0.000217
4,0.000900,0.000133
5,0.000300,0.000119


model saved


In [15]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load model
def load_ner_model(model_path, tokenizer_path):
    model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=2)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    tokenizer = torch.load(tokenizer_path)

    return model, tokenizer

def extract_animals(text, model, tokenizer):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**tokens).logits
    predictions = torch.argmax(outputs, dim=2)
    print(predictions)

    tokenized_text = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
    print(tokenized_text)

    animals = [tokenized_text[i] for i, label in enumerate(predictions[0]) if label == 1]
    return animals

if __name__ == "__main__":
    model_path = "ner_model.pth"
    tokenizer_path = "ner_tokenizer.pth"

    model, tokenizer = load_ner_model(model_path, tokenizer_path)

    text = "There is a cow in the picture."
    animals = extract_animals(text, model, tokenizer)

    if animals:
        print("Animals found:", animals)
    else:
        print("No animals found.")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))


tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])
['[CLS]', 'There', 'is', 'a', 'cow', 'in', 'the', 'picture', '.', '[SEP]']
Animals found: ['cow']


  tokenizer = torch.load(tokenizer_path)
