In [None]:
!pip install evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m707.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4

In [None]:
import os

import evaluate
import numpy as np
import regex as re
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import pipeline

# Switch off the Weights & Biases logging integration for this session.
os.environ.update(WANDB_DISABLED="true")


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]`` and uses the
    module-level ``tokenizer``. The first sub-token of every word receives the
    word's tag; positions without a word id (special tokens) and the remaining
    sub-tokens of a word receive -100.

    Returns the tokenizer output with an extra ``"labels"`` entry holding one
    list of label ids per example.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering which word the previous one belonged to.
        aligned = []
        last_word = None
        for word in word_ids:
            starts_new_word = word is not None and word != last_word
            aligned.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        return aligned

    encoded["labels"] = [
        align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

def compute_metrics(p):
    """Score a ``(predictions, labels)`` pair with the module-level ``seqeval`` metric.

    The highest-scoring class along axis 2 is taken as the prediction for each
    position. Positions whose reference label is -100 are dropped, the remaining
    ids are mapped to tag names through the module-level ``label_list``, and the
    overall precision, recall, F1 and accuracy reported by ``seqeval`` are returned.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only the positions that carry a real reference label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


# Prepare data

In [None]:
from datasets import Dataset, DatasetDict

# Tag vocabulary: a tag's position in this list is its integer id.
# Id 0 is the 'O' tag; every other id names one animal class.
id2label = dict(enumerate([
    'O',
    'B-Badger',
    'B-Bat',
    'B-Bear',
    'B-Bee',
    'B-Beetle',
    'B-Bison',
    'B-Boar',
    'B-Butterfly',
    'B-Cat',
    'B-Caterpillar',
    'B-Antelope',
]))

# Reverse lookup: tag name -> integer id.
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

# Hand-written training sentences.
# Each record has a string "id", the sentence split into word "tokens", and
# "ner_tags" with one integer per token: 0 for ordinary words, otherwise the
# id of the animal tag as listed in id2label.
train_data = [
    {"id": "0", "tokens": ["Look", "at", "that", "antelope", "running"], "ner_tags": [0, 0, 0, 11, 0]},
    {"id": "1", "tokens": ["A", "badger", "is", "digging", "a", "hole"], "ner_tags": [0, 1, 0, 0, 0, 0]},
    {"id": "2", "tokens": ["I", "think", "that", "is", "a", "bat"], "ner_tags": [0, 0, 0, 0, 0, 2]},
    {"id": "3", "tokens": ["The", "bear", "is", "climbing", "the", "tree"], "ner_tags": [0, 3, 0, 0, 0, 0]},
    {"id": "4", "tokens": ["Watch", "out", "for", "the", "bee"], "ner_tags": [0, 0, 0, 0, 4]},
    {"id": "5", "tokens": ["I", "saw", "a", "shiny", "beetle", "today"], "ner_tags": [0, 0, 0, 0, 5, 0]},
    {"id": "6", "tokens": ["The", "bison", "herd", "is", "huge"], "ner_tags": [0, 6, 0, 0, 0]},
    {"id": "7", "tokens": ["Could", "that", "be", "a", "wild", "boar"], "ner_tags": [0, 0, 0, 0, 0, 7]},
    {"id": "8", "tokens": ["I", "love", "watching", "butterflies", "in", "spring"], "ner_tags": [0, 0, 0, 8, 0, 0]},
    {"id": "9", "tokens": ["My", "cat", "is", "sleeping", "again"], "ner_tags": [0, 9, 0, 0, 0]},
    {"id": "10", "tokens": ["That", "caterpillar", "will", "turn", "into", "a", "butterfly"], "ner_tags": [0, 10, 0, 0, 0, 0, 8]},
    {"id": "11", "tokens": ["Is", "that", "an", "antelope", "in", "the", "distance"], "ner_tags": [0, 0, 0, 11, 0, 0, 0]},
    {"id": "12", "tokens": ["A", "badger", "just", "crossed", "the", "road"], "ner_tags": [0, 1, 0, 0, 0, 0]},
    {"id": "13", "tokens": ["I", "heard", "a", "bat", "flying", "nearby"], "ner_tags": [0, 0, 0, 2, 0, 0]},
    {"id": "14", "tokens": ["The", "bear", "left", "paw", "prints"], "ner_tags": [0, 3, 0, 0, 0]},
    {"id": "15", "tokens": ["Be", "careful", "of", "that", "bee"], "ner_tags": [0, 0, 0, 0, 4]},
    {"id": "16", "tokens": ["This", "beetle", "looks", "so", "colorful"], "ner_tags": [0, 5, 0, 0, 0]},
    {"id": "17", "tokens": ["The", "bison", "moved", "slowly", "across", "the", "field"], "ner_tags": [0, 6, 0, 0, 0, 0, 0]},
    {"id": "18", "tokens": ["Did", "you", "see", "that", "boar", "near", "the", "trees"], "ner_tags": [0, 0, 0, 0, 7, 0, 0, 0]},
    {"id": "19", "tokens": ["A", "butterfly", "landed", "on", "my", "hand"], "ner_tags": [0, 8, 0, 0, 0, 0]},
    {"id": "20", "tokens": ["The", "caterpillar", "is", "crawling", "on", "the", "leaf"], "ner_tags": [0, 10, 0, 0, 0, 0, 0]}
]

# Hand-written validation sentences, in the same record layout as train_data
# ("id", "tokens", and one integer tag per token in "ner_tags").
# NOTE(review): ids here start at "1", and no record carries tag 9 ('B-Cat').
validation_data = [
    {"id": "1", "tokens": ["Is", "that", "an", "antelope"], "ner_tags": [0, 0, 0, 11]},
    {"id": "2", "tokens": ["A", "badger", "is", "here"], "ner_tags": [0, 1, 0, 0]},
    {"id": "3", "tokens": ["I", "see", "a", "bat", "here"], "ner_tags": [0, 0, 0, 2, 0]},
    {"id": "4", "tokens": ["The", "bear", "is", "here"], "ner_tags": [0, 3, 0, 0]},
    {"id": "5", "tokens": ["It", "looks", "like", "a", "bee"], "ner_tags": [0, 0, 0, 0, 4]},
    {"id": "6", "tokens": ["This", "beetle", "looks", "nice"], "ner_tags": [0, 5, 0, 0]},
    {"id": "7", "tokens": ["The", "bison", "in", "the", "picture"], "ner_tags": [0, 6, 0, 0, 0]},
    {"id": "8", "tokens": ["Did", "you", "see", "that", "boar"], "ner_tags": [0, 0, 0, 0, 7]},
    {"id": "9", "tokens": ["A", "butterfly", "captured", "here"], "ner_tags": [0, 8, 0, 0]},
    {"id": "10", "tokens": ["The", "caterpillar", "is", "photographed", "here"], "ner_tags": [0, 10, 0, 0, 0]}
]

# Hand-written test sentences, in the same record layout as train_data
# ("id", "tokens", and one integer tag per token in "ner_tags").
# There is one sentence for each of the eleven animal tags.
test_data = [
    {"id": "0", "tokens": ["The", "antelope", "grazed", "on", "the", "grass"], "ner_tags": [0, 11, 0, 0, 0, 0]},
    {"id": "1", "tokens": ["A", "badger", "dug", "under", "the", "fence"], "ner_tags": [0, 1, 0, 0, 0, 0]},
    {"id": "2", "tokens": ["The", "bat", "flapped", "its", "wings", "in", "the", "dark"], "ner_tags": [0, 2, 0, 0, 0, 0, 0, 0]},
    {"id": "3", "tokens": ["A", "bear", "was", "wandering", "through", "the", "forest"], "ner_tags": [0, 3, 0, 0, 0, 0, 0]},
    {"id": "4", "tokens": ["A", "bee", "buzzed", "around", "the", "flowers"], "ner_tags": [0, 4, 0, 0, 0, 0]},
    {"id": "5", "tokens": ["The", "beetle", "climbed", "up", "the", "tree"], "ner_tags": [0, 5, 0, 0, 0, 0]},
    {"id": "6", "tokens": ["The", "bison", "roamed", "the", "open", "plains"], "ner_tags": [0, 6, 0, 0, 0, 0]},
    {"id": "7", "tokens": ["The", "boar", "ran", "into", "the", "woods"], "ner_tags": [0, 7, 0, 0, 0, 0]},
    {"id": "8", "tokens": ["A", "butterfly", "fluttered", "by", "the", "pond"], "ner_tags": [0, 8, 0, 0, 0, 0]},
    {"id": "9", "tokens": ["The", "cat", "sat", "on", "the", "windowsill"], "ner_tags": [0, 9, 0, 0, 0, 0]},
    {"id": "10", "tokens": ["A", "caterpillar", "was", "eating", "a", "leaf"], "ner_tags": [0, 10, 0, 0, 0, 0]},
]


In [None]:
def split_sentence_with_punctuation(sentence):
    """Split a sentence into word tokens and single-character punctuation tokens.

    Args:
        sentence: The text to split.

    Returns:
        The tokens in order of appearance. Each run of word characters is one
        token and every punctuation character is a token of its own; whitespace
        is discarded. An empty string yields an empty list.
    """
    # Match any non-word, non-space character as punctuation, so commas,
    # apostrophes and hyphens are kept as tokens instead of being silently
    # dropped (previously only '?', '!' and '.' survived).
    return re.findall(r'\w+|[^\w\s]', sentence)


def build_data(id2label):
    """Generate templated example sentences for every animal label.

    Args:
        id2label: Mapping from integer tag id to tag name. Every name other
            than 'O' is treated as a two-character prefix followed by the
            animal name (e.g. 'B-Cat').

    Returns:
        A list of records with keys "id", "tokens" and "ner_tags". "ner_tags"
        holds one integer per token: the animal's id from ``id2label`` on the
        animal word and the id of 'O' everywhere else.

    Raises:
        ValueError: If the lower-cased animal name is not one of the tokens of
            a generated sentence (for example a name that splits into several
            tokens).
    """
    data = []

    # Id used for non-entity tokens; falls back to 0 if the mapping has no 'O'.
    outside_id = next((tag_id for tag_id, name in id2label.items() if name == 'O'), 0)

    # Carry each label's real id from the mapping rather than deriving it from
    # the enumeration position, which is only right when ids follow dict order.
    animal_labels = [
        (tag_id, name[2:].lower()) for tag_id, name in id2label.items() if name != 'O'
    ]

    for idx, (tag_id, animal) in enumerate(animal_labels):
        sentence_templates = [
            f"I see {animal} here",
            f"There's {animal} in the picture",
            f"Do we have {animal} here?",
            f"Is there {animal}?",
            f"This picture has image of {animal}",
            f"Nice picture. It is {animal}, right?",
            f"Is it {animal}?",
            f"{animal} looks cool here",
            f"Funny to see {animal} here",
            f"Look, what a beautiful {animal}!",
            f"{animal} looks very cool here, right?",
            f"I like this picture. It has my favorite animal - {animal}.",
            f"Is that {animal}?",
            f"Looks like {animal} to me, right?",
            f"It has this funny looking {animal}",
            f"There's {animal} in this pic",
            f"Does it has {animal}?",
            f"Picture has {animal}",
            f"There is {animal} in the picture",
        ]

        # Process each sentence
        for sentence in sentence_templates:
            tokens = split_sentence_with_punctuation(sentence)

            # Fail loudly instead of emitting a sentence with no entity tag.
            if animal not in tokens:
                raise ValueError(f"{animal!r} is not a single token of {sentence!r}")

            # Tag the animal word; every other token gets the 'O' id.
            ner_tags = [tag_id if token == animal else outside_id for token in tokens]

            # NOTE(review): all sentences generated for one animal share the same "id".
            data.append({
                "id": str(idx),
                "tokens": tokens,
                "ner_tags": ner_tags
            })

    return data

# Build the data
data = build_data(id2label)

# Append the generated sentences only once: if this cell is re-run and
# train_data already ends with them, leave it alone instead of duplicating.
if train_data[-len(data):] != data:
    train_data += data
len(train_data)


230

In [None]:
# Wrap each list of records in a Hugging Face Dataset
train_dataset, validation_dataset, test_dataset = map(
    Dataset.from_list, (train_data, validation_data, test_data)
)

# Group the three splits into a single DatasetDict
dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
        "test": test_dataset,
    }
)

# Show the split summary, then the first training record
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 230
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 10
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 11
    })
})
{'id': '0', 'tokens': ['Look', 'at', 'that', 'antelope', 'running'], 'ner_tags': [0, 0, 0, 11, 0]}


In [None]:
# Tag names in the mapping's own order; compute_metrics indexes into this list.
label_list = [*id2label.values()]

# Tokenize every split in batches and attach the aligned label ids.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric object that compute_metrics calls.
seqeval = evaluate.load("seqeval")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# DistilBERT with a token-classification head sized to the tag vocabulary.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(label_list), id2label=id2label, label2id=label2id
)

model_path = "animals_model"

training_args = TrainingArguments(
    output_dir=model_path,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Disable logging integrations explicitly; this is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # Evaluate (and pick the best checkpoint) on the validation split so the
    # test split stays untouched by model selection.
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.963458,0.0,0.0,0.0,0.84058
2,No log,0.684886,0.0,0.0,0.0,0.84058
3,No log,0.458192,1.0,0.181818,0.307692,0.869565
4,No log,0.374246,1.0,0.272727,0.428571,0.884058
5,No log,0.347813,1.0,0.454545,0.625,0.913043


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=75, training_loss=0.7054263814290365, metrics={'train_runtime': 187.0257, 'train_samples_per_second': 6.149, 'train_steps_per_second': 0.401, 'total_flos': 3788604075744.0, 'train_loss': 0.7054263814290365, 'epoch': 5.0})

In [None]:
# Write the trained model to model_path so the next cell can load it by path.
trainer.save_model(output_dir=model_path)


In [None]:
# Build an NER pipeline from the saved directory (used for both model and tokenizer)
classifier = pipeline(task="ner", model=model_path, tokenizer=model_path)
classifier.model.eval()

# Run one example sentence through the pipeline and display the raw predictions
text = "There's a badger in this picture"
results = classifier(text)
results


Device set to use cpu


[{'entity': 'B-Badger',
  'score': 0.16518706,
  'index': 5,
  'word': 'badger',
  'start': 10,
  'end': 16}]