<a href="https://colab.research.google.com/github/swrp-h/NER_with_transformers/blob/main/multinerd_ner_unfilt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuning a transformer for named entity recognition (unfiltered original version)

This notebook is part of a project to finetune a transformer model for English NER. The original dataset utilised here is the English subset of [Multinerd Dataset by Babelscape](https://huggingface.co/datasets/Babelscape/multinerd).

The notebook contents have been inspired by the [official Hugging Face documentation](https://huggingface.co/docs/transformers/main/tasks/token_classification).

## Initial setup, installations, imports


In [None]:
# Uncomment if using Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
MODEL_OP_DIRECTORY = "PATH_TO_MODEL_OUTPUT_DIRECTORY"
TF_MODEL = "distilbert-base-uncased"

In [None]:
# Optional for uploading your model:
# !python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('YOUR_HF_TOKEN')"

In [1]:
## Installations: Do not run this cell if not on Google Colab

!pip uninstall transformers --y
!pip install transformers==4.28.0
!pip install datasets
!pip install -U accelerate
!pip install evaluate
!pip install seqeval

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.0
    Uninstalling tokenizers-0.15.0:
      Successfully uninstalled tokenizers-0.15.0
Successfully installed tokenizers-0.13.3 transformers-4.28.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2

In [2]:
# Imports

import transformers
import accelerate
from transformers import AutoTokenizer
import evaluate
seqeval = evaluate.load("seqeval")
from datasets import load_dataset
from evaluate import evaluator
from datasets import load_dataset

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [3]:
transformers.__version__

'4.28.0'

## Initializing and Preprocessing the data


In [4]:
## Initializing the tokenizer
# Model used: DistilBERT for speed.

tokenizer = AutoTokenizer.from_pretrained(TF_MODEL)

def tokenize_and_align_labels(dataset):
    """
    Tokenize the input tokens and align the corresponding labels to the tokenized input.

    Args:
        dataset (Dataset): A dataset from the HF hub containing the input tokens and corresponding labels.

    Returns:
        Dataset: A tokenized version of the dataset containing the tokenized inputs with aligned labels.
    """
    tokenized_inputs = tokenizer(dataset["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(dataset[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
dataset_train = load_dataset("Babelscape/multinerd",data_files="train/train_en.jsonl",split="train")
dataset_val = load_dataset("Babelscape/multinerd",data_files="val/val_en.jsonl",split="train")
dataset_test = load_dataset("Babelscape/multinerd",data_files="test/test_en.jsonl",split="train")

tokenized_train = dataset_train.map(tokenize_and_align_labels, batched=True)
tokenized_val = dataset_val.map(tokenize_and_align_labels, batched=True)
tokenized_test = dataset_test.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/131280 [00:00<?, ? examples/s]

Map:   0%|          | 0/16410 [00:00<?, ? examples/s]

Map:   0%|          | 0/16454 [00:00<?, ? examples/s]

In [None]:
## Setup to dynamically pad tokens and labels

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Setup for Training

In [None]:
## Mapping ids and labels for training (labels derived from the Multinerd dataset card)

id2label = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-ANIM": 7,
    "I-ANIM": 8,
    "B-BIO": 9,
    "I-BIO": 10,
    "B-CEL": 11,
    "I-CEL": 12,
    "B-DIS": 13,
    "I-DIS": 14,
    "B-EVE": 15,
    "I-EVE": 16,
    "B-FOOD": 17,
    "I-FOOD": 18,
    "B-INST": 19,
    "I-INST": 20,
    "B-MEDIA": 21,
    "I-MEDIA": 22,
    "B-MYTH": 23,
    "I-MYTH": 24,
    "B-PLANT": 25,
    "I-PLANT": 26,
    "B-TIME": 27,
    "I-TIME": 28,
    "B-VEHI": 29,
    "I-VEHI": 30,
  }

label2id = {label: idx for idx, label in id2label.items()}

# Example usage
print(label2id)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-ANIM', 8: 'I-ANIM', 9: 'B-BIO', 10: 'I-BIO', 11: 'B-CEL', 12: 'I-CEL', 13: 'B-DIS', 14: 'I-DIS', 15: 'B-EVE', 16: 'I-EVE', 17: 'B-FOOD', 18: 'I-FOOD', 19: 'B-INST', 20: 'I-INST', 21: 'B-MEDIA', 22: 'I-MEDIA', 23: 'B-MYTH', 24: 'I-MYTH', 25: 'B-PLANT', 26: 'I-PLANT', 27: 'B-TIME', 28: 'I-TIME', 29: 'B-VEHI', 30: 'I-VEHI'}


In [None]:
label_list = [k for k,v in id2label.items()]

['O',
 'B-PER',
 'I-PER',
 'B-ORG',
 'I-ORG',
 'B-LOC',
 'I-LOC',
 'B-ANIM',
 'I-ANIM',
 'B-BIO',
 'I-BIO',
 'B-CEL',
 'I-CEL',
 'B-DIS',
 'I-DIS',
 'B-EVE',
 'I-EVE',
 'B-FOOD',
 'I-FOOD',
 'B-INST',
 'I-INST',
 'B-MEDIA',
 'I-MEDIA',
 'B-MYTH',
 'I-MYTH',
 'B-PLANT',
 'I-PLANT',
 'B-TIME',
 'I-TIME',
 'B-VEHI',
 'I-VEHI']

In [None]:
## Loading the model and specifying the number of labels and label mappings
## In this case, it is 31 for the total labels in Multinerd

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    TF_MODEL, num_labels=31, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

In [None]:
## Setup for evaluation after training

import numpy as np

labels = label_list

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Training and Evaluation

In [None]:
## Defining hyperparameters

training_args = TrainingArguments(
    output_dir=MODEL_OP_DIRECTORY,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    #push_to_hub=True,                 ## Uncomment if uploading model to HF
)

## Training

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/drive/MyDrive/rise_test/ner_models/my_ner_model is already a clone of https://huggingface.co/shrop/my_ner_model. Make sure you pull the latest changes with `repo.git_pull()`.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.043,0.039896,0.897047,0.937064,0.916619,0.985889
2,0.0328,0.036352,0.918198,0.933164,0.925621,0.987595


TrainOutput(global_step=16410, training_loss=0.049063013589360376, metrics={'train_runtime': 1734.7713, 'train_samples_per_second': 151.351, 'train_steps_per_second': 9.459, 'total_flos': 3739593910698048.0, 'train_loss': 0.049063013589360376, 'epoch': 2.0})

In [None]:
## Extra: Eval with another split

training_args = TrainingArguments(
    output_dir=MODEL_OP_DIRECTORY,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

{'eval_loss': 0.04901397228240967,
 'eval_precision': 0.8925420087470268,
 'eval_recall': 0.8933988710110979,
 'eval_f1': 0.8929702343255226,
 'eval_accuracy': 0.9827867792755356,
 'eval_runtime': 45.3827,
 'eval_samples_per_second': 361.592,
 'eval_steps_per_second': 22.608}