In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a ``<style>`` element that makes ``<pre>`` output wrap long lines.

  The function is meant to be registered as an IPython ``pre_run_cell``
  callback, so the style is re-emitted before every cell runs.

  Parameters
  ----------
  info : object, optional
      Execution info that IPython hands to ``pre_run_cell`` callbacks.
      It is accepted so the callback works whether or not IPython passes
      it, and is otherwise unused.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

## Token classification


This generic task encompasses any problem that can be formulated as “attributing a label to each token in a sentence,” such as:

**Named entity recognition (NER)**: Find the entities (such as persons, locations, or organizations) in a sentence. This can be formulated as attributing a label to each token by having one class per entity and one class for “no entity.”

**Part-of-speech tagging (POS)**: Mark each word in a sentence as corresponding to a particular part of speech (such as noun, verb, adjective, etc.).

**Chunking**: Find the tokens that belong to the same entity. This task (which can be combined with POS or NER) can be formulated as attributing one label (usually B-) to any tokens that are at the beginning of a chunk, another label (usually I-) to tokens that are inside a chunk, and a third label (usually O) to tokens that don’t belong to any chunk.



The NER labels of the CoNLL-2003 dataset used in this notebook are:

- O means the word doesn’t correspond to any entity.
- B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.
- B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.
- B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.
- B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.

In [None]:
# Install
!pip install transformers datasets tokenizers accelerate seqeval -q

# 1. DATASET:

Data link: https://huggingface.co/datasets/conll2003

In [None]:
import numpy as np
import pandas as pd

In [None]:
import datasets

from transformers import BertTokenizerFast
data = datasets.load_dataset("conll2003")

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
data.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [None]:
data['train'].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [None]:
tags = data["train"].features["ner_tags"].feature.names
tags

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Two-way lookup between NER tag names and their integer class ids.
TAG2IDX = {tag: position for position, tag in enumerate(tags)}
IDX2TAG = dict(enumerate(tags))

In [None]:
def f_dataset_row(idx, split="train"):
    """Show one sentence of the dataset as a small table of tokens and NER tags.

    Parameters
    ----------
    idx : int
        Row index inside ``data[split]``.
    split : str, default "train"
        Key of the dataset split to read the row from.

    Returns
    -------
    pandas.DataFrame
        One column per token (the column labels are the tokens themselves).
        Row ``idx`` holds the integer tag ids and row ``tags`` holds the tag
        names looked up in ``IDX2TAG``.
    """
    # Read the row a single time and leave the `idx` argument untouched.
    row = data[split][idx]
    tag_ids = row['ner_tags']
    tag_names = [IDX2TAG[tag_id] for tag_id in tag_ids]

    return (
        pd.DataFrame({'toks': row['tokens'], 'idx': tag_ids, 'tags': tag_names})
        .set_index('toks')
        .T
    )

In [None]:
idx = 0
f_dataset_row(idx)

toks,EU,rejects,German,call,to,boycott,British,lamb,.
idx,3,0,7,0,0,0,7,0,0
tags,B-ORG,O,B-MISC,O,O,O,B-MISC,O,O


In [None]:
idx = 1111
f_dataset_row(idx)

toks,Wednesday,'s,U.S.,Open,draw,ceremony,revealed,that,both,title,...,into,their,first,serious,opposition,in,the,third,round,.
idx,0,0,7,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tags,O,O,B-MISC,I-MISC,O,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O


# 2. TOKENIZER:

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

#### PRE & POST TOKENIZATION:

In [None]:
# NOTE: `input` shadows the built-in of the same name; the name is kept because
# later cells refer to it.
input = data['train'][0]['tokens']

# `is_split_into_words=True` tells the tokenizer that the input is already a list
# of words, so there is no need for it to perform the word-splitting step again.
tokenized_input = tokenizer(input, is_split_into_words=True)

print(f'WORDS: {input}\n')
print(f'TOKENS: {tokenized_input.tokens()}')

WORDS: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

TOKENS: ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']


#### WORD-IDs:

In [None]:
print(f'TOKENS: {tokenized_input.tokens()}')
print()
print(f'WORD_IDs: {tokenized_input.word_ids()}')

TOKENS: ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']

WORD_IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [None]:
print(f'WORD_IDs: {tokenized_input.word_ids()}')
print()
print(f'WORDS: {input}')
print()

WORD_IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

WORDS: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']



In [None]:
len(input), len(tokenized_input["input_ids"])

(9, 11)

# ALIGNING WORD-IDs AND LABELS:
- During training, we will be training on the list of "word_id-label" pairs for each row.  
- Since word_ids have the extra "None" value corresponding to non-word/special tokens,  
we have to align the labels such that this is accounted for.

### WE DO THIS BY:
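- setting -100 as the label for the special tokens (`[CLS]`, `[SEP]`) so that they are ignored in the loss function.
- labelling only the first subword of each word and masking the following subwords with -100 when `label_all_tokens=False`; with the default `label_all_tokens=True` the following subwords are labelled as well.   
(For cases where a word is split up into subwords by the tokenizer)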

# FUNCTION FOR ALIGNING LABELS WITH WORD_IDs:

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align the NER labels to the tokens.

    The dataset has one label per word, while the tokenizer may add special
    tokens and split a word into several sub-word tokens. This function builds
    one label per token.

    Parameters
    ----------
    examples : mapping
        Batch with ``"tokens"`` (one list of words per sentence) and
        ``"ner_tags"`` (one list of per-word label ids per sentence).
    label_all_tokens : bool, default True
        Controls the sub-word tokens that follow the first token of a word.
        If True they receive the word's label, with a ``B-XXX`` label turned
        into the matching ``I-XXX`` so that an entity has a single "begin"
        tag. If False they receive -100.

    Returns
    -------
    The tokenizer output with an additional ``"labels"`` entry holding one
    list of label ids per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        # word_ids() maps every token to the index of the word it came from
        # (None for special tokens such as [CLS] and [SEP]).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: -100 so that it is ignored in the loss function.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word: take the word's label unchanged.
                label_ids.append(word_labels[word_idx])
            elif label_all_tokens:
                # Continuation sub-word that must be labelled. In the CoNLL-2003 tag
                # list every B-XXX id is odd and the matching I-XXX id is the next
                # integer, so a "begin" label becomes its "inside" counterpart.
                label = word_labels[word_idx]
                if label % 2 == 1:
                    label += 1
                label_ids.append(label)
            else:
                # Continuation sub-word that must be masked.
                label_ids.append(-100)

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
data['train'][0:1]['id']

['0']

In [None]:
# Slice the first row once (the slice is a dict of column name -> list of values)
# and print every column of it.
first_batch = data['train'][0:1]
for column, values in first_batch.items():
    print(f"""{column}: {values}""")

id: ['0']
tokens: [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']]
pos_tags: [[22, 42, 16, 21, 35, 37, 16, 21, 7]]
chunk_tags: [[11, 21, 11, 12, 21, 22, 11, 12, 0]]
ner_tags: [[3, 0, 7, 0, 0, 0, 7, 0, 0]]


In [None]:
# Run the alignment on a one-row batch and list every field of the result.
after_alignement = tokenize_and_align_labels(data['train'][0:1])

for field, values in after_alignement.items():
    print(f'{field} : {values}')

input_ids : [[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
labels : [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]]


So before applying the tokenize_and_align_labels() the tokenized_input has 3 keys
- input_ids
- token_type_ids
- attention_mask

But after applying tokenize_and_align_labels() we have an extra key - 'labels'

In [None]:
for token, label in zip(tokenizer.convert_ids_to_tokens(after_alignement["input_ids"][0]),after_alignement["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
eu______________________________________ 3
rejects_________________________________ 0
german__________________________________ 7
call____________________________________ 0
to______________________________________ 0
boycott_________________________________ 0
british_________________________________ 7
lamb____________________________________ 0
._______________________________________ 0
[SEP]___________________________________ -100


In [None]:
## Applying on entire data
# The original columns are removed so that only the tokenizer outputs and the
# aligned `labels` remain in each split.
tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data['train'].column_names,
)

In [None]:
## Applying on entire data
# NOTE(review): this cell repeats the same `data.map(...)` call as the cell before it
# and rebinds `tokenized_datasets` to an equivalent result; it looks safe to delete.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)

In [None]:
# Fetch the first tokenized training row once and print each of its fields,
# separated by blank lines.
first_tokenized_row = tokenized_datasets['train'][0]
for field, values in first_tokenized_row.items():
    print(f'{field}: {values}')
    print()

input_ids: [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]

token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

labels: [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]



# Defining model

In [None]:
from transformers import AutoModelForTokenClassification

# Size the classification head from the dataset's tag list instead of a hard-coded
# number, and store the id <-> tag-name mappings in the model config so that
# predictions are reported with tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tags),
    id2label=IDX2TAG,
    label2id=TAG2IDX,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define training args

In [None]:
from transformers import TrainingArguments


# The output directory must NOT be named like the hub checkpoint: a local folder
# called "bert-base-uncased" would be picked up by later
# `from_pretrained("bert-base-uncased")` calls instead of the hub model, which
# breaks a fresh re-run of the notebook.
args = TrainingArguments(
    output_dir="bert-base-uncased-finetuned-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval is loaded from a remote builder script; opting in explicitly avoids the
# warning/prompt and is announced as mandatory in the next major `datasets` release.
metric = datasets.load_metric("seqeval", trust_remote_code=True)

  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

### Let's test the metric on an example

In [None]:
example = data['train'][0]

In [None]:
label_list = data["train"].features["ner_tags"].feature.names

label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
for i in example["ner_tags"]:
  print(i)

3
0
7
0
0
0
7
0
0


In [None]:
labels = [label_list[i] for i in example["ner_tags"]]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
metric.compute(predictions=[labels], references=[labels])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

### Compute Metrics
This compute_metrics() function first takes the argmax of the logits to convert them to predictions (as usual, the logits and the probabilities are in the same order, so we don’t need to apply the softmax). Then we have to convert both labels and predictions from integers to strings. We remove all the values where the label is -100, then pass the results to the metric.compute() method:

In [None]:
def compute_metrics(eval_preds):
    """Turn raw model outputs into overall seqeval scores.

    Parameters
    ----------
    eval_preds : tuple
        ``(logits, label_ids)`` pair. The logits are presumably shaped
        (batch, tokens, classes), since the argmax is taken over axis 2
        (TODO confirm against the caller).

    Returns
    -------
    dict
        ``precision``, ``recall``, ``f1`` and ``accuracy``, taken from the
        ``overall_*`` entries of ``metric.compute``.
    """
    logits, gold_ids = eval_preds

    # The logits and the probabilities are in the same order, so the argmax can be
    # taken directly without applying a softmax first.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop every position whose gold label is -100, then map ids to tag names.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Training

In [None]:
from transformers import Trainer

# Wire the model, the training arguments, the tokenized splits, the padding collator
# and the metric function together; every argument is passed by keyword.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2213,0.060833,0.917476,0.930305,0.923846,0.982414
2,0.0449,0.058382,0.935316,0.943059,0.939171,0.985448
3,0.0256,0.055579,0.936368,0.948204,0.942249,0.986306


TrainOutput(global_step=2634, training_loss=0.0767435817530233, metrics={'train_runtime': 499.0988, 'train_samples_per_second': 84.398, 'train_steps_per_second': 5.278, 'total_flos': 1020143109346326.0, 'train_loss': 0.0767435817530233, 'epoch': 3.0})

# Save

In [None]:
## Save model
model.save_pretrained("ner_model")

Configuration saved in ner_model/config.json
Model weights saved in ner_model/pytorch_model.bin


In [None]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

tokenizer config file saved in tokenizer/tokenizer_config.json
Special tokens file saved in tokenizer/special_tokens_map.json


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Label maps that are written into the saved model's config.json further down.
# JSON object keys are always strings, hence the str() on the ids used as keys of
# `id2label`. The ids stored as values of `label2id` stay integers, matching the
# name -> int mapping the model config uses.
id2label = {str(i): label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [None]:
id2label

{'0': 'O',
 '1': 'B-PER',
 '2': 'I-PER',
 '3': 'B-ORG',
 '4': 'I-ORG',
 '5': 'B-LOC',
 '6': 'I-LOC',
 '7': 'B-MISC',
 '8': 'I-MISC'}

In [None]:
label2id

{'O': '0',
 'B-PER': '1',
 'I-PER': '2',
 'B-ORG': '3',
 'I-ORG': '4',
 'B-LOC': '5',
 'I-LOC': '6',
 'B-MISC': '7',
 'I-MISC': '8'}

## Loading model & prediction

In [None]:
import json

In [None]:
# Patch the label maps into the saved config. The context managers close the file
# handles, so the rewritten JSON is fully on disk before the model is reloaded
# from "ner_model".
with open("ner_model/config.json") as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open("ner_model/config.json", "w") as config_file:
    json.dump(config, config_file)

In [None]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

loading configuration file ner_model/config.json
Model config BertConfig {
  "_name_or_path": "ner_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": "5",
    "B-MISC": "7",
    "B-ORG": "3",
    "B-PER": "1",
    "I-LOC": "6",
    "I-MISC": "8",
    "I-ORG": "4",
    "I-PER": "2",
    "O": "0"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "tr

In [None]:
from transformers import pipeline

In [None]:
# Build a token-classification ("ner") pipeline from the reloaded fine-tuned model
# and the tokenizer used for training, then run it on a sample sentence.
nlp = pipeline(task="ner", model=model_fine_tuned, tokenizer=tokenizer)

example = "Bill Gates is the Founder of Microsoft"
ner_results = nlp(example)

print(ner_results)

[{'entity': 'B-PER', 'score': 0.9974279, 'index': 1, 'word': 'bill', 'start': 0, 'end': 4}, {'entity': 'I-PER', 'score': 0.99712795, 'index': 2, 'word': 'gates', 'start': 5, 'end': 10}, {'entity': 'B-ORG', 'score': 0.95886695, 'index': 7, 'word': 'microsoft', 'start': 29, 'end': 38}]


Reference: https://huggingface.co/course/chapter7/2