## install transformers

In [1]:
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! pip install transformers datasets evaluate seqeval


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import libraries


In [3]:
from datasets import load_dataset, load_metric, ClassLabel, Sequence
import random
import pandas as pd
import numpy as np
from IPython.display import display, HTML

import transformers
from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast

from transformers import DataCollatorForTokenClassification




## Load the dataset

In [4]:
data = load_dataset('conll2003')

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

## We are using NER task for token classification



In [7]:
task = "ner"
model_checkpoint = "bert-base-uncased"
batch_size = 16

In [8]:
label = data['train'].features[f"{task}_tags"].feature.names
print(label)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


## just to see how dataset look like

In [9]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [10]:
show_random_elements(data['train'])

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,8916,"[The, SPLA, has, fought, Khartoum, 's, government, forces, in, the, south, since, 1983, for, greater, autonomy, or, independence, of, the, mainly, Christian, and, animist, region, from, the, Moslem, ,, Arabised, north, .]","[DT, NNP, VBZ, VBN, NN, POS, NN, NNS, IN, DT, JJ, IN, CD, IN, JJR, NN, CC, NN, IN, DT, RB, NNP, CC, JJ, NN, IN, DT, NNP, ,, NNP, RB, .]","[B-NP, I-NP, B-VP, I-VP, B-NP, B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, I-NP, I-NP, B-PP, B-NP, I-NP, I-NP, O, B-NP, I-NP, B-PP, B-NP, I-NP, O, B-NP, B-ADVP, O]","[O, B-ORG, O, O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-MISC, O, O, O, O, O, B-MISC, O, B-LOC, I-LOC, O]"
1,10363,"["", I, 'm, not, here, to, fight, the, press, or, talk, about, the, food, or, entertain, the, people, off, the, court, .]","["", PRP, VBP, RB, RB, TO, VB, DT, NN, CC, VB, IN, DT, NN, CC, VB, DT, NNS, IN, DT, NN, .]","[O, B-NP, B-VP, O, B-ADVP, B-VP, I-VP, B-NP, I-NP, O, B-VP, B-PP, B-NP, I-NP, O, B-VP, B-NP, I-NP, B-PP, B-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,6499,"[Result, :, Pakistan, won, by, 9, wickets]","[NN, :, NNP, VBD, IN, CD, NNS]","[B-NP, O, B-NP, B-VP, B-PP, B-NP, I-NP]","[O, O, B-LOC, O, O, O, O]"
3,2788,"[Yugoslavia, and, Croatia, were, poised, on, Friday, to, sign, a, landmark, normalisation, treaty, ending, five, years, of, tensions, and, paving, way, for, stabilisation, in, the, Balkans, .]","[NNP, CC, NNP, VBD, VBN, IN, NNP, TO, VB, DT, NN, NN, NN, VBG, CD, NNS, IN, NNS, CC, VBG, NN, IN, NN, IN, DT, NNS, .]","[B-NP, I-NP, I-NP, B-VP, I-VP, B-PP, B-NP, B-VP, I-VP, B-NP, I-NP, I-NP, I-NP, B-VP, B-NP, I-NP, B-PP, B-NP, O, B-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, O]","[B-LOC, O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O]"
4,4168,"[The, 70, teams, in, this, year, 's, race, will, will, trek, glaciers, ,, climb, mountains, ,, whitewater, raft, ,, horseback, ride, ,, canoe, and, mountain, bike, along, the, grueling, course, .]","[DT, CD, NNS, IN, DT, NN, POS, NN, MD, MD, NN, NNS, ,, NN, NNS, ,, NN, NN, ,, NN, NN, ,, NN, CC, NN, NN, IN, DT, JJ, NN, .]","[B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, B-NP, I-NP, B-VP, I-VP, B-NP, I-NP, O, B-NP, I-NP, O, B-NP, I-NP, O, B-NP, I-NP, O, B-NP, I-NP, I-NP, I-NP, B-PP, B-NP, I-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5,9539,"[Ashoknagar, n.a, Yellow, -, -]","[NNP, NN, NN, :, :]","[B-NP, I-NP, I-NP, O, O]","[B-LOC, O, O, O, O]"
6,7842,"[Goldman, ,, Sachs, &, Co, .]","[NNP, ,, NNP, CC, NNP, .]","[B-NP, I-NP, I-NP, I-NP, I-NP, O]","[B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O]"
7,4293,"[The, rival, Kurdistan, Democratic, Party, (, KDP, ), ,, which, accuses, Iran, of, supporting, the, PUK, ,, said, on, Thursday, that, its, forces, had, halted, an, Iranian-backed, attack, by, thousands, of, PUK, fighters, .]","[DT, JJ, NNP, NNP, NNP, (, NNP, ), ,, WDT, VBZ, NNP, IN, VBG, DT, NNP, ,, VBD, IN, NNP, IN, PRP$, NNS, VBD, VBN, DT, JJ, NN, IN, NNS, IN, NNP, NNS, .]","[B-NP, I-NP, I-NP, I-NP, I-NP, O, B-NP, O, O, B-NP, B-VP, B-NP, B-PP, B-VP, B-NP, I-NP, O, B-VP, B-PP, B-NP, B-SBAR, B-NP, I-NP, B-VP, I-VP, B-NP, I-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, O]","[O, O, B-ORG, I-ORG, I-ORG, O, B-ORG, O, O, O, O, B-LOC, O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ORG, O, O]"
8,4547,"[JYVASKLYA, ,, Finland, 1996-08-25]","[NNP, ,, NNP, CD]","[B-NP, O, B-NP, I-NP]","[B-LOC, O, B-LOC, O]"
9,5779,"[Dutch, soccer, captain, Danny, Blind, has, decided, to, end, his, international, career, ,, Ajax, spokesman, David, Endt, said, on, Sunday, .]","[JJ, NN, NN, NNP, NNP, VBZ, VBN, TO, VB, PRP$, JJ, NN, ,, NNP, NN, NNP, NNP, VBD, IN, NNP, .]","[B-NP, I-NP, I-NP, I-NP, I-NP, B-VP, I-VP, I-VP, I-VP, B-NP, I-NP, I-NP, O, B-NP, I-NP, I-NP, I-NP, B-VP, B-PP, B-NP, O]","[B-MISC, O, O, B-PER, I-PER, O, O, O, O, O, O, O, O, B-ORG, O, B-PER, I-PER, O, O, O, O]"


## Lets preprocess the data

tokenize the data

In [11]:
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

In [12]:
# this will make sure the tokenizer is fast
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [13]:
## we need to preprocess the data since if our words already tokinze it will tokenize again wchich will end up giving more ids
## we will use truncation = true to reduce the size of the text that are bigger than maximum size and will use
## is_split_into_words=true then we align them
label_all_tokens = True # we can change the value of this flag to change the strategy to set the label for first token only
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], return_offsets_mapping=True, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [14]:
## use map to apply the above function on all sentences
tokenize_data = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [15]:
tokenize_data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'offset_mapping': [[0, 0],
  [0, 2],
  [0, 7],
  [0, 6],
  [0, 4],
  [0, 2],
  [0, 7],
  [0, 7],
  [0, 4],
  [0, 1],
  [0, 0]],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

## fine tune the model

In [16]:
model = BertForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

## Initiate training
### before that we need training arguments which will need folder name and other optional

In [17]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

## make all our example to same size 

In [None]:
data_collector = DataCollatorForTokenClassification(tokenizer)

## last we need metric from prediction, we use seqeval metric

In [None]:
metric = load_metric("seqeval")

## since the meric will take list of labels for predictions and reference we need to perform post preocess 


*   selecting predicted index for each token with max logit
*   cnvert it to string label
*   ignore everywhere we set label of -100 from earliear





In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## we will only get overall accuracy/f1score/recall/precision

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenize_data["train"],
    eval_dataset=tokenize_data["validation"],
    data_collator=data_collector,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## Now we train our model trainer

In [23]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2153,0.064168,0.948177,0.948672,0.948424,0.987625




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2153,0.064168,0.948177,0.948672,0.948424,0.987625
2,0.0491,0.054449,0.957829,0.962474,0.960146,0.98988
3,0.0251,0.054027,0.957295,0.963295,0.960286,0.990278




TrainOutput(global_step=2634, training_loss=0.07677226086502828, metrics={'train_runtime': 484.003, 'train_samples_per_second': 87.03, 'train_steps_per_second': 5.442, 'total_flos': 1021316467278600.0, 'train_loss': 0.07677226086502828, 'epoch': 3.0})

In [24]:
trainer.evaluate()

{'eval_loss': 0.05402688309550285,
 'eval_precision': 0.9572953736654805,
 'eval_recall': 0.963294538943599,
 'eval_f1': 0.9602855867916108,
 'eval_accuracy': 0.9902775350691851,
 'eval_runtime': 10.8218,
 'eval_samples_per_second': 300.321,
 'eval_steps_per_second': 18.851,
 'epoch': 3.0}

In [25]:
predictions, labels, _ = trainer.predict(tokenize_data["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'100': {'precision': 0.9600881171439679,
  'recall': 0.9645879442780888,
  'f1': 0.9623327704896741,
  'number': 7681},
 '_': {'precision': 0.9535609079882169,
  'recall': 0.9615586230997728,
  'f1': 0.9575430659474508,
  'number': 5723},
 'overall_precision': 0.9572953736654805,
 'overall_recall': 0.963294538943599,
 'overall_f1': 0.9602855867916108,
 'overall_accuracy': 0.9902775350691851}