## install transformers

In [23]:
! pip install transformers datasets evaluate seqeval accelerate


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import libraries

In [24]:
from datasets import load_dataset, load_metric, ClassLabel, Sequence
import random
import pandas as pd
import numpy as np
from IPython.display import display, HTML

import transformers
from transformers import AutoTokenizer, DistilBertForTokenClassification, TrainingArguments, Trainer
from transformers import DistilBertTokenizerFast

from transformers import DataCollatorForTokenClassification




## Load the dataset

In [25]:
data = load_dataset('SpeedOfMagic/ontonotes_english')



  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 59924
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13900
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8262
    })
})

In [27]:
data['train'][1]

{'tokens': ['Newsnight', 'returns', 'to', 'duo', 'action', 'tonight', '.'],
 'ner_tags': [31, 0, 0, 0, 0, 17, 0]}

## We are using NER task for token classification

In [28]:
task = "ner"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [29]:
label = data['train'].features[f"{task}_tags"].feature.names
print(label)

['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE']


## just to see how dataset look like

In [30]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [31]:
show_random_elements(data['train'])

Unnamed: 0,tokens,ner_tags
0,"[Right, .]","[O, O]"
1,"[Yeah, and, by, the, way, I, was, commenting, on, the, frog, -, walk, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,"[And, we, 've, got, to, do, more, to, improve, our, intelligence, .]","[O, O, O, O, O, O, O, O, O, O, O, O]"
3,"[One, writer, asked, today, ,, Will, black, Americans, be, eternal, victims, ?, Will, Mr., Bush, and, millions, of, blacks, be, permanently, estranged, ?]","[B-CARDINAL, O, O, O, O, O, O, B-NORP, O, O, O, O, O, O, B-PERSON, O, B-CARDINAL, O, O, O, O, O, O]"
4,"[The, first, and, second, Asian, Development, Bank, 's, secondary, region, economic, co-operation, meetings, were, held, in, Manila, in, October, ,, 1992, and, in, August, ,, 1993, .]","[O, B-ORDINAL, O, B-ORDINAL, B-ORG, I-ORG, I-ORG, I-ORG, B-ORDINAL, O, O, O, O, O, O, O, B-GPE, O, B-DATE, I-DATE, I-DATE, O, O, B-DATE, I-DATE, I-DATE, O]"
5,"[After, graduating, from, college, ,, Yeh, opened, the, Mingli, Plastics, Factory, .]","[O, O, O, O, O, B-PERSON, O, B-ORG, I-ORG, I-ORG, I-ORG, O]"
6,"[Some, also, includes, shells, or, hornblende, .]","[O, O, O, O, O, O, O]"
7,"[,, this, is, not, just, during, Putin, 's, tenure, ,, that, is, ,, it, has, been, like, this, for, the, 60, years, after, the, war, .]","[O, O, O, O, O, O, B-PERSON, O, O, O, O, O, O, O, O, O, O, O, O, B-DATE, I-DATE, I-DATE, O, O, O, O]"
8,"[They, 'd, rather, use, --]","[O, O, O, O, O]"
9,"[Uh, vulnerable, senators, Conrad, Burns, in, Montana, Lincoln, Chafee, Rhode, Island, Mike, DeWine, Ohio, Santorum, in, Pennsylvania, Jim, Talent, Missouri, and, the, Tennessee, seat, .]","[O, O, O, B-PERSON, I-PERSON, O, B-GPE, B-PERSON, I-PERSON, B-GPE, I-GPE, B-PERSON, I-PERSON, B-GPE, B-PERSON, O, B-GPE, B-PERSON, I-PERSON, B-GPE, O, O, B-GPE, O, O]"


## Lets preprocess the data

tokenize the data

In [32]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)

In [33]:
# this will make sure the tokenizer is fast
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [34]:
## we need to preprocess the data since if our words already tokinze it will tokenize again wchich will end up giving more ids
## we will use truncation = true to reduce the size of the text that are bigger than maximum size and will use
## is_split_into_words=true then we align them
label_all_tokens = True # we can change the value of this flag to change the strategy to set the label for first token only
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], return_offsets_mapping=True, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [35]:
## use map to apply the above function on all sentences
tokenize_data = data.map(tokenize_and_align_labels, batched=True)



Map:   0%|          | 0/13900 [00:00<?, ? examples/s]



In [36]:
tokenize_data['train'][1]

{'tokens': ['Newsnight', 'returns', 'to', 'duo', 'action', 'tonight', '.'],
 'ner_tags': [31, 0, 0, 0, 0, 17, 0],
 'input_ids': [101, 2739, 15864, 5651, 2000, 6829, 2895, 3892, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'offset_mapping': [[0, 0],
  [0, 4],
  [4, 9],
  [0, 7],
  [0, 2],
  [0, 3],
  [0, 6],
  [0, 7],
  [0, 1],
  [0, 0]],
 'labels': [-100, 31, 31, 0, 0, 0, 0, 17, 0, -100]}

## fine tune the model

In [37]:
model = DistilBertForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

## Initiate training
### before that we need training arguments which will need folder name and other optional

In [38]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

## make all our example to same size 

In [39]:
data_collector = DataCollatorForTokenClassification(tokenizer)

## last we need metric from prediction, we use seqeval metric

In [40]:
metric = load_metric("seqeval")

## since the meric will take list of labels for predictions and reference we need to perform post preocess 


*   selecting predicted index for each token with max logit
*   cnvert it to string label
*   ignore everywhere we set label of -100 from earliear





In [41]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## we will only get overall accuracy/f1score/recall/precision


In [42]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenize_data["train"],
    eval_dataset=tokenize_data["validation"],
    data_collator=data_collector,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## Now we train our trainer

In [43]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1015,0.137869,0.826233,0.878683,0.851651,0.97096
2,0.07,0.137963,0.833475,0.888041,0.859893,0.972141
3,0.0472,0.144148,0.836121,0.889366,0.861922,0.972312




TrainOutput(global_step=11238, training_loss=0.09653069222391363, metrics={'train_runtime': 1285.6574, 'train_samples_per_second': 139.829, 'train_steps_per_second': 8.741, 'total_flos': 2581133925377784.0, 'train_loss': 0.09653069222391363, 'epoch': 3.0})

In [44]:
trainer.evaluate()

{'eval_loss': 0.14414843916893005,
 'eval_precision': 0.8361212853528679,
 'eval_recall': 0.8893664022492218,
 'eval_f1': 0.861922324617316,
 'eval_accuracy': 0.9723123719343795,
 'eval_runtime': 39.9626,
 'eval_samples_per_second': 347.825,
 'eval_steps_per_second': 21.745,
 'epoch': 3.0}

In [45]:
predictions, labels, _ = trainer.predict(tokenize_data["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'0': {'precision': 0.8827751196172249,
  'recall': 0.9066339066339066,
  'f1': 0.8945454545454545,
  'number': 407},
 '1': {'precision': 0.821656050955414,
  'recall': 0.8459016393442623,
  'f1': 0.8336025848142165,
  'number': 305},
 '100': {'precision': 0.8453472309431119,
  'recall': 0.8873582915897706,
  'f1': 0.8658434626020967,
  'number': 30344},
 '2': {'precision': 0.8295739348370927,
  'recall': 0.8619791666666666,
  'f1': 0.8454661558109833,
  'number': 384},
 '3': {'precision': 0.5733333333333334,
  'recall': 0.6417910447761194,
  'f1': 0.6056338028169014,
  'number': 67},
 '4': {'precision': 0.7024793388429752,
  'recall': 0.7264957264957265,
  'f1': 0.7142857142857142,
  'number': 117},
 '5': {'precision': 0.8963503649635036,
  'recall': 0.8771428571428571,
  'f1': 0.8866425992779784,
  'number': 700},
 '6': {'precision': 0.875,
  'recall': 0.8730853391684902,
  'f1': 0.8740416210295728,
  'number': 457},
 '7': {'precision': 0.8648648648648649,
  'recall': 0.8695652173913