# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install datasets -q

In [2]:
from datasets import load_dataset

dataset = load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
labels_names = dataset["train"].features["ner_tags"].feature.names

In [4]:
labels_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
dataset["train"][:2]

{'id': ['0', '1'],
 'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn']],
 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]],
 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]}

# Data preparing

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base", add_prefix_space=True)

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], padding = "max_length", truncation = True, is_split_into_words = True)

In [11]:
tokenized_datasets_ = dataset.map(tokenize_function, batched = True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
len(tokenized_datasets_["train"][0]["input_ids"]) == len(tokenized_datasets_["train"][0]["ner_tags"])

False

Adjusting labels to fit input

In [13]:
def tokenize_adjust_labels(samples):
    tokenized_samples = tokenizer.batch_encode_plus(samples["tokens"], is_split_into_words = True, truncation = True)

    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        prev_wid = -1

        word_ids_list = tokenized_samples.word_ids(batch_index = k)

        existing_label_ids = samples["ner_tags"][k]

        i = -1

        adjusted_label_ids = []

        for word_idx in word_ids_list:
            if word_idx is None:
                adjusted_label_ids.append(-100)
            elif word_idx != prev_wid:
                i += 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = word_idx
            else:
                label_name = labels_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        
        total_adjusted_labels.append(adjusted_label_ids)
            
    tokenized_samples["labels"] = total_adjusted_labels

    return tokenized_samples


In [14]:
out = tokenizer("Fine tune NER with BERT")
out

{'input_ids': [0, 14321, 8859, 234, 2076, 19, 163, 18854, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
out.word_ids(0)

[None, 0, 1, 2, 2, 3, 4, 4, None]

In [16]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched = True, remove_columns = ["tokens", "ner_tags", "pos_tags", "chunk_tags"])

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [17]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [18]:
tokenized_dataset["train"][:2]

{'id': ['0', '1'],
 'input_ids': [[0, 1281, 24020, 1859, 486, 7, 13978, 1089, 17988, 479, 2],
  [0, 2155, 20809, 2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100], [-100, 1, 2, -100]]}

Padding token to a constant length

In [19]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine-tuning distilroberta-base for NER task

In [20]:
from transformers import AutoModelForTokenClassification

In [21]:
model = AutoModelForTokenClassification.from_pretrained("distilroberta-base", num_labels = len(labels_names))

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream tas

## Computing metrics 

In [22]:
import numpy as np
from datasets import load_metric

In [23]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [24]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis = 2)

    true_predictions = [
        [labels_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [labels_names[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions = true_predictions, references = true_labels)

    return {
        "precision" : results["overall_precision"],
        "recall" : results["overall_recall"],
        "f1" : results["overall_f1"],
        "accuracy" : results["overall_accuracy"]
    }

## Using Trainer API to fine-tune

In [25]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

In [26]:
batch_size = 16
logging_steps = len(tokenized_dataset["train"]) // batch_size
epochs = 7
weight_decay = .01
eval_steps = 100
learning_rate = 2e-5
early_stopping_patience = 3

In [27]:
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    learning_rate = learning_rate,
    logging_steps=logging_steps,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = eval_steps,
    save_total_limit = 5,
    metric_for_best_model = "eval_loss",
    load_best_model_at_end = True
) 

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
)

In [29]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.239089,0.65647,0.723555,0.688382,0.933202
200,No log,0.155595,0.781873,0.825368,0.803032,0.959268
300,No log,0.113663,0.880867,0.886192,0.883522,0.971909
400,No log,0.098906,0.890777,0.901209,0.895962,0.975208
500,No log,0.08451,0.898758,0.915565,0.907083,0.977801
600,No log,0.079803,0.903597,0.927748,0.915513,0.980071
700,No log,0.07775,0.897803,0.93011,0.913671,0.978261
800,No log,0.069207,0.912401,0.927654,0.919964,0.980884
900,0.170300,0.064381,0.917985,0.931337,0.924613,0.982296
1000,0.170300,0.060232,0.929166,0.937854,0.93349,0.983953


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: id. If id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: id. If id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-1000 (score: 0.06023208424448967).


TrainOutput(global_step=1300, training_loss=0.1343374883211576, metrics={'train_runtime': 207.1564, 'train_samples_per_second': 474.458, 'train_steps_per_second': 29.668, 'total_flos': 253080288625626.0, 'train_loss': 0.1343374883211576, 'epoch': 1.48})

## Testing

In [30]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)

true_predictions = [
    [labels_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [labels_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: id. If id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 16


In [31]:
results

{'LOC': {'precision': 0.9013015184381779,
  'recall': 0.9165441176470588,
  'f1': 0.9088589135982501,
  'number': 2720},
 'MISC': {'precision': 0.6949293433083957,
  'recall': 0.7288578901482128,
  'f1': 0.7114893617021277,
  'number': 1147},
 'ORG': {'precision': 0.8900302114803625,
  'recall': 0.9109461966604824,
  'f1': 0.9003667481662592,
  'number': 3234},
 'PER': {'precision': 0.9578761061946902,
  'recall': 0.9441730635031402,
  'f1': 0.9509752240379546,
  'number': 2866},
 'overall_precision': 0.8888558986539984,
 'overall_recall': 0.9010735426908799,
 'overall_f1': 0.8949230232674007,
 'overall_accuracy': 0.9709294616812516}

In [32]:
true_predictions

[['O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',


In [33]:
true_labels

[['O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  '

In [34]:
from sklearn.metrics import classification_report

In [35]:
y_pred = []
y = []
for sent in true_predictions:
    for tag in sent:
        y_pred.append(tag)
for sent in true_labels:
    for tag in sent:
        y.append(tag)


In [36]:
print(classification_report(y_pred, y))

              precision    recall  f1-score   support

       B-LOC       0.92      0.91      0.91      2755
      B-MISC       0.75      0.74      0.75      1160
       B-ORG       0.92      0.91      0.91      3289
       B-PER       0.95      0.96      0.96      2818
       I-LOC       0.87      0.84      0.85       351
      I-MISC       0.68      0.45      0.54       434
       I-ORG       0.93      0.86      0.89      1331
       I-PER       0.99      0.96      0.97      2657
           O       0.99      1.00      0.99     45541

    accuracy                           0.97     60336
   macro avg       0.89      0.85      0.87     60336
weighted avg       0.97      0.97      0.97     60336



In [37]:
from sklearn.metrics import confusion_matrix

In [38]:
print(confusion_matrix(y_pred, y))

[[ 2503    88    91    31     5     2     6     0    29]
 [   58   863    78    11     0    12     0     0   138]
 [   91    96  2977    56     0     3     7     1    58]
 [   24     5    24  2715     0     0     0     1    49]
 [    4     0     1     0   296    10    26     1    13]
 [    7    21     0     0     4   197    16     1   188]
 [   11     3    30     1    26    30  1145    17    68]
 [    1     7     2    27     6     5    14  2540    55]
 [   21    64    31    25     5    32    17     0 45346]]


In [50]:
def print_clarification(idx):
    for sentence, prediction, ground_truth in zip(dataset["test"][idx]["tokens"], true_predictions[idx], true_labels[idx]):
        print(sentence, prediction, ground_truth)

In [51]:
print_clarification(0)

SOCCER O O
- O O
JAPAN O O
GET O O
LUCKY B-LOC B-LOC
WIN B-LOC B-LOC
, B-LOC B-LOC
CHINA O O
IN O O
SURPRISE O O
DEFEAT O O
. O O


In [52]:
print_clarification(10)

Takuya B-PER B-PER
Takagi B-PER B-PER
scored I-PER I-PER
the I-PER I-PER
winner O O
in O O
the O O
88th O O
minute O O
, O O
rising O O
to O O
head O O
a O O
Hiroshige O O
Yanagimoto O O
cross O O
towards B-PER B-PER
the B-PER B-PER
Syrian I-PER I-PER
goal I-PER I-PER
which I-PER I-PER
goalkeeper O O
Salem O O
Bitar O O
appeared B-MISC B-MISC
to O O
have O O
covered O O
but B-PER B-PER
then I-PER I-PER
allowed I-PER I-PER
to O O
slip O O
into O O
the O O
net O O
. O O


In [53]:
print_clarification(20)

Hosts O O
UAE O O
play B-LOC B-LOC
Kuwait O O
and B-LOC B-LOC
South O O
Korea B-LOC B-LOC
take I-LOC I-LOC
on O O
Indonesia O O
on B-LOC B-LOC
Saturday O O
in O O
Group O O
A B-MISC O
matches I-MISC O
. O O
