# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install evaluate -q
!pip install datasets -q

[K     |████████████████████████████████| 7.6 MB 28.3 MB/s 
[K     |████████████████████████████████| 5.8 MB 28.7 MB/s 
[K     |████████████████████████████████| 182 kB 73.3 MB/s 
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 72 kB 1.4 MB/s 
[K     |████████████████████████████████| 451 kB 6.5 MB/s 
[K     |████████████████████████████████| 212 kB 65.4 MB/s 
[K     |████████████████████████████████| 132 kB 51.7 MB/s 
[K     |████████████████████████████████| 127 kB 74.9 MB/s 
[?25h

In [2]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
labels_names = dataset["train"].features["ner_tags"].feature.names

In [4]:
labels_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
dataset["train"][:2]

{'id': ['0', '1'],
 'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn']],
 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]],
 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]}

# Data preparing

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base", add_prefix_space=True)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], padding = "max_length", truncation = True, is_split_into_words = True)

In [8]:
tokenized_datasets_ = dataset.map(tokenize_function, batched = True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [9]:
len(tokenized_datasets_["train"][0]["input_ids"]) == len(tokenized_datasets_["train"][0]["ner_tags"])

False

Adjusting labels to fit input

In [10]:
def tokenize_adjust_labels(samples):
    tokenized_samples = tokenizer.batch_encode_plus(samples["tokens"], is_split_into_words = True, truncation = True)

    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        prev_wid = -1

        word_ids_list = tokenized_samples.word_ids(batch_index = k)

        existing_label_ids = samples["ner_tags"][k]

        i = -1

        adjusted_label_ids = []

        for word_idx in word_ids_list:
            if word_idx is None:
                adjusted_label_ids.append(-100)
            elif word_idx != prev_wid:
                i += 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = word_idx
            else:
                label_name = labels_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        
        total_adjusted_labels.append(adjusted_label_ids)
            
    tokenized_samples["labels"] = total_adjusted_labels

    return tokenized_samples


In [11]:
out = tokenizer("Fine tune NER with BERT")
out

{'input_ids': [0, 14321, 8859, 234, 2076, 19, 163, 18854, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
out.word_ids(0)

[None, 0, 1, 2, 2, 3, 4, 4, None]

In [13]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched = True, remove_columns = ["tokens", "ner_tags", "pos_tags", "chunk_tags"])

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [15]:
tokenized_dataset["train"][:2]

{'id': ['0', '1'],
 'input_ids': [[0, 1281, 24020, 1859, 486, 7, 13978, 1089, 17988, 479, 2],
  [0, 2155, 20809, 2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100], [-100, 1, 2, -100]]}

Padding token to a constant length

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine-tuning distilroberta-base for NER task

In [17]:
from transformers import AutoModelForTokenClassification

In [18]:
model = AutoModelForTokenClassification.from_pretrained("distilroberta-base", num_labels = len(labels_names))

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream tas

## Computing metrics 

In [19]:
import numpy as np
import evaluate

In [20]:
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [21]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis = 2)

    true_predictions = [
        [labels_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [labels_names[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions = true_predictions, references = true_labels)

    return {
        "precision" : results["overall_precision"],
        "recall" : results["overall_recall"],
        "f1" : results["overall_f1"],
        "accuracy" : results["overall_accuracy"]
    }

## Using Trainer API to fine-tune

In [22]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

In [23]:
batch_size = 16
logging_steps = len(tokenized_dataset["train"]) // batch_size
epochs = 7
weight_decay = .01
eval_steps = 100
learning_rate = 2e-5
early_stopping_patience = 3

In [24]:
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    learning_rate = learning_rate,
    logging_steps=logging_steps,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = eval_steps,
    save_total_limit = 5,
    metric_for_best_model = "eval_loss",
    load_best_model_at_end = True
) 

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
)

In [26]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.244095,0.644077,0.705516,0.673398,0.931254
200,No log,0.160384,0.796768,0.824235,0.810269,0.959038
300,No log,0.109822,0.874145,0.893464,0.883699,0.972615
400,No log,0.104148,0.895884,0.896392,0.896138,0.974165
500,No log,0.085463,0.898077,0.91292,0.905438,0.977156
600,No log,0.083189,0.902691,0.925198,0.913806,0.978552
700,No log,0.080418,0.890777,0.925859,0.907979,0.97688
800,No log,0.070595,0.912286,0.92926,0.920694,0.981222
900,0.171400,0.068871,0.91141,0.927937,0.919599,0.981022
1000,0.171400,0.062108,0.926989,0.936532,0.931736,0.983569


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: id. If id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: id. If id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-1000 (score: 0.06210839003324509).


TrainOutput(global_step=1300, training_loss=0.135420168363131, metrics={'train_runtime': 212.3857, 'train_samples_per_second': 462.776, 'train_steps_per_second': 28.938, 'total_flos': 253080288625626.0, 'train_loss': 0.135420168363131, 'epoch': 1.48})

## Testing

In [27]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)

true_predictions = [
    [labels_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [labels_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: id. If id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 16


In [28]:
results

{'LOC': {'precision': 0.9044632976761343,
  'recall': 0.9014705882352941,
  'f1': 0.9029644632664335,
  'number': 2720},
 'MISC': {'precision': 0.6971770744225834,
  'recall': 0.7105492589363557,
  'f1': 0.7037996545768567,
  'number': 1147},
 'ORG': {'precision': 0.8752955082742316,
  'recall': 0.9158936301793444,
  'f1': 0.8951344817165306,
  'number': 3234},
 'PER': {'precision': 0.9534227240649259,
  'recall': 0.9427773900907188,
  'f1': 0.9480701754385965,
  'number': 2866},
 'overall_precision': 0.8844325609031491,
 'overall_recall': 0.8960569880606,
 'overall_f1': 0.8902068278096187,
 'overall_accuracy': 0.9701836382922302}

In [29]:
true_predictions

[['O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',


In [30]:
true_labels

[['O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  '

In [35]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [36]:
y_pred = []
y = []
for sent in true_predictions:
    for tag in sent:
        y_pred.append(tag)
for sent in true_labels:
    for tag in sent:
        y.append(tag)


In [37]:
print(classification_report(y_pred, y))

              precision    recall  f1-score   support

       B-LOC       0.91      0.91      0.91      2707
      B-MISC       0.73      0.73      0.73      1142
       B-ORG       0.92      0.89      0.91      3360
       B-PER       0.95      0.96      0.95      2827
       I-LOC       0.82      0.85      0.83       327
      I-MISC       0.69      0.47      0.56       427
       I-ORG       0.93      0.86      0.90      1327
       I-PER       0.99      0.96      0.98      2640
           O       0.99      1.00      0.99     45579

    accuracy                           0.97     60336
   macro avg       0.88      0.85      0.86     60336
weighted avg       0.97      0.97      0.97     60336



In [38]:
print(f"Accuracy = {accuracy_score(y_pred, y):.6f}, \
        Precision (weighted) = {precision_score(y_pred, y, average = 'weighted'):.6f}, \
        Recall (weighted) = {recall_score(y_pred, y, average = 'weighted'):.6f}, \
        F1 (weighted) = {f1_score(y_pred, y, average = 'weighted'):.6f}")

Accuracy = 0.970184,         Precision (weighted) = 0.969406,         Recall (weighted) = 0.970184,         F1 (weighted) = 0.969575


In [39]:
from sklearn.metrics import confusion_matrix

In [40]:
print(confusion_matrix(y_pred, y))

[[ 2464    92    83    28     8     0     3     0    29]
 [   58   837    78    13     1    13     1     0   141]
 [  125   105  2991    58     0     3     8     1    69]
 [   27    10    34  2712     0     0     0     3    41]
 [    8     0     0     0   279     8    20     1    11]
 [    5    19     0     0     1   201    17     1   183]
 [   11     2    17     2    39    33  1146    18    59]
 [    0     7     0    24     8     5    18  2537    41]
 [   22    75    31    29     6    28    18     0 45370]]


In [41]:
def print_clarification(idx):
    for sentence, prediction, ground_truth in zip(dataset["test"][idx]["tokens"], true_predictions[idx], true_labels[idx]):
        print(sentence, prediction, ground_truth)

In [42]:
print_clarification(0)

SOCCER O O
- O O
JAPAN O O
GET O O
LUCKY B-LOC B-LOC
WIN B-LOC B-LOC
, B-LOC B-LOC
CHINA O O
IN O O
SURPRISE O O
DEFEAT O O
. O O


In [43]:
print_clarification(10)

Takuya B-PER B-PER
Takagi B-PER B-PER
scored I-PER I-PER
the I-PER I-PER
winner O O
in O O
the O O
88th O O
minute O O
, O O
rising O O
to O O
head O O
a O O
Hiroshige O O
Yanagimoto O O
cross O O
towards B-PER B-PER
the B-PER B-PER
Syrian I-PER I-PER
goal I-PER I-PER
which I-PER I-PER
goalkeeper O O
Salem O O
Bitar O O
appeared B-MISC B-MISC
to O O
have O O
covered O O
but B-PER B-PER
then I-PER I-PER
allowed I-PER I-PER
to O O
slip O O
into O O
the O O
net O O
. O O


In [44]:
print_clarification(20)

Hosts O O
UAE O O
play B-LOC B-LOC
Kuwait O O
and B-LOC B-LOC
South O O
Korea B-LOC B-LOC
take I-LOC I-LOC
on O O
Indonesia O O
on B-LOC B-LOC
Saturday O O
in O O
Group O O
A B-MISC O
matches I-MISC O
. O O
