# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install datasets -q

[K     |████████████████████████████████| 7.6 MB 15.6 MB/s 
[K     |████████████████████████████████| 5.8 MB 17.8 MB/s 
[K     |████████████████████████████████| 182 kB 28.4 MB/s 
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 451 kB 31.4 MB/s 
[K     |████████████████████████████████| 132 kB 73.5 MB/s 
[K     |████████████████████████████████| 212 kB 78.3 MB/s 
[K     |████████████████████████████████| 127 kB 77.1 MB/s 
[?25h

In [2]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
labels_names = dataset["train"].features["ner_tags"].feature.names

In [4]:
labels_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
dataset["train"][:2]

{'id': ['0', '1'],
 'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn']],
 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]],
 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]}

# Data preparing

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], padding = "max_length", truncation = True, is_split_into_words = True)

In [8]:
tokenized_datasets_ = dataset.map(tokenize_function, batched = True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [9]:
len(tokenized_datasets_["train"][0]["input_ids"]) == len(tokenized_datasets_["train"][0]["ner_tags"])

False

Adjusting labels to fit input

In [10]:
def tokenize_adjust_labels(samples):
    tokenized_samples = tokenizer.batch_encode_plus(samples["tokens"], is_split_into_words = True, truncation = True)

    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        prev_wid = -1

        word_ids_list = tokenized_samples.word_ids(batch_index = k)

        existing_label_ids = samples["ner_tags"][k]

        i = -1

        adjusted_label_ids = []

        for word_idx in word_ids_list:
            if word_idx is None:
                adjusted_label_ids.append(-100)
            elif word_idx != prev_wid:
                i += 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = word_idx
            else:
                label_name = labels_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        
        total_adjusted_labels.append(adjusted_label_ids)
            
    tokenized_samples["labels"] = total_adjusted_labels

    return tokenized_samples


In [11]:
out = tokenizer("Fine tune NER with BERT")
out

{'input_ids': [101, 2986, 8694, 11265, 2099, 2007, 14324, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
out.word_ids(0)

[None, 0, 1, 2, 2, 3, 4, None]

In [13]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched = True, remove_columns = ["tokens", "ner_tags", "pos_tags", "chunk_tags"])

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [15]:
tokenized_dataset["train"][:2]

{'id': ['0', '1'],
 'input_ids': [[101,
   7327,
   19164,
   2446,
   2655,
   2000,
   17757,
   2329,
   12559,
   1012,
   102],
  [101, 2848, 13934, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100], [-100, 1, 2, -100]]}

Padding token to a constant length

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine-tuning BERT for NER task

In [17]:
from transformers import AutoModelForTokenClassification

In [18]:
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = len(labels_names))

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

## Computing metrics 

In [19]:
import numpy as np
from datasets import load_metric

In [20]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [21]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis = 2)

    true_predictions = [
        [labels_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [labels_names[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions = true_predictions, references = true_labels)

    return {
        "precision" : results["overall_precision"],
        "recall" : results["overall_recall"],
        "f1" : results["overall_f1"],
        "accuracy" : results["overall_accuracy"]
    }

## Using Trainer API to fine-tune

In [22]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

In [23]:
batch_size = 16
logging_steps = len(tokenized_dataset["train"]) // batch_size
epochs = 7
weight_decay = .01
eval_steps = 100
learning_rate = 2e-5
early_stopping_patience = 3

In [24]:
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    learning_rate = learning_rate,
    logging_steps=logging_steps,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = eval_steps,
    save_total_limit = 5,
    metric_for_best_model = "eval_loss",
    load_best_model_at_end = True
) 

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
)

In [26]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.242781,0.644134,0.681172,0.662136,0.93512
200,No log,0.157578,0.772102,0.80311,0.787301,0.959569
300,No log,0.117596,0.868501,0.865197,0.866846,0.970213
400,No log,0.091247,0.876586,0.896297,0.886332,0.975027
500,No log,0.081036,0.886267,0.907484,0.89675,0.977139
600,No log,0.07922,0.898433,0.917329,0.907783,0.978808
700,No log,0.076991,0.885859,0.919454,0.902344,0.97768
800,No log,0.072795,0.894046,0.922251,0.90793,0.979522
900,0.172100,0.068278,0.907658,0.921468,0.914511,0.980762
1000,0.172100,0.069756,0.914197,0.921356,0.917762,0.981111


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: id. If id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: id. If id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: id. If id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 16
The fol

TrainOutput(global_step=3600, training_loss=0.06526716222365697, metrics={'train_runtime': 574.5474, 'train_samples_per_second': 171.069, 'train_steps_per_second': 10.697, 'total_flos': 698252599489032.0, 'train_loss': 0.06526716222365697, 'epoch': 4.1})

## Testing

In [27]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)

true_predictions = [
    [labels_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [labels_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

The following columns in the test set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: id. If id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 16


In [28]:
results

{'LOC': {'precision': 0.9048734770384255,
  'recall': 0.9091337099811676,
  'f1': 0.9069985908877407,
  'number': 2124},
 'MISC': {'precision': 0.7878128400435256,
  'recall': 0.7269076305220884,
  'f1': 0.7561357702349869,
  'number': 996},
 'ORG': {'precision': 0.8412408759124088,
  'recall': 0.8906491499227203,
  'f1': 0.8652402402402404,
  'number': 2588},
 'PER': {'precision': 0.9602698650674663,
  'recall': 0.9426048565121413,
  'f1': 0.9513553657630895,
  'number': 2718},
 'overall_precision': 0.889020210377024,
 'overall_recall': 0.8927130310942322,
 'overall_f1': 0.8908627938651033,
 'overall_accuracy': 0.9750535184034251}

In [29]:
true_predictions

[['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',


In [30]:
true_labels

[['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',


In [31]:
from sklearn.metrics import classification_report

In [33]:
y_pred = []
y = []
for sent in true_predictions:
    for tag in sent:
        y_pred.append(tag)
for sent in true_labels:
    for tag in sent:
        y.append(tag)


In [35]:
print(classification_report(y_pred, y))

              precision    recall  f1-score   support

       B-LOC       0.91      0.92      0.92      2107
      B-MISC       0.75      0.83      0.79       905
       B-ORG       0.91      0.87      0.89      2696
       B-PER       0.94      0.97      0.96      2655
       I-LOC       0.86      0.78      0.82       306
      I-MISC       0.66      0.66      0.66       241
       I-ORG       0.88      0.85      0.86      1081
       I-PER       0.99      0.98      0.99      2523
           O       0.99      0.99      0.99     45410

    accuracy                           0.98     57924
   macro avg       0.88      0.87      0.88     57924
weighted avg       0.98      0.98      0.98     57924



In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
print(confusion_matrix(y_pred, y))

[[ 1941    38    72    13     4     2     3     0    34]
 [   31   750    46     9     0    12     0     0    57]
 [   93    72  2353    78     0     5     9     0    86]
 [    7    17    35  2567     0     0     0     1    28]
 [   21     2     0     0   239     4    24     1    15]
 [    2    10     0     0     1   160     4     1    63]
 [    7     1    19     6    23    13   921    19    72]
 [    1     0     0    18     2     6    17  2476     3]
 [   21   106    63    27     8    39    72     2 45072]]
