# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install datasets -q

In [2]:
from datasets import load_dataset

dataset = load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
labels_names = dataset["train"].features["ner_tags"].feature.names

In [4]:
labels_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
dataset["train"][:2]

{'id': ['0', '1'],
 'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn']],
 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]],
 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]}

# Data preparing

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], padding = "max_length", truncation = True, is_split_into_words = True)

In [8]:
tokenized_datasets_ = dataset.map(tokenize_function, batched = True)



  0%|          | 0/4 [00:00<?, ?ba/s]



In [9]:
len(tokenized_datasets_["train"][0]["input_ids"]) == len(tokenized_datasets_["train"][0]["ner_tags"])

False

Adjusting labels to fit input

In [10]:
def tokenize_adjust_labels(samples):
    tokenized_samples = tokenizer.batch_encode_plus(samples["tokens"], is_split_into_words = True, truncation = True)

    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        prev_wid = -1

        word_ids_list = tokenized_samples.word_ids(batch_index = k)

        existing_label_ids = samples["ner_tags"][k]

        i = -1

        adjusted_label_ids = []

        for word_idx in word_ids_list:
            if word_idx is None:
                adjusted_label_ids.append(-100)
            elif word_idx != prev_wid:
                i += 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = word_idx
            else:
                label_name = labels_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        
        total_adjusted_labels.append(adjusted_label_ids)
            
    tokenized_samples["labels"] = total_adjusted_labels

    return tokenized_samples


In [11]:
out = tokenizer("Fine tune NER with BERT")
out

{'input_ids': [101, 2986, 8694, 11265, 2099, 2007, 14324, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
out.word_ids(0)

[None, 0, 1, 2, 2, 3, 4, None]

In [13]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched = True, remove_columns = ["tokens", "ner_tags", "pos_tags", "chunk_tags"])



  0%|          | 0/4 [00:00<?, ?ba/s]



In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [15]:
tokenized_dataset["train"][:2]

{'id': ['0', '1'],
 'input_ids': [[101,
   7327,
   19164,
   2446,
   2655,
   2000,
   17757,
   2329,
   12559,
   1012,
   102],
  [101, 2848, 13934, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100], [-100, 1, 2, -100]]}

Padding token to a constant length

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine-tuning BERT for NER task

In [17]:
from transformers import AutoModelForTokenClassification

In [18]:
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = len(labels_names))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

## Computing metrics 

In [19]:
import numpy as np
from datasets import load_metric

In [20]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [21]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis = 2)

    true_predictions = [
        [labels_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [labels_names[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions = true_predictions, references = true_labels)

    return {
        "precision" : results["overall_precision"],
        "recall" : results["overall_recall"],
        "f1" : results["overall_f1"],
        "accuracy" : results["overall_accuracy"]
    }

## Using Trainer API to fine-tune

In [22]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

In [23]:

batch_size = 16
logging_steps = len(tokenized_dataset["train"]) // batch_size
epochs = 16
eval_steps = 50

In [24]:
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    logging_steps=logging_steps,

    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = eval_steps,
    save_total_limit = 5,
    metric_for_best_model = "eval_loss",
    load_best_model_at_end = True
) 

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [26]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.260645,0.645843,0.643025,0.644431,0.928861
100,No log,0.153495,0.80041,0.829064,0.814485,0.960935
150,No log,0.126832,0.816661,0.852109,0.834009,0.965415
200,No log,0.108113,0.849517,0.875937,0.862525,0.970086
250,No log,0.097337,0.867663,0.896297,0.881748,0.973549
300,No log,0.102966,0.884176,0.88388,0.884028,0.972771
350,No log,0.088903,0.878728,0.903009,0.890703,0.974772
400,No log,0.080314,0.890706,0.901667,0.896153,0.976663
450,No log,0.079424,0.902983,0.914196,0.908555,0.978696
500,No log,0.075079,0.888419,0.915651,0.901829,0.977838




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-1000 (score: 0.07352860271930695).


TrainOutput(global_step=1300, training_loss=0.10163768621591421, metrics={'train_runtime': 294.3251, 'train_samples_per_second': 763.292, 'train_steps_per_second': 47.73, 'total_flos': 252292187379834.0, 'train_loss': 0.10163768621591421, 'epoch': 1.48})

## Testing

In [27]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)

true_predictions = [
    [labels_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [labels_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

The following columns in the test set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: id. If id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3453
  Batch size = 16


In [28]:
results

{'LOC': {'precision': 0.9217171717171717,
  'recall': 0.8592278719397364,
  'f1': 0.8893762183235867,
  'number': 2124},
 'MISC': {'precision': 0.8387096774193549,
  'recall': 0.6526104417670683,
  'f1': 0.7340485601355167,
  'number': 996},
 'ORG': {'precision': 0.7869665894806483,
  'recall': 0.9192426584234931,
  'f1': 0.8479771876670824,
  'number': 2588},
 'PER': {'precision': 0.9538175046554935,
  'recall': 0.9422369389256806,
  'f1': 0.9479918563760873,
  'number': 2718},
 'overall_precision': 0.8761668439087794,
 'overall_recall': 0.8800142416330405,
 'overall_f1': 0.8780863283794185,
 'overall_accuracy': 0.9709619501415648}