The task is to fine-tune a pretrained model for named entity recognition (NER) with Hugging Face Transformers and then use it in a token-classification pipeline.

In [1]:
import numpy as np
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification, \
AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric

In [2]:
# load the CoNLL dataset
data = load_dataset('conll2003')

Found cached dataset conll2003 (/Users/valentine/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# check the dataset structure
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# data sample
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
# data features
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [6]:
# extract label names
label_names = data["train"].features['ner_tags'].feature.names

In [7]:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
# create a checkpoint and a tokenizer
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
# tokenize the sample
idx = 0
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [11]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [12]:
# map the id of every B-<tag> label to the id of the matching I-<tag> label
# (ids are positions in label_names: B-PER 1 -> I-PER 2, B-ORG 3 -> I-ORG 4,
#  B-LOC 5 -> I-LOC 6, B-MISC 7 -> I-MISC 8)
begin2inside = {1: 2, 3: 4, 5: 6, 7: 8}

In [13]:
# create a function to align labels
def align_targets(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids, one per word of the sentence.
        word_ids: for every token, the index of the word it belongs to,
            or None for tokens that belong to no word (e.g. [CLS]).

    Returns:
        A list with one entry per item of ``word_ids``: -100 where the
        word id is None, the word's label for the first token of a word,
        and for the following tokens of the same word the word's label
        with B-<tag> ids swapped for I-<tag> ids via ``begin2inside``.
    """
    token_targets = []
    previous = None
    for current in word_ids:
        if current is None:
            # token without a source word
            target = -100
        else:
            target = labels[current]
            if current == previous:
                # continuation piece of the previous word: B-<tag> becomes I-<tag>,
                # any other label is kept as is
                target = begin2inside.get(target, target)
        token_targets.append(target)
        previous = current
    return token_targets

In [14]:
# try out the function
labels = data['train'][idx]['ner_tags']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
print(aligned_targets)

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [15]:
# show every token next to the name of its aligned label (None for ignored positions)
aligned_labels = [None if target < 0 else label_names[target] for target in aligned_targets]
for token, tag in zip(t.tokens(), aligned_labels):
    print(token, tag, sep='\t')

[CLS]	None
EU	B-ORG
rejects	O
German	B-MISC
call	O
to	O
boycott	O
British	B-MISC
la	O
##mb	O
.	O
[SEP]	None


In [16]:
# test the function on a fake input
words = ['[CLS]', 'Ger', '##man', 'call', 'to', 'boycott', 'Micro', '##soft', '[SEP]']
word_ids = [None, 0, 0, 1, 2, 3, 4, 4, None]
labels = [7, 0, 0, 0, 3]
aligned_targets = align_targets(labels, word_ids)
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_targets]
for x, y in zip(words, aligned_labels):
    print(x, y, sep='\t')

[CLS]	None
Ger	B-MISC
##man	I-MISC
call	O
to	O
boycott	O
Micro	B-ORG
##soft	I-ORG
[SEP]	None


In [17]:
# create a tokenizer function
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: mapping with a 'tokens' entry (the word lists passed to the
            tokenizer) and an 'ner_tags' entry (one label list per sentence).

    Returns:
        The tokenizer output for the batch with an extra 'labels' entry
        holding, for every sentence, the result of ``align_targets`` on
        that sentence's labels and word ids.
    """
    encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)
    # word_ids(pos) gives the token -> word mapping of the pos-th sentence in the batch
    encoded['labels'] = [
        align_targets(sentence_tags, encoded.word_ids(pos))
        for pos, sentence_tags in enumerate(batch['ner_tags'])
    ]
    return encoded

In [18]:
# tokenize datasets
tokenized_datasets = data.map(tokenize_fn, batched=True, remove_columns=data['train'].column_names)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [20]:
# use the data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [21]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels']

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [22]:
# use metrics for sequence labeling evaluation
# (the sample call in the next cell shows the result layout: one entry per
#  entity type plus overall precision / recall / f1 / accuracy)
# NOTE(review): `load_metric` is deprecated in later `datasets` releases in
# favour of `evaluate.load` — confirm against the installed version
metric = load_metric('seqeval')

  


In [23]:
# test the metrics
metric.compute(predictions=[['O', 'O', 'I-ORG', 'B-MISC']], references=[['O', 'B-ORG', 'I-ORG', 'B-MISC']])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.75}

In [24]:
# create a function to compute metrics
def compute_metrics(logits_and_labels):
    """Score predictions with the seqeval metric.

    Args:
        logits_and_labels: a pair ``(logits, labels)``. The predicted label
            id is the argmax of ``logits`` over its last axis; ``labels``
            holds the reference label ids, with -100 marking positions to
            ignore.  (assumes both are batch-first and aligned per token —
            TODO confirm against the Trainer)

    Returns:
        dict with the overall 'precision', 'recall', 'f1' and 'accuracy'
        reported by ``metric.compute``.
    """
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)

    # reference tag names, ignored positions removed
    reference_tags = []
    for label_row in labels:
        reference_tags.append([label_names[t] for t in label_row if t != -100])

    # predicted tag names, kept only where the reference is not ignored
    predicted_tags = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for p, t in zip(pred_row, label_row):
            if t != -100:
                kept.append(label_names[p])
        predicted_tags.append(kept)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)

    reported = (
        ('precision', 'overall_precision'),
        ('recall', 'overall_recall'),
        ('f1', 'overall_f1'),
        ('accuracy', 'overall_accuracy'),
    )
    return {name: scores[source_key] for name, source_key in reported}

In [25]:
# map ids to labels and labels to ids
# id -> label name (position in label_names) and the inverse lookup
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [26]:
# use the model with our checkpoint
model = AutoModelForTokenClassification.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [27]:
# define training arguments
training_args = TrainingArguments('distilbert-finetuned-ner', evaluation_strategy='epoch', save_strategy='epoch',
                                 learning_rate=2e-5, num_train_epochs=3, weight_decay=0.01)

In [28]:
# use the trainer
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_datasets['train'],
                  eval_dataset=tokenized_datasets['validation'], data_collator=data_collator,
                  compute_metrics=compute_metrics, tokenizer=tokenizer)

In [29]:
# train the model
trainer.train()

***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1023,0.083321,0.895441,0.912319,0.903801,0.976276
2,0.0464,0.072663,0.904389,0.932851,0.918399,0.982178
3,0.0261,0.072513,0.909655,0.937058,0.923153,0.982737


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner/checkpoint-1756
Configuration saved in distilbert-finetuned-ner/checkpoint-1756/config.json
Model weights saved in distilbert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner/checkpoint-1756/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner/checkpoint-3512
Configuration saved in distilbert-finetuned-ner/checkpoint-3512/config.json
Model weights saved in distilbert-finetuned-ner/checkpoint-3512/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner/checkpoint-3512/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner/checkpoint-3512/special_tokens_map.json
***** Running Evaluation *****
 

TrainOutput(global_step=5268, training_loss=0.07993725220543377, metrics={'train_runtime': 3744.2139, 'train_samples_per_second': 11.25, 'train_steps_per_second': 1.407, 'total_flos': 462023079274890.0, 'train_loss': 0.07993725220543377, 'epoch': 3.0})

In [30]:
# save the model
trainer.save_model('ner_model')

Saving model checkpoint to ner_model
Configuration saved in ner_model/config.json
Model weights saved in ner_model/pytorch_model.bin
tokenizer config file saved in ner_model/tokenizer_config.json
Special tokens file saved in ner_model/special_tokens_map.json


In [31]:
# load the pipeline with our model
ner = pipeline('token-classification', model='ner_model', aggregation_strategy='simple')

loading configuration file ner_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "ner_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 28996
}

loadin

In [32]:
# test the pipeline on a sentence
ner('Bill Gates was the CEO of Microsoft in Seattle, Washington.')

[{'entity_group': 'PER',
  'score': 0.99953294,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9981229,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9993242,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.9989794,
  'word': 'Washington',
  'start': 48,
  'end': 58}]