The task is to try to finetune the part of speech tagging pipeline in Transformers.

In [1]:
import json
import nltk
import numpy as np
from nltk.corpus import brown
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification,\
AutoModelForTokenClassification, TrainingArguments, Trainer

In [2]:
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /Users/valentine/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/valentine/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
# use the Brown corpus
corpus = brown.tagged_sents(tagset='universal')

In [4]:
# a sample from the corpus
corpus[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

In [5]:
# split tags and words
inputs = []
targets = []

for sentence_tag_pairs in corpus:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)

In [6]:
# convert to json
with open('data.json', 'w') as f:
    for x, y in zip(inputs, targets):
        j = {'inputs': x, 'targets': y}
        s = json.dumps(j)
        f.write(f"{s}\n")

In [7]:
# load the dataset
data = load_dataset('json', data_files='data.json')

Using custom data configuration default-88f9b318b5682e5c


Downloading and preparing dataset json/default to /Users/valentine/.cache/huggingface/datasets/json/default-88f9b318b5682e5c/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/valentine/.cache/huggingface/datasets/json/default-88f9b318b5682e5c/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# check the dataset structure
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [9]:
# use a sample from the dataset
small = data['train'].shuffle().select(range(20000))
small

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 20000
})

In [10]:
# make train and test sets
data = small.train_test_split()

In [11]:
data['train'][0]

{'inputs': ['She',
  'had',
  'a',
  'dried-out',
  'quality',
  '--',
  'a',
  'gray',
  ',',
  'lean',
  'woman',
  ',',
  'not',
  'unattractive',
  '.'],
 'targets': ['PRON',
  'VERB',
  'DET',
  'ADJ',
  'NOUN',
  '.',
  'DET',
  'ADJ',
  '.',
  'ADJ',
  'NOUN',
  '.',
  'ADV',
  'ADJ',
  '.']}

In [12]:
data['train'].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [13]:
# get the set of tags
target_set = set()
for target in targets:
    target_set = target_set.union(target)
target_set

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [14]:
# map ids to labels and labels to ids
target_list = list(target_set)
id2label = {k: v for k, v in enumerate(target_list)}
label2id = {v: k for k, v in id2label.items()}

In [15]:
# create a checkpoint and a tokenizer
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [16]:
# tokenize a sample sentence
idx = 0
t = tokenizer(data['train'][idx]['inputs'], is_split_into_words=True)
t

{'input_ids': [101, 1153, 1125, 170, 9490, 118, 1149, 3068, 118, 118, 170, 5021, 117, 8290, 1590, 117, 1136, 8362, 19934, 19366, 3946, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
t.tokens()

['[CLS]',
 'She',
 'had',
 'a',
 'dried',
 '-',
 'out',
 'quality',
 '-',
 '-',
 'a',
 'gray',
 ',',
 'lean',
 'woman',
 ',',
 'not',
 'un',
 '##att',
 '##rac',
 '##tive',
 '.',
 '[SEP]']

In [18]:
t.word_ids()

[None,
 0,
 1,
 2,
 3,
 3,
 3,
 4,
 5,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 13,
 13,
 13,
 14,
 None]

In [19]:
# create a function to align labels
def align_targets(labels, word_ids):
    aligned_labels = []
    for word in word_ids:
        if word is None:
            label = -100
        else:
            label = label2id[labels[word]]
        aligned_labels.append(label)
    return aligned_labels

In [20]:
labels = data['train'][idx]['targets']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[-100,
 5,
 0,
 9,
 4,
 4,
 4,
 10,
 11,
 11,
 9,
 4,
 11,
 4,
 10,
 11,
 2,
 4,
 4,
 4,
 4,
 11,
 -100]

In [21]:
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
    print(x, y, sep='\t')

[CLS]	None
She	PRON
had	VERB
a	DET
dried	ADJ
-	ADJ
out	ADJ
quality	NOUN
-	.
-	.
a	DET
gray	ADJ
,	.
lean	ADJ
woman	NOUN
,	.
not	ADV
un	ADJ
##att	ADJ
##rac	ADJ
##tive	ADJ
.	.
[SEP]	None


In [22]:
# create a function to tokenize data batches
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(batch['inputs'], truncation=True, is_split_into_words=True)
    labels_batch = batch['targets']
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i)
        aligned_labels_batch.append(align_targets(labels, word_ids))
        
    tokenized_inputs['labels'] = aligned_labels_batch

    return tokenized_inputs

In [23]:
# tokenize datasets
tokenized_datasets = data.map(tokenize_fn, batched=True, remove_columns=data['train'].column_names)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [24]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [25]:
# use the data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [26]:
def flatten(list_of_lists):
    return [val for sublist in list_of_lists for val in sublist]

In [27]:
# create a function to compute evaluation metrics
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    labels_jagged = [[t for t in label if t != -100] for label in labels]
    pred_jagged = [[p for p, t in zip(ps, ts) if t != -100] for ps, ts in zip(predictions, labels)]
    labels_flat = flatten(labels_jagged)
    predictions_flat = flatten(pred_jagged)
    acc = accuracy_score(labels_flat, predictions_flat)
    f1 = f1_score(labels_flat, predictions_flat, average='macro')
    return {'f1': f1, 'accuracy': acc}

In [28]:
# test the function on dummy data
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
  [0.8, 0.1, 0.1],
  [0.8, 0.1, 0.1],
  [0.8, 0.1, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
]])
compute_metrics((logits, labels))

{'f1': 0.6, 'accuracy': 0.8}

In [29]:
# use the model with our checkpoint
model = AutoModelForTokenClassification.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [30]:
# define training arguments
training_args = TrainingArguments('distilbert-finetuning-ner',
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  num_train_epochs=2)

In [31]:
# use the trainer and train the model
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=tokenized_datasets['train'],
                  eval_dataset=tokenized_datasets['test'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)
trainer.train()

***** Running training *****
  Num examples = 15000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3750
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0685,0.053155,0.943835,0.984382
2,0.0256,0.050509,0.960914,0.985831


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to distilbert-finetuning-ner/checkpoint-1875
Configuration saved in distilbert-finetuning-ner/checkpoint-1875/config.json
Model weights saved in distilbert-finetuning-ner/checkpoint-1875/pytorch_model.bin
tokenizer config file saved in distilbert-finetuning-ner/checkpoint-1875/tokenizer_config.json
Special tokens file saved in distilbert-finetuning-ner/checkpoint-1875/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to distilbert-finetuning-ner/checkpoint-3750
Configuration saved in distilbert-finetuning-ner/checkpoint-3750/config.json
Model weights saved in distilbert-finetuning-ner/checkpoint-3750/pytorch_model.bin
tokenizer config file saved in distilbert-finetuning-ner/checkpoint-3750/tokenizer_config.json
Special tokens file saved in distilbert-finetuning-ner/checkpoint-3750/special_tokens_map.json


Training completed. 

TrainOutput(global_step=3750, training_loss=0.06804031995137533, metrics={'train_runtime': 2962.5117, 'train_samples_per_second': 10.127, 'train_steps_per_second': 1.266, 'total_flos': 383942507032512.0, 'train_loss': 0.06804031995137533, 'epoch': 2.0})

In [32]:
# save the model
trainer.save_model('pos_model')

Saving model checkpoint to pos_model
Configuration saved in pos_model/config.json
Model weights saved in pos_model/pytorch_model.bin
tokenizer config file saved in pos_model/tokenizer_config.json
Special tokens file saved in pos_model/special_tokens_map.json


In [33]:
# use the pipeline with our model
pos_tagger = pipeline('token-classification', model='pos_model')

loading configuration file pos_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "pos_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "VERB",
    "1": "PRT",
    "2": "ADV",
    "3": "NUM",
    "4": "ADJ",
    "5": "PRON",
    "6": "X",
    "7": "ADP",
    "8": "CONJ",
    "9": "DET",
    "10": "NOUN",
    "11": "."
  },
  "initializer_range": 0.02,
  "label2id": {
    ".": 11,
    "ADJ": 4,
    "ADP": 7,
    "ADV": 2,
    "CONJ": 8,
    "DET": 9,
    "NOUN": 10,
    "NUM": 3,
    "PRON": 5,
    "PRT": 1,
    "VERB": 0,
    "X": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "tr

In [34]:
# test the pipeline on a sentence
pos_tagger('Bill Gates was the CEO of Microsoft in Seattle, Washington.')

[{'entity': 'NOUN',
  'score': 0.99970156,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'NOUN',
  'score': 0.99974734,
  'index': 2,
  'word': 'Gates',
  'start': 5,
  'end': 10},
 {'entity': 'VERB',
  'score': 0.99977285,
  'index': 3,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity': 'DET',
  'score': 0.99989676,
  'index': 4,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity': 'NOUN',
  'score': 0.99971145,
  'index': 5,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity': 'ADP',
  'score': 0.99987173,
  'index': 6,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity': 'NOUN',
  'score': 0.99975616,
  'index': 7,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity': 'ADP',
  'score': 0.99984086,
  'index': 8,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity': 'NOUN',
  'score': 0.99981636,
  'index': 9,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity': '.',
  'score': 0.9998752,
  'index': 10,
  'word': ',',