The task is to explore named entity recognition pipeline in Transformers.

In [1]:
import pickle
from transformers import pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# load the pipeline
ner = pipeline('ner', aggregation_strategy='simple')

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
# download CoNLL 2003 datasets
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_train.pkl
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_test.pkl

File ‘ner_train.pkl’ already there; not retrieving.

File ‘ner_test.pkl’ already there; not retrieving.



In [4]:
# create train and test corpora
with open('ner_train.pkl', 'rb') as f:
    corpus_train = pickle.load(f)
    
with open('ner_test.pkl', 'rb') as f:
    corpus_test = pickle.load(f)

In [5]:
corpus_test[0]

[('CRICKET', 'O'),
 ('-', 'O'),
 ('LEICESTERSHIRE', 'B-ORG'),
 ('TAKE', 'O'),
 ('OVER', 'O'),
 ('AT', 'O'),
 ('TOP', 'O'),
 ('AFTER', 'O'),
 ('INNINGS', 'O'),
 ('VICTORY', 'O'),
 ('.', 'O')]

In [6]:
# split tags and words
inputs = []
targets = []

for sentence_tag_pairs in corpus_test:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)

In [7]:
inputs[5]

['Essex',
 ',',
 'however',
 ',',
 'look',
 'certain',
 'to',
 'regain',
 'their',
 'top',
 'spot',
 'after',
 'Nasser',
 'Hussain',
 'and',
 'Peter',
 'Such',
 'gave',
 'them',
 'a',
 'firm',
 'grip',
 'on',
 'their',
 'match',
 'against',
 'Yorkshire',
 'at',
 'Headingley',
 '.']

In [8]:
# use detokenizer to make the whole sentence
detokenizer = TreebankWordDetokenizer()

In [9]:
detokenizer.detokenize(inputs[5])

'Essex, however, look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire at Headingley.'

In [10]:
targets[5]

['B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'B-LOC',
 'O']

In [11]:
# use the pipeline
ner(detokenizer.detokenize(inputs[5]))

[{'entity_group': 'ORG',
  'score': 0.99925417,
  'word': 'Essex',
  'start': 0,
  'end': 5},
 {'entity_group': 'PER',
  'score': 0.9996957,
  'word': 'Nasser Hussain',
  'start': 60,
  'end': 74},
 {'entity_group': 'PER',
  'score': 0.999509,
  'word': 'Peter Such',
  'start': 79,
  'end': 89},
 {'entity_group': 'ORG',
  'score': 0.9989281,
  'word': 'Yorkshire',
  'start': 135,
  'end': 144},
 {'entity_group': 'LOC',
  'score': 0.99204224,
  'word': 'Headingley',
  'start': 148,
  'end': 158}]

In [12]:
# create a function to map pipeline result to list of tags for later performance assessment
def compute_prediction(tokens, input_, ner_result):
    # tokens is the original tokenized sentence, input_ is the detokenized string
    predicted_tags = []
    # keep track of state
    state = 'O'
    current_index = 0
    for token in tokens:
        # find the token in the input_ (should be at or near the start)
        index = input_.find(token)
        assert(index >= 0)
        current_index += index
        # check if this index belongs to an entity and assign label
        tag = 'O'
        for entity in ner_result:
            if current_index >= entity['start'] and current_index < entity['end']:
                # this token belongs to an entity
                if state == 'O':
                    state = 'B'
                else:
                    state = 'I'
                tag = f"{state}-{entity['entity_group']}"
                break
        if tag == "O":
            # reset the state
            state = "O"
        predicted_tags.append(tag)
        # remove the token from input_
        input_ = input_[index + len(token):]
        # update current_index
        current_index += len(token)
    # sanity check    
    assert(len(predicted_tags) == len(tokens))
    return predicted_tags

In [13]:
input_ = detokenizer.detokenize(inputs[5])
ner_result = ner(input_)
pred_tags = compute_prediction(inputs[5], input_, ner_result)

In [14]:
accuracy_score(targets[5], pred_tags)

1.0

In [15]:
for target, prediction in zip(targets[5], pred_tags):
    print(target, prediction)

B-ORG B-ORG
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
B-PER B-PER
I-PER I-PER
O O
B-PER B-PER
I-PER I-PER
O O
O O
O O
O O
O O
O O
O O
O O
O O
B-ORG B-ORG
O O
B-LOC B-LOC
O O


In [16]:
# get detokenized inputs to pass into the model
detok_inputs = []
for tokens in inputs:
    text = detokenizer.detokenize(tokens)
    detok_inputs.append(text)

In [17]:
ner_results = ner(detok_inputs)

In [18]:
predictions = []
for tokens, text, ner_result in zip(inputs, detok_inputs, ner_results):
    pred = compute_prediction(tokens, text, ner_result)
    predictions.append(pred)

In [19]:
def flatten(list_of_lists):
    return [val for sublist in list_of_lists for val in sublist]

In [20]:
# flatten targets and predictions to calculate accuracy and F1
flat_predictions = flatten(predictions)
flat_targets = flatten(targets)

In [21]:
accuracy_score(flat_targets, flat_predictions)

0.9916563354782848

In [22]:
f1_score(flat_targets, flat_predictions, average='macro')

0.95403328229255