# Setup

In [1]:
from os import path
import pandas as pd
from pprint import PrettyPrinter, pprint
from typing import Optional

# `_dh` is presumably IPython's directory-history list; its first entry is used here
# as the notebook's own directory (so this line only works inside an IPython kernel).
__DIR__ = globals()["_dh"][0]
data_dir = path.relpath(path.join(__DIR__, "..", "_data"))

# Shared pretty-printer used for the config dump and the predicted entities.
pp = PrettyPrinter(width=120, indent=2)

# Widen pandas' rendering so the list-valued columns shown below are cut later.
pd.set_option("display.max_colwidth", 90)
pd.set_option("display.width", 120)

In [2]:
# Settings
_colab_install = True   # attempt the Colab bootstrap when `google.colab` is importable
_testing = False        # when True, a later cell cuts training down to one epoch

# Parameters
# The pretrained tokenizer and model live side by side under <data_dir>/pretrain.
tokenizer_dir, model_dir = (
    path.join(data_dir, "pretrain", leaf) for leaf in ("tokenizer", "model")
)
# Output directory for the fine-tuned NER model and its checkpoints.
ner_dir = path.join(data_dir, "ner")

# Label id given to positions that carry no tag of their own (special tokens, padding,
# and the non-first sub-tokens of a word).
# https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a
null_label = -100
# Fixed length every processed sequence is padded / truncated to.
max_length = 128

# Number of random validation samples printed per training-time evaluation
# and in the final evaluation section, respectively.
n_train_eval_samps = 2
n_final_eval_samps = 10

# Keyword arguments forwarded verbatim to `TrainingArguments`.
training_args = {
    "optim": "adamw_torch",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 64,
    "eval_accumulation_steps": 10,

    # evaluate, log and checkpoint once per epoch
    "evaluation_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_strategy": "epoch",

    "save_total_limit": 3,
    "load_best_model_at_end": True,
}

## Process settings / parameters

In [3]:
# Test mode: overwrite the epoch count in place so training runs for a single epoch.
if _testing:
    training_args["num_train_epochs"] = 1

In [4]:
from collections import OrderedDict

# Colab bootstrap: runs only when `_colab_install` is set AND `google.colab` can be
# imported; on any other machine the import fails and the whole step is skipped.
if _colab_install:
    try:
        import google.colab
        
        # The install script is expected one level above the notebook directory.
        colab_install_script = path.join(__DIR__, "..", "colab_install.sh")

        # Download the script only when it is not already present on disk.
        if not path.isfile(colab_install_script):
            script_url = "https://raw.githubusercontent.com/yenson-lau/pii-remediation/main/colab_install.sh"
            !wget $script_url -O $colab_install_script

        # IPython shell escape: `$name` is replaced by the Python variable's value.
        !bash $colab_install_script
        print()

    except ModuleNotFoundError:
        # `google.colab` is not installed: deliberately do nothing.
        pass

# Snapshot of every tunable for this run, kept in a fixed order for display.
config = OrderedDict([
    ("tokenizer_dir", tokenizer_dir),
    ("model_dir", model_dir),
    ("ner_dir", ner_dir),

    ("null_label", null_label),
    ("max_length", max_length),
    ("n_train_eval_samps", n_train_eval_samps),
    ("n_final_eval_samps", n_final_eval_samps),

    ("training_args", training_args),
])

print("NER finetuning on conllpp dataset:")
pp.pprint(config)

--2022-10-15 03:38:00--  https://raw.githubusercontent.com/yenson-lau/pii-remediation/main/colab_install.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 414 [text/plain]
Saving to: ‘/content/../colab_install.sh’


2022-10-15 03:38:00 (26.6 MB/s) - ‘/content/../colab_install.sh’ saved [414/414]

[K     |████████████████████████████████| 441 kB 29.0 MB/s 
[K     |████████████████████████████████| 72 kB 1.4 MB/s 
[K     |████████████████████████████████| 5.3 MB 68.9 MB/s 
[K     |████████████████████████████████| 7.6 MB 76.0 MB/s 
[K     |████████████████████████████████| 115 kB 101.1 MB/s 
[K     |████████████████████████████████| 212 kB 69.3 MB/s 
[K     |████████████████████████████████| 163 kB 62.3 MB/s 
[K     |████████████████████████████████| 

# Load / process dataset

In [5]:
from datasets import load_dataset

# Fetch the conllpp dataset; its train / validation / test splits are used below.
dataset = load_dataset("conllpp")
# Number of distinct NER tag classes declared by the dataset's feature schema;
# later passed to the model as `num_labels`.
num_classes = dataset["train"].features["ner_tags"].feature.num_classes

# Preview the first five raw training examples.
display(pd.DataFrame(dataset["train"][:5]))

Downloading builder script:   0%|          | 0.00/8.73k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

Downloading and preparing dataset conllpp/conllpp (download: 4.63 MiB, generated: 9.78 MiB, post-processed: Unknown size, total: 14.41 MiB) to /root/.cache/huggingface/datasets/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/650k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conllpp downloaded and prepared to /root/.cache/huggingface/datasets/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, British, lamb, .]","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[5, 0]"
3,3,"[The, European, Commission, said, on, Thursday, it, disagreed, with, German, advice, t...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 1...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17, 11, 21, 17, 1...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,4,"[Germany, 's, representative, to, the, European, Union, 's, veterinary, committee, Wer...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 1...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0..."


In [6]:
import torch
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained(tokenizer_dir)

# Extra fields attached to every special token: it maps to no source word
# and its label is the ignored `null_label`.
spc_tok_attr = dict(word_ids=[None], labels=[null_label])

# One-token templates (each value is a single-element list) that process_dataset
# concatenates around / after the real tokens.
cls_token = dict(tokenizer(tokenizer.cls_token, add_special_tokens=False), **spc_tok_attr)
sep_token = dict(tokenizer(tokenizer.sep_token, add_special_tokens=False), **spc_tok_attr)
# Padding is additionally masked out of attention.
pad_token = dict(tokenizer(tokenizer.pad_token, add_special_tokens=False), **spc_tok_attr, attention_mask=[0])

def process_dataset(ds, null_label=null_label, max_length=max_length, add_special_tokens=True, num_proc=4):
    """Turn a word-level NER split into fixed-length, label-aligned model inputs.

    Each word of ``sample["tokens"]`` is tokenized on its own. The word's NER tag is
    placed on its first sub-token and ``null_label`` on the remaining ones. The
    per-word pieces are concatenated, optionally wrapped in the CLS / SEP templates,
    and finally padded or truncated to ``max_length``.

    Args:
        ds: dataset split providing ``tokens`` and ``ner_tags`` columns.
        null_label: label id for positions that carry no tag (continuation
            sub-tokens, special tokens, padding).
        max_length: final length of every sequence; ``None`` disables both
            padding and truncation.
        add_special_tokens: whether to wrap the sequence in CLS ... SEP.
        num_proc: number of worker processes handed to ``ds.map``.

    Returns:
        The result of ``ds.map``: the original columns are removed and replaced by
        the tokenizer outputs plus ``word_ids``, ``labels``, ``words`` (the original
        word list) and ``text`` (the decoded sequence without special tokens).
    """
    from itertools import chain

    n_special = 2 if add_special_tokens else 0

    # Re-label the special-token templates with this call's `null_label`, so a
    # non-default value is honoured for CLS / SEP / PAD as well.
    cls_piece, sep_piece, pad_piece = (
        {**template, "labels": [null_label]} for template in (cls_token, sep_token, pad_token)
    )

    def process_sample(sample):
        encoding = tokenizer(sample["tokens"], add_special_tokens=False)
        per_word_ids = encoding["input_ids"]

        # propagate word ids (based on words from sample["tokens"])
        encoding["word_ids"] = [[i] * len(ids) for i, ids in enumerate(per_word_ids)]

        # propagate ner tags as labels: tag on the first sub-token, null_label on the rest.
        # A word that yields no sub-tokens contributes no label either, which keeps
        # `labels` aligned with `input_ids`.
        encoding["labels"] = [([tag] + [null_label] * (len(ids) - 1)) if ids else []
                              for tag, ids in zip(sample["ner_tags"], per_word_ids)]

        # concat the per-word pieces (linear time, unlike sum(v, []))
        encoding = {k: list(chain.from_iterable(v)) for k, v in encoding.items()}

        # Truncate the *content* before wrapping it, so an over-long sample still
        # ends in SEP and `sum(attention_mask) - 2` stays the number of real tokens.
        if max_length is not None:
            content_budget = max(max_length - n_special, 0)
            encoding = {k: v[:content_budget] for k, v in encoding.items()}

        expected_encoding_length = len(encoding["input_ids"]) + n_special
        pad_length = 0 if max_length is None else max_length - expected_encoding_length

        features = {}
        for k, v in encoding.items():

            # append info from special_tokens
            if add_special_tokens:
                v = cls_piece[k] + v + sep_piece[k]

            # sanity check 1
            assert len(v) == expected_encoding_length, f"expected {k} of length {expected_encoding_length}, got {len(v)}"

            # padding (a non-positive pad_length pads nothing); the final slice only
            # bites when max_length is smaller than the special tokens themselves
            if max_length is not None:
                v = (v + pad_piece[k] * pad_length)[:max_length]

                # sanity check 2
                assert len(v) == max_length

            features[k] = v

        # provide concatenated text and a copy of the words
        features["words"] = sample["tokens"]
        features["text"] = tokenizer.decode(features["input_ids"], skip_special_tokens=True)

        return features

    return ds.map(process_sample, remove_columns=ds.features, num_proc=num_proc)

# Process the three splits in the same order as before: train, validation, test.
train_dataset, val_dataset, test_dataset = (
    process_dataset(dataset[split]) for split in ("train", "validation", "test")
)

# Preview the first five processed rows of the training and validation splits.
display(pd.DataFrame(train_dataset[:5]), pd.DataFrame(val_dataset[:5]))

       

#0:   0%|          | 0/3511 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/3510 [00:00<?, ?ex/s]

#2:   0%|          | 0/3510 [00:00<?, ?ex/s]

#3:   0%|          | 0/3510 [00:00<?, ?ex/s]

       

#0:   0%|          | 0/813 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/813 [00:00<?, ?ex/s]

#2:   0%|          | 0/812 [00:00<?, ?ex/s]

#3:   0%|          | 0/812 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/864 [00:00<?, ?ex/s]

   

#1:   0%|          | 0/863 [00:00<?, ?ex/s]

#2:   0%|          | 0/863 [00:00<?, ?ex/s]

#3:   0%|          | 0/863 [00:00<?, ?ex/s]

Unnamed: 0,input_ids,token_type_ids,attention_mask,word_ids,labels,words,text
0,"[2, 7946, 17106, 99, 940, 784, 184, 4743, 11828, 1030, 1418, 2616, 18, 3, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[None, 0, 1, 1, 2, 3, 4, 5, 5, 6, 7, 7, 8, None, None, None, None, None, None, None, N...","[-100, 3, 0, -100, 7, 0, 0, 0, -100, 7, 0, -100, 0, -100, -100, -100, -100, -100, -100...","[EU, rejects, German, call, to, boycott, British, lamb, .]",EU rejects German call to boycott British lamb.
1,"[2, 2471, 18509, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[None, 0, 1, None, None, None, None, None, None, None, None, None, None, None, None, N...","[-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -...","[Peter, Blackburn]",Peter Blackburn
2,"[2, 16054, 10456, 133, 12069, 133, 2097, 17, 9396, 17, 1529, 3, 0, 0, 0, 0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[None, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, None, None, None, None, None, None, None, None, N...","[-100, 5, -100, -100, -100, -100, 0, -100, -100, -100, -100, -100, -100, -100, -100, -...","[BRUSSELS, 1996-08-22]",BRUSSELS 1996 - 08 - 22
3,"[2, 199, 1731, 2686, 1377, 208, 14956, 335, 14230, 165, 226, 940, 9492, 184, 14520, 18...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, 16, 16, 17, 18, 19...","[-100, 0, 3, 4, 0, 0, 0, 0, 0, -100, 0, 7, 0, 0, 0, 0, 0, -100, 7, 0, -100, 0, 0, 0, 0...","[The, European, Commission, said, on, Thursday, it, disagreed, with, German, advice, t...",The European Commission said on Thursday it disagreed with German advice to consumers ...
4,"[2, 2020, 11, 87, 6087, 184, 162, 1731, 1913, 11, 87, 7326, 3445, 4982, 16738, 62, 141...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9, 10, 11, 11, 11, 12, 13, 14, 15, 16, 17, ...","[-100, 5, 0, -100, 0, 0, 0, 3, 4, 0, -100, 0, -100, 0, 1, 2, -100, -100, 0, 0, 0, 0, 0...","[Germany, 's, representative, to, the, European, Union, 's, veterinary, committee, Wer...",Germany's representative to the European Union's veterinary committee Werner Zwingmann...


Unnamed: 0,input_ids,token_type_ids,attention_mask,word_ids,labels,words,text
0,"[2, 12107, 6485, 147, 9111, 17, 48, 127, 6485, 5356, 8277, 8785, 137, 17436, 127, 1564...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[None, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8...","[-100, 0, -100, -100, -100, 0, 3, -100, -100, -100, -100, -100, -100, -100, -100, 0, -...","[CRICKET, -, LEICESTERSHIRE, TAKE, OVER, AT, TOP, AFTER, INNINGS, VICTORY, .]",CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY.
1,"[2, 48, 10747, 151, 10747, 2097, 17, 9396, 17, 1224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[None, 0, 0, 0, 0, 1, 1, 1, 1, 1, None, None, None, None, None, None, None, None, None...","[-100, 5, -100, -100, -100, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -...","[LONDON, 1996-08-30]",LONDON 1996 - 08 - 30
2,"[2, 1185, 1581, 475, 17, 1651, 161, 1479, 18683, 1245, 794, 204, 3938, 208, 9469, 216,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[None, 0, 1, 2, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, ...","[-100, 7, 8, 0, -100, -100, -100, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, -100, 0, 3, 0, 0, 0, 0...","[West, Indian, all-rounder, Phil, Simmons, took, four, for, 38, on, Friday, as, Leices...",West Indian all - rounder Phil Simmons took four for 38 on Friday as Leicestershire be...
3,"[2, 3742, 6056, 208, 1438, 16, 1608, 16, 977, 235, 1483, 17, 2959, 216, 1614, 12720, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0...","[Their, stay, on, top, ,, though, ,, may, be, short-lived, as, title, rivals, Essex, ,...","Their stay on top, though, may be short - lived as title rivals Essex, Derbyshire and ..."
4,"[2, 845, 17067, 11769, 522, 204, 7395, 208, 162, 3354, 5985, 237, 10007, 2041, 16, 149...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, 16, 17, 18, 19, 20, 2...","[-100, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 3, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[After, bowling, Somerset, out, for, 83, on, the, opening, morning, at, Grace, Road, ,...","After bowling Somerset out for 83 on the opening morning at Grace Road, Leicestershire..."


# NER finetuning

In [7]:
# For evaluation
import numpy as np

# Seed NumPy's global RNG so the random evaluation samples drawn below are repeatable.
np.random.seed(0)

# Split used for evaluation: read as a global by eval_random_samps and passed to the Trainer.
eval_dataset = val_dataset

def get_entities(pred, sample):
    """Collect the non-zero class predictions of one sample, grouped by word.

    Args:
        pred: logits tensor for a single sample; the class axis is last and any
            leading axes are flattened away after the argmax.
        sample: processed sample providing ``attention_mask``, ``word_ids`` and ``words``.

    Returns:
        OrderedDict mapping each word's text to the list of non-zero class ids
        predicted on its sub-tokens, in order of first appearance. Positions whose
        word id is not an index into ``sample["words"]`` are filed under ``"[INV]"``.
        Identical words share a single entry.
    """
    # Real tokens sit between CLS and SEP, i.e. at positions 1 .. n_content.
    n_content = sum(sample["attention_mask"]) - 2   # subtract cls / sep tokens
    content = slice(1, n_content + 1)

    class_ids = pred.argmax(dim=-1).reshape(-1)[content].tolist()
    word_ids = sample["word_ids"][content]
    word_at = dict(enumerate(sample["words"]))

    entities = OrderedDict()
    for pos, class_id in enumerate(class_ids):
        if class_id == 0:
            # class 0 is treated as "no entity" and skipped
            continue
        word = word_at.get(word_ids[pos], "[INV]")
        entities.setdefault(word, []).append(class_id)

    return entities

def eval_random_samps(eval_preds, n_samps=n_train_eval_samps):
    """Print the predicted entities for a few random evaluation samples.

    Passed to the Trainer as ``compute_metrics``. It draws ``n_samps`` indices from
    NumPy's global RNG, looks the samples up in the global ``eval_dataset`` and
    pretty-prints each sample's text followed by its predicted entities.

    Args:
        eval_preds: object whose ``predictions`` attribute holds per-sample logits
            as a NumPy array.
        n_samps: number of samples to print.

    Returns:
        An empty dict, so no extra metrics are reported.
    """
    logits = eval_preds.predictions

    print("\nEVALUATING ON RANDOM SAMPLES:\n")
    chosen = np.random.permutation(len(logits))[:n_samps]
    for i in map(int, chosen):
        sample = eval_dataset[i]
        pp.pprint(sample["text"])
        pp.pprint(get_entities(torch.from_numpy(logits[i]), sample))
        print()

    return {}

In [8]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments, TrainerCallback

# Load the pretrained checkpoint from the declared `model_dir` parameter (the same
# location that was previously re-built by hand here), sizing the token-classification
# head to the dataset's tag set.
model = BertForTokenClassification.from_pretrained(model_dir, num_labels=num_classes)

# Checkpoints and the final model are written under ner_dir; the remaining options
# come from the `training_args` dict defined in the parameters cell.
train_args = TrainingArguments(output_dir = ner_dir,
                               overwrite_output_dir = True,
                               **training_args)

# eval_random_samps prints sample predictions at every evaluation and returns no
# metrics of its own.
trainer = Trainer(model = model,
                  args = train_args,
                  compute_metrics = eval_random_samps,
                  train_dataset = train_dataset,
                  eval_dataset = eval_dataset)

Some weights of the model checkpoint at ../_data/pretrain/model were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../_data/pretrain/model and are n

In [9]:
# Run fine-tuning, then write the trainer's current model to ner_dir.
# (training_args sets load_best_model_at_end=True, so this is presumably the best
# checkpoint rather than the last epoch.)
trainer.train()
trainer.save_model(ner_dir)

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, word_ids, words. If text, word_ids, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14041
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1100


Epoch,Training Loss,Validation Loss
1,0.1323,0.075059
2,0.036,0.061989
3,0.0156,0.064368
4,0.0079,0.06631
5,0.0043,0.066182


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, word_ids, words. If text, word_ids, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8



EVALUATING ON RANDOM SAMPLES:

'World Group II, first round ( March 1 - 2 )'
OrderedDict([('World', [3]), ('Group', [8]), ('II', [8])])

'M. Maynard run out 1'
OrderedDict([('M.', [1, 1]), ('Maynard', [2, 2, 2])])

'Manchester City 3 1 0 2 2 3 3'
OrderedDict([('Manchester', [3]), ('City', [4])])

'NFL AMERICAN FOOTBALL - RANDALL CUNNINGHAM RETIRES.'
OrderedDict()

("The detention of veteran dissident Wang Donghai showed China's determination to crush any vestige of dissent during "
 "the current profound transitions in the nation's leadership, a human rights activist said on Saturday.")
OrderedDict([('Wang', [1]), ('Donghai', [2, 2]), ('China', [5])])



Saving model checkpoint to ../_data/ner/checkpoint-220
Configuration saved in ../_data/ner/checkpoint-220/config.json
Model weights saved in ../_data/ner/checkpoint-220/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, word_ids, words. If text, word_ids, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8



EVALUATING ON RANDOM SAMPLES:

'Playing Sunday : Queens Park Rangers v Bolton'
OrderedDict([('Queens', [3]), ('Park', [4]), ('Rangers', [4]), ('Bolton', [3])])

('Federal Reserve governor Lawrence Lindsey, speaking on U. S. cable television network CNBC, said the U. S. economy '
 'appears on balance to be a bit strong, adding the central bank would not curb growth provided inflation remains in '
 'check.')
OrderedDict([ ('Federal', [3]),
              ('Reserve', [4]),
              ('Lawrence', [1]),
              ('Lindsey', [2, 2]),
              ('U.S.', [5, 6, 5, 6]),
              ('CNBC', [3, 6])])

'Obilic 4 4 0 0 10 1 12'
OrderedDict([('Obilic', [3, 4, 4])])

('" The film spares neither the Irish nor the British in its depiction of the savagery of the time, " Jordan said in a '
 'statement released by Warner Bros. "')
OrderedDict([('Irish', [7]), ('British', [7]), ('Jordan', [5]), ('Warner', [3]), ('Bros', [6])])

('" The committee will meet following a complaint by Israel ov

Saving model checkpoint to ../_data/ner/checkpoint-440
Configuration saved in ../_data/ner/checkpoint-440/config.json
Model weights saved in ../_data/ner/checkpoint-440/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, word_ids, words. If text, word_ids, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8



EVALUATING ON RANDOM SAMPLES:

'1822 - Brazil proclaimed independence from Portugal and Pedro I became first Emperor of Brazil in December 1822.'
OrderedDict([('Brazil', [5, 6]), ('Portugal', [5]), ('Pedro', [1])])

'M. Maynard run out 1'
OrderedDict([('M.', [1, 1]), ('Maynard', [2, 2])])

'" Collins would never be a proponent of contemporary terrorism as practised today.'
OrderedDict([('Collins', [1])])

('" We think that the Afrikaner model within this new, multi - ethnic society of South Africa will have to develop '
 'experimentally with world thinking in this regard. "')
OrderedDict([('Afrikaner', [7, 8, 8]), ('South', [5]), ('Africa', [6])])

'LONDON 1996 - 08 - 31'
OrderedDict([('LONDON', [5, 6, 6, 6])])



Saving model checkpoint to ../_data/ner/checkpoint-660
Configuration saved in ../_data/ner/checkpoint-660/config.json
Model weights saved in ../_data/ner/checkpoint-660/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, word_ids, words. If text, word_ids, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8



EVALUATING ON RANDOM SAMPLES:

'He declined to give any further information.'
OrderedDict()

'Oldham 3 Ipswich 3'
OrderedDict([('Oldham', [3, 4]), ('Ipswich', [3])])

'I think I played really bad.'
OrderedDict()

'CHISINAU, Moldova 1996 - 08 - 31'
OrderedDict([('CHISINAU', [5, 6, 6, 6, 6]), ('Moldova', [5, 6])])

'SOCCER - ENGLISH LEAGUE STANDINGS.'
OrderedDict([('ENGLISH', [7]), ('STANDINGS', [8])])



Saving model checkpoint to ../_data/ner/checkpoint-880
Configuration saved in ../_data/ner/checkpoint-880/config.json
Model weights saved in ../_data/ner/checkpoint-880/pytorch_model.bin
Deleting older checkpoint [../_data/ner/checkpoint-220] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: text, word_ids, words. If text, word_ids, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8



EVALUATING ON RANDOM SAMPLES:

"Magna's traditional strength has been instrument panels, door panels and other interior components."
OrderedDict([('Magna', [3])])

'No trace of two missing teenagers in Belgium.'
OrderedDict([('Belgium', [5])])

('" He developed techniques of guerilla warfare later copied by independence movements around the world, from Mao Tse '
 '- Tung in China to Yitzak Shamir in Israel, " Jordan said.')
OrderedDict([ ('Mao', [1]),
              ('Tse-Tung', [2, 2, 2, 2, 2]),
              ('China', [5]),
              ('Yitzak', [7, 2, 2]),
              ('Shamir', [2, 4]),
              ('Israel', [5]),
              ('Jordan', [5])])

('Italian Foreign Minister Lamberto Dini on Saturday met former Tanzanian president Julius Nyerere, the international '
 'negotiator for Burundi, the ministry said.')
OrderedDict([ ('Italian', [7]),
              ('Lamberto', [1, 2]),
              ('Dini', [2, 2]),
              ('Tanzanian', [7]),
              ('Julius', [1]),
 

Saving model checkpoint to ../_data/ner/checkpoint-1100
Configuration saved in ../_data/ner/checkpoint-1100/config.json
Model weights saved in ../_data/ner/checkpoint-1100/pytorch_model.bin
Deleting older checkpoint [../_data/ner/checkpoint-660] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ../_data/ner/checkpoint-440 (score: 0.061989229172468185).
Saving model checkpoint to ../_data/ner
Configuration saved in ../_data/ner/config.json
Model weights saved in ../_data/ner/pytorch_model.bin


# Evaluation

In [13]:
# Re-seed so the final evaluation always draws the same validation examples.
np.random.seed(0)

# Pick n_final_eval_samps random validation rows and materialise them as a list of dicts.
samples = [val_dataset[int(i)]
           for i in np.random.permutation(len(val_dataset))[:n_final_eval_samps]]

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"

def to_tensor(sample):
    """Wrap one flat list of ints as a (1, len) tensor on `device`."""
    return torch.tensor(sample).view(1,-1).to(device)

# Make the inference setup explicit instead of relying on whatever state the Trainer
# left behind: model on the same device as the inputs, dropout disabled.
model.to(device)
model.eval()

# No gradients are needed for prediction; skipping autograd keeps the logits detached
# and avoids holding one computation graph per sample.
with torch.no_grad():
    preds = [model(input_ids=to_tensor(sample["input_ids"]),
                   attention_mask=to_tensor(sample["attention_mask"]),
                   token_type_ids=to_tensor(sample["token_type_ids"])
                   ).logits.cpu() for sample in samples]

# Per-sample mapping of word -> predicted non-zero class ids.
pred_ents = [get_entities(pred, sample) for pred, sample in zip(preds, samples)]

In [15]:
# Show each selected sample's decoded text followed by its predicted entities,
# with a blank line between samples.
for sample, pred in zip(samples, pred_ents):
    pp.pprint(sample["text"])
    pp.pprint(pred)
    print()

('Derbyshire, nine - wicket winners over Worcestershire, and Surrey, who thrashed Warwickshire by an innings and 164 '
 'runs, can instead take the day off along with rivals Leicestershire, who beat Somerset inside two days.')
OrderedDict([ ('Derbyshire', [3]),
              ('Worcestershire', [3, 6]),
              ('Surrey', [3]),
              ('Warwickshire', [3, 4]),
              ('Leicestershire', [3, 4]),
              ('Somerset', [3])])

'Fulham 4 3 0 1 5 3 9'
OrderedDict([('Fulham', [3, 4])])

'Mahala is a Moslem village on Bosnian Serb republic territory.'
OrderedDict([('Mahala', [5, 5]), ('Moslem', [7, 8]), ('Bosnian', [7, 8]), ('Serb', [8])])

('Nyerere arrived in Rome this week on a private visit and held talks with the U. S. special envoy to Burundi, Howard '
 "Wolpe, and the Sant'Egidio Community, an Italian Roman Catholic organisation which has been monitoring Burundi "
 'closely.')
OrderedDict([ ('Nyerere', [1, 4]),
              ('Rome', [5]),
              ('U.S.',