In [1]:
def get_index(cur_entry, tokens):
    """Estimate the character offset at which a predicted token starts in the text.

    The offset is rebuilt from the token strings alone, assuming the text puts
    exactly one space before every token that starts with a letter and no space
    before anything else (WordPiece continuations such as ``##p``, punctuation,
    digits). Text that does not follow this layout yields wrong offsets.

    Args:
        cur_entry: prediction dict; reads ``'index'`` (1-based position of the
            token, because position 0 is taken by ``[CLS]``) and ``'word'``.
        tokens: token strings of the sentence without the special tokens.

    Returns:
        int: 0-based character offset of the first character of the token.
    """
    def surface_len(tok):
        # Only a leading '##' is WordPiece markup; a literal '#' in the text
        # occupies a character and must be counted.
        return len(tok) - 2 if tok.startswith('##') else len(tok)

    n_before = max(cur_entry['index'] - 1, 0)  # -1 because we start with [CLS]
    offset = 0
    for i, tok in enumerate(tokens[:n_before]):
        offset += surface_len(tok)
        # whitespace before every word (not subword or punctuation) except the first
        if i and tok[:1].isalpha():
            offset += 1

    # Whitespace before our token, unless it is a subword or the very first
    # token of the text (nothing precedes that one, so no space either).
    if n_before and cur_entry['word'][:1].isalpha():
        offset += 1
    return offset

def post_process(item):
    """Make sure a WordPiece continuation piece never opens a new entity.

    A piece whose ``'word'`` starts with the ``##`` continuation marker belongs
    to the same word as the piece before it, so a ``B-`` tag predicted on it is
    rewritten to the matching ``I-`` tag. This is a heuristic patch-up of the
    model output, not a proper word-level aggregation.

    A lone ``#`` is a real character of the text, not a continuation, and is
    left alone; tags that are not ``B-…`` (``I-…``, ``O``) are left unchanged.

    Args:
        item: prediction dict with at least ``'word'`` and ``'entity'``.

    Returns:
        The same dict object, modified in place.
    """
    is_continuation = item['word'].startswith('##')
    if is_continuation and item['entity'].startswith('B-'):
        item['entity'] = 'I' + item['entity'][1:]
    return item

def insert_index(item, tokens):
    """Write estimated character offsets into a prediction dict.

    ``item['start']`` is set to the offset returned by ``get_index`` and
    ``item['end']`` to that offset plus the length of ``item['word']`` with
    every ``#`` removed. Any existing ``'start'``/``'end'`` values are
    overwritten; the dict is modified in place and also returned.
    """
    begin = get_index(item, tokens)
    item['start'] = begin
    visible_chars = item['word'].replace('#', '')
    item['end'] = begin + len(visible_chars)
    return item

In [2]:
def is_first_word(token):
    """Return True when ``token`` has at least two characters and starts with an uppercase one."""
    if len(token) < 2:
        return False
    return token[0].isupper()

In [3]:
import string
def bio_tokens_to_text(bio_tokens):
    """Join word-level tokens back into a single text string.

    Joining rules:
      * a punctuation token is glued to whatever precedes it ("Hoi" "," -> "Hoi,");
      * a token that follows punctuation is glued as well, unless
        ``is_first_word`` says it starts a new capitalised word
        ("P" "." "C" "." "Hooft" -> "P.C. Hooft");
      * every other token is preceded by a single space.

    A token counts as punctuation when it is non-empty and consists only of
    characters from ``string.punctuation`` (so "?!" or "..." qualify too).
    Empty tokens are skipped, and the result never starts with a space.

    Args:
        bio_tokens: iterable of token strings.

    Returns:
        str: the reconstructed text ('' for no tokens).
    """
    punctuation = set(string.punctuation)
    pieces = []
    prev_is_punct = False
    for token in bio_tokens:
        if not token:
            continue  # nothing to emit, and it must not influence the next token
        token_is_punct = all(ch in punctuation for ch in token)
        if token_is_punct:  # Hoi,
            glue = True
        else:  # P.C. Hooft
            glue = prev_is_punct and not is_first_word(token)
        # The first emitted token never gets a separator in front of it.
        if pieces and not glue:
            pieces.append(' ')
        pieces.append(token)
        prev_is_punct = token_is_punct
    return ''.join(pieces)

In [4]:
'0' in string.punctuation

False

In [5]:
import spacy
from spacy import displacy
import seaborn as sns

def _merge_bio_predictions(preds):
    """Fold token-level B-/I- predictions into (start, end, label) character spans."""
    merged = []
    label, start, end = None, 0, 0
    for pred in preds:
        tag = pred['entity']
        tag_label = tag[2:]
        if not tag_label:
            # A tag without a type (e.g. a stray 'O') only closes the open entity.
            if label is not None:
                merged.append((start, end, label))
                label = None
            continue
        # A new entity starts on 'B', on a type change, or when an 'I' arrives
        # with no entity open (the model failed to predict the 'B').
        if tag.startswith('B') or tag_label != label:
            if label is not None:
                merged.append((start, end, label))
            label, start = tag_label, pred['start']
        end = pred['end']
    if label is not None:
        merged.append((start, end, label))
    return merged


def visualise(text, preds):
    """Render the predicted entities of ``text`` inline with displaCy.

    Args:
        text: the string the predictions refer to.
        preds: token-level predictions in text order; each is a dict with
            ``'entity'`` (a ``B-XXX``/``I-XXX`` tag) and ``'start'``/``'end'``
            character offsets into ``text``.

    Returns:
        None. The visualisation is displayed in the notebook
        (``displacy.render`` is called with ``jupyter=True``).
    """
    ## Step 1: adding entities
    blank_nlp = spacy.blank("nl")  # it should work with any language
    doc = blank_nlp(text + ' ')  # a hack (trailing space kept from the original code)

    labels = []  # entity types in order of first appearance
    spans = []
    for start, end, label in _merge_bio_predictions(preds):
        if label not in labels:
            labels.append(label)
        # 'expand' snaps the character range outwards to whole spaCy tokens.
        span = doc.char_span(start, end, label=label, alignment_mode='expand')
        if span:
            spans.append(span)

    # Expanded spans can end up sharing a token; doc.ents rejects overlaps,
    # so keep a non-overlapping subset.
    doc.ents = spacy.util.filter_spans(spans)

    ## Step 2: visualising
    colours = sns.color_palette("Set2", len(labels)).as_hex()
    options = {"ents": labels,
               "colors": dict(zip(labels, colours))}

    displacy.render(doc, style="ent", options=options, jupyter=True)

In [6]:
# Display name -> checkpoint location: three checkpoints under one local
# directory (all following the "<name>-NER-v2" pattern) and one referenced by id.
_LOCAL_MODELS_DIR = "/ivi/ilps/personal/vprovat/KB/models"
model_names = {
    name: f"{_LOCAL_MODELS_DIR}/{name}-NER-v2"
    for name in ('GysBERT', 'BERTje', 'BERT-multi-cased')
}
model_names['WikiNEuRal'] = "Babelscape/wikineural-multilingual-ner"

In [7]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification


# Wrap the BERTje checkpoint in a token-classification ("ner") pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_names['BERTje'])
model = AutoModelForTokenClassification.from_pretrained(model_names['BERTje'])
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

example = "Ik ben Jip de Kip, en ik woon in Zandvoort aan Zee"

# Tag the sentence, then rewrite the tags of subword pieces via post_process.
ner_results = nlp(example)
res = list(map(post_process, ner_results))

# Token strings without the first and last entries of the encoded sequence;
# insert_index uses them to (re)compute each prediction's character offsets.
tokens = [tokenizer.decode(tok) for tok in tokenizer(example).input_ids[1:-1]]
res_for_visualisation = [insert_index(item, tokens) for item in res]
visualise(example, res_for_visualisation)

In [5]:
from data_utils import prepare_data, convert_to_dataset

# Directory holding the train/validation/test files.
DATA_DIR = '/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/'

tests = 'test_NHA.txt  test_RHC.txt  test_SA.txt  test_VOC.txt'.split()
# train and val are redundant but we need the labels, sooo
train = prepare_data(DATA_DIR + 'train.txt')
val = prepare_data(DATA_DIR + 'validation.txt')

# Each test file is parsed exactly once (the original cell parsed them twice).
tests_prepared = [prepare_data(DATA_DIR + test) for test in tests]

# Label inventory: the second field of every non-empty token entry in train,
# sorted so that the label -> id assignment is deterministic.
label_list = sorted({token_data[1] for sentence in train for token_data in sentence if token_data})
label_map = {label: i for i, label in enumerate(label_list)}
label2id = dict(label_map)  # same mapping under a second name, as before

test_data = [convert_to_dataset(test, label_map)
             for test in tests_prepared]

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

In [7]:
# First example of the second test set in `tests`.
test_example = test_data[1][0]

# The example is already split into tokens, hence is_split_into_words=True;
# return_tensors='pt' yields tensors with a leading batch dimension.
tokenized_inputs = tokenizer(
    test_example["tokens"],
    truncation=True,
    is_split_into_words=True,
    padding=True,
    return_tensors='pt',
)

# Raw model output for the single encoded example (only input_ids are passed).
preds = model(tokenized_inputs['input_ids'])

In [20]:
# Rebuild a plain-text string from the example's token list; it is fed to the
# NER pipeline in a later cell.
example = bio_tokens_to_text(test_example["tokens"])

In [22]:
example

'R o Caze 3.2.17 ontvangen met de verhoging 17.02 agt guldens,twee Stuivers 07 8:2:2:en twaalf penningen. De Swart dens boven de Lusten Verzoeke van de H.r D Surangar nog een Gezegelde Kwitie by het Mandaat,wegens betaalde Staats bladen,staats Couranten Brieveport Eta in dato 13e feb:1815 ad \x8370 8. En dito van 31 Maart 1815 wegens de Bydragen ad 5 8. Gedaan en gepasseerd voor den voornoemden Notaris Mr.D. Suringar op den drie en Twintigsten December achttien Hondert veertien te Duurswold ten huize van Jan Binser Beets ma in tegenswoordigheid van de Heer Bernardus Brunes ma predikant te Wytyctem en Eeuwe Lieuwer van Leeuwe deurwaarder te Beetsterzwaag getuigen welke deeze benevens de Comparanten de weer schout deeze gemeente en den Notaris onder wien deeze verbleven is hebben getekend B: Brandsma derde en laatste Ten tweeden, Ida Alyda Berns,meerder jarig;ten derden, Hendrika Berns,meerderjarig,de twee laatstgenoemden zonder Beroep woonende te Huissen, Dochters van genoemden Peter Be

In [26]:
# Same tag -> post-process -> re-index -> render sequence as for the toy
# sentence, now applied to the text rebuilt from the test example.
ner_results = nlp(example)
res = list(map(post_process, ner_results))
tokens = [tokenizer.decode(tok) for tok in tokenizer(example).input_ids[1:-1]]
res_for_visualisation = [insert_index(item, tokens) for item in res]
visualise(example, res_for_visualisation)

In [27]:
ner_results

[{'entity': 'I-LOC',
  'score': 0.52973866,
  'index': 48,
  'word': 'Lust',
  'start': 126,
  'end': 130},
 {'entity': 'B-PER',
  'score': 0.6601863,
  'index': 58,
  'word': 'D',
  'start': 154,
  'end': 155},
 {'entity': 'I-PER',
  'score': 0.8020663,
  'index': 59,
  'word': 'Sur',
  'start': 156,
  'end': 159},
 {'entity': 'I-PER',
  'score': 0.48470655,
  'index': 60,
  'word': '##anga',
  'start': 159,
  'end': 163},
 {'entity': 'I-PER',
  'score': 0.57316226,
  'index': 61,
  'word': '##r',
  'start': 163,
  'end': 164},
 {'entity': 'B-TIME',
  'score': 0.9579203,
  'index': 90,
  'word': '13',
  'start': 276,
  'end': 278},
 {'entity': 'I-TIME',
  'score': 0.9172103,
  'index': 91,
  'word': '##e',
  'start': 278,
  'end': 279},
 {'entity': 'I-TIME',
  'score': 0.9334394,
  'index': 92,
  'word': 'f',
  'start': 280,
  'end': 281},
 {'entity': 'I-TIME',
  'score': 0.90537804,
  'index': 93,
  'word': '##e',
  'start': 281,
  'end': 282},
 {'entity': 'I-TIME',
  'score': 0.9261

In [28]:
res_for_visualisation

[{'entity': 'I-LOC',
  'score': 0.52973866,
  'index': 48,
  'word': 'Lust',
  'start': 126,
  'end': 130},
 {'entity': 'B-PER',
  'score': 0.6601863,
  'index': 58,
  'word': 'D',
  'start': 154,
  'end': 155},
 {'entity': 'I-PER',
  'score': 0.8020663,
  'index': 59,
  'word': 'Sur',
  'start': 156,
  'end': 159},
 {'entity': 'I-PER',
  'score': 0.48470655,
  'index': 60,
  'word': '##anga',
  'start': 159,
  'end': 163},
 {'entity': 'I-PER',
  'score': 0.57316226,
  'index': 61,
  'word': '##r',
  'start': 163,
  'end': 164},
 {'entity': 'B-TIME',
  'score': 0.9579203,
  'index': 90,
  'word': '13',
  'start': 276,
  'end': 278},
 {'entity': 'I-TIME',
  'score': 0.9172103,
  'index': 91,
  'word': '##e',
  'start': 278,
  'end': 279},
 {'entity': 'I-TIME',
  'score': 0.9334394,
  'index': 92,
  'word': 'f',
  'start': 280,
  'end': 281},
 {'entity': 'I-TIME',
  'score': 0.90537804,
  'index': 93,
  'word': '##e',
  'start': 281,
  'end': 282},
 {'entity': 'I-TIME',
  'score': 0.9261

In [9]:
# Decode every id of the first sequence in the batch back to a string and drop
# the first and last entries (presumably the [CLS]/[SEP] special tokens — confirm).
tokens = [tokenizer.decode(tok) for tok in tokenized_inputs['input_ids'][0]][1:-1]

['R',
 'o',
 'Ca',
 '##ze',
 '3',
 '.',
 '2',
 '.',
 '17',
 'ontvangen',
 'met',
 'de',
 'verhoging',
 '17',
 '.',
 '02',
 'a',
 '##g',
 '##t',
 'gulden',
 '##s',
 ',',
 'twee',
 'Stu',
 '##ive',
 '##r',
 '##s',
 '07',
 '8',
 ':',
 '2',
 ':',
 '2',
 ':',
 'en',
 'twaalf',
 'pe',
 '##nning',
 '##en',
 '.',
 'De',
 'Swart',
 'de',
 '##n',
 '##s',
 'boven',
 'de',
 'Lust',
 '##en',
 'Ver',
 '##zoek',
 '##e',
 'van',
 'de',
 'H',
 '.',
 'r',
 'D',
 'Sur',
 '##anga',
 '##r',
 'nog',
 'een',
 'Geze',
 '##geld',
 '##e',
 'Kwi',
 '##tie',
 'by',
 'het',
 'Man',
 '##da',
 '##at',
 ',',
 'wegens',
 'betaalde',
 'Staats',
 'bladen',
 ',',
 'staats',
 'Courant',
 '##en',
 'Bri',
 '##eve',
 '##port',
 'Eta',
 'in',
 'dat',
 '##o',
 '13',
 '##e',
 'f',
 '##e',
 '##b',
 ':',
 '18',
 '##15',
 'a',
 '##d',
 '70',
 '8',
 '.',
 'En',
 'dit',
 '##o',
 'van',
 '31',
 'Maar',
 '##t',
 '18',
 '##15',
 'wegens',
 'de',
 'By',
 '##dragen',
 'a',
 '##d',
 '5',
 '8',
 '.',
 'Gedaan',
 'en',
 'gepasseerd',
 'voor

In [45]:
# Predicted label id for every position of the first sequence in the batch.
# The tensor's own argmax is used so this cell does not depend on numpy, which
# is only imported in a later cell of the notebook; .tolist() gives plain ints.
preds_clean = preds.logits[0].argmax(dim=-1).tolist()

In [49]:
# Tags stored with the example — presumably the gold label ids produced by
# convert_to_dataset; confirm against data_utils.
labels = test_example['ner_tags']

In [43]:
np.argmax(preds.logits[0][0].detach().numpy())

6

In [41]:
tokenized_inputs[0][0]

tensor(1)

In [38]:
len(tokenized_inputs[0])

394

In [36]:
len(preds.logits[0])

394

In [None]:
preds

In [30]:
import numpy as np
# NOTE(review): this rebinds `preds` (the model output that other cells read
# through `preds.logits`) to the result of np.argmax on the whole output
# object; the recorded output of the following cell is 0. Presumably an argmax
# over `preds.logits`, stored under a new name, was intended — confirm before
# re-running, since every later use of `preds.logits` breaks after this cell.
preds = np.argmax(preds)

In [31]:
preds

0

In [18]:
tests_prepared[1][0]

[['R', 'O'],
 ['o', 'O'],
 ['Caze', 'O'],
 ['3', 'O'],
 ['.', 'O'],
 ['2', 'O'],
 ['.', 'O'],
 ['17', 'O'],
 ['ontvangen', 'O'],
 ['met', 'O'],
 ['de', 'O'],
 ['verhoging', 'O'],
 ['17', 'O'],
 ['.', 'O'],
 ['02', 'O'],
 ['agt', 'O'],
 ['guldens', 'O'],
 [',', 'O'],
 ['twee', 'O'],
 ['Stuivers', 'O'],
 ['07', 'O'],
 ['8', 'O'],
 [':', 'O'],
 ['2', 'O'],
 [':', 'O'],
 ['2', 'O'],
 [':', 'O'],
 ['en', 'O'],
 ['twaalf', 'O'],
 ['penningen', 'O'],
 ['.', 'O'],
 ['De', 'B-PER'],
 ['Swart', 'I-PER'],
 ['dens', 'O'],
 ['boven', 'O'],
 ['de', 'O'],
 ['Lusten', 'O'],
 ['Verzoeke', 'O'],
 ['van', 'O'],
 ['de', 'O'],
 ['H', 'O'],
 ['.', 'O'],
 ['r', 'O'],
 ['D', 'B-PER'],
 ['Surangar', 'I-PER'],
 ['nog', 'O'],
 ['een', 'O'],
 ['Gezegelde', 'O'],
 ['Kwitie', 'O'],
 ['by', 'O'],
 ['het', 'O'],
 ['Mandaat', 'O'],
 [',', 'O'],
 ['wegens', 'O'],
 ['betaalde', 'O'],
 ['Staats', 'O'],
 ['bladen', 'O'],
 [',', 'O'],
 ['staats', 'O'],
 ['Couranten', 'O'],
 ['Brieveport', 'O'],
 ['Eta', 'O'],
 ['in', 'O'],

In [283]:
res_for_visualisation

[{'entity': 'B-PER',
  'score': 0.97226703,
  'index': 3,
  'word': 'Ji',
  'start': 7,
  'end': 9},
 {'entity': 'I-PER',
  'score': 0.88010025,
  'index': 4,
  'word': '##p',
  'start': 9,
  'end': 10},
 {'entity': 'I-PER',
  'score': 0.9402657,
  'index': 5,
  'word': 'de',
  'start': 11,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.9643208,
  'index': 6,
  'word': 'Kip',
  'start': 14,
  'end': 17},
 {'entity': 'B-LOC',
  'score': 0.9677931,
  'index': 12,
  'word': 'De',
  'start': 33,
  'end': 35},
 {'entity': 'I-LOC',
  'score': 0.5306217,
  'index': 13,
  'word': '##n',
  'start': 35,
  'end': 36},
 {'entity': 'B-LOC',
  'score': 0.6800242,
  'index': 14,
  'word': 'Haag',
  'start': 37,
  'end': 41}]

In [269]:
example

'Ik ben Jip de Kip, en ik woon in Zandvoort aan Zee'

In [270]:
# Sanity check of get_index: report every prediction whose estimated offset
# does not point at the first visible character of its word in `example`.
# (A comparison of the whole start:end slice was sketched here but left disabled.)
for item in res_for_visualisation:
    pos = get_index(item, tokens)
    expected_char = item['word'].replace('#','')[0]
    if example[pos] != expected_char:
        print(pos, example[pos], '!=', expected_char)

Ik
Current start:  2
ben
Current start:  6
Ik
Current start:  2
ben
Current start:  6
Ji
Current start:  9
Ik
Current start:  2
ben
Current start:  6
Ji
Current start:  9
##p
Current start:  10
Ik
Current start:  2
ben
Current start:  6
Ji
Current start:  9
##p
Current start:  10
de
Current start:  13
Ik
Current start:  2
ben
Current start:  6
Ji
Current start:  9
##p
Current start:  10
de
Current start:  13
Kip
Current start:  17
,
Current start:  18
en
Current start:  21
ik
Current start:  24
woon
Current start:  29
in
Current start:  32
Ik
Current start:  2
ben
Current start:  6
Ji
Current start:  9
##p
Current start:  10
de
Current start:  13
Kip
Current start:  17
,
Current start:  18
en
Current start:  21
ik
Current start:  24
woon
Current start:  29
in
Current start:  32
Zandvoort
Current start:  42
Ik
Current start:  2
ben
Current start:  6
Ji
Current start:  9
##p
Current start:  10
de
Current start:  13
Kip
Current start:  17
,
Current start:  18
en
Current start:  21
ik
Curr

In [252]:
'I'.isalpha()

True

In [251]:
tokens

['Ik',
 'ben',
 'Ji',
 '##p',
 'de',
 'Kip',
 ',',
 'en',
 'ik',
 'woon',
 'in',
 'Zandvoort',
 'aan',
 'Ze',
 '##e']