In [8]:
tests = 'test_NHA.txt  test_RHC.txt  test_SA.txt  test_VOC.txt'.split()
tests

['test_NHA.txt', 'test_RHC.txt', 'test_SA.txt', 'test_VOC.txt']

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from data_utils import prepare_data, convert_to_dataset

# train and val are redundant but we need the labels, sooo
train = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt')
val = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt')

tests_prepared = [
     prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/'+test) for test in tests
]

label_list = sorted(list(set([token_data[1] for sentence in train for token_data in sentence if token_data])))
label_map = {label: i for i, label in enumerate(label_list)}
id2label_original = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

train_data = convert_to_dataset(train, label_map)
val_data = convert_to_dataset(val, label_map)
test_data = [convert_to_dataset(test, label_map)
             for test in tests_prepared]

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

In [13]:
dct = {
    "train": train_data,
    "validation": val_data}
for i, test in enumerate(tests):
    dct[test.split('.')[0]] = test_data[i]

datasets = DatasetDict(dct)

In [None]:
def get_wrong_predictions(eval_prediction): # experimental, will be improved
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)
    
    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label_original[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    res = []
    for i in range(len(true_predictions)):
        if true_predictions[i] != true_labels[i]:
            res.append((i, true_predictions[i],true_labels[i]))
    return res

In [2]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)
#     print(predictions) # todo remove

    # for evaluating wikineural, which has a different number of labels:
    # set the extra labels to 'O' and hope for the best
#     filtered_predictions = [
#         [p if p < len(label_list) else label2id['O'] for p in prediction]
#         for prediction in predictions
#     ]
    
#     predictions = filtered_predictions

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label_original[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # todo: remove debug output
#     print(true_predictions[0])
#     print(true_labels[0])

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True, return_tensors='pt'
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [3]:
model_names = {'GysBERT': "/ivi/ilps/personal/vprovat/KB/models/GysBERT-NER-v2",
              'BERTje': "/ivi/ilps/personal/vprovat/KB/models/BERTje-NER-v2",
              'BERT-multi-cased': "/ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2",
              'WikiNEuRal': "Babelscape/wikineural-multilingual-ner"}

In [15]:
def evaluate_with_trainer(model_name):
# if True:
#     model_name = model_names['BERTje']
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    global model
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
    
    global id2label
    id2label = model.config.id2label
    global label2id
    label2id = model.config.label2id
    
    trainer = Trainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    res = {}
    for test in tests:
        dataset_name = test.split('.')[0]
        preds = trainer.predict(tokenized_datasets[dataset_name])
        res[test] = preds.metrics
    return res

In [16]:
evaluate_with_trainer(model_names['BERT-multi-cased'])

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 5 2 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 ... 6 6 6]
 [6 6 1 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 1 4 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 2 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


{'test_NHA.txt': {'test_loss': 0.11342580616474152,
  'test_precision': 0.709470304975923,
  'test_recall': 0.6567607726597325,
  'test_f1': 0.6820987654320987,
  'test_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.80      0.74      0.77       236\n         PER       0.77      0.72      0.74       336\n        TIME       0.29      0.26      0.27       101\n\n   micro avg       0.71      0.66      0.68       673\n   macro avg       0.62      0.57      0.59       673\nweighted avg       0.71      0.66      0.68       673\n',
  'test_runtime': 1.7023,
  'test_samples_per_second': 29.959,
  'test_steps_per_second': 4.112},
 'test_RHC.txt': {'test_loss': 0.3024003505706787,
  'test_precision': 0.8095238095238095,
  'test_recall': 0.7555555555555555,
  'test_f1': 0.7816091954022989,
  'test_classification_report': '              precision    recall  f1-score   support\n\n         LOC       1.00      0.82      0.90        17\n         P

In [6]:
evaluate_with_trainer(model_names['WikiNEuRal'])

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[0 5 0 ... 0 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 11.0033540725708,
 'test_precision': 0.2912371134020619,
 'test_recall': 0.29161290322580646,
 'test_f1': 0.29142488716956805,
 'test_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.48      0.47      0.48       312\n        MISC       0.00      0.00      0.00         0\n         ORG       0.00      0.00      0.00         0\n         PER       0.35      0.28      0.31       283\n        TIME       0.00      0.00      0.00       180\n\n   micro avg       0.29      0.29      0.29       775\n   macro avg       0.17      0.15      0.16       775\nweighted avg       0.32      0.29      0.30       775\n',
 'test_runtime': 3.7133,
 'test_samples_per_second': 29.892,
 'test_steps_per_second': 3.77}

In [10]:
for name, path in model_names.items():
    print(name, path)

GysBERT /ivi/ilps/personal/vprovat/KB/models/GysBERT-NER-v2
BERTje /ivi/ilps/personal/vprovat/KB/models/BERTje-NER-v2
BERT-multi-cased /ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2
WikiNEuRal Babelscape/wikineural-multilingual-ner


In [19]:
test = '              precision    recall  f1-score   support\n\n         LOC       0.48      0.47      0.48       312\n        MISC       0.00      0.00      0.00         0\n         ORG       0.00      0.00      0.00         0\n         PER       0.35      0.28      0.31       283\n        TIME       0.00      0.00      0.00       180\n\n   micro avg       0.29      0.29      0.29       775\n   macro avg       0.17      0.15      0.16       775\nweighted avg       0.32      0.29      0.30       775\n'

In [26]:
for line in test.split('\n')[1:]:
#     if len(line.split()) < 2:
#         continue
#     print(line)
#     print(line.split('  '))
    row = [item for item in line.split('  ') if item]
    if not row:
        continue
    print(row)

[' LOC', ' 0.48', '0.47', '0.48', ' 312']
['MISC', ' 0.00', '0.00', '0.00', ' 0']
[' ORG', ' 0.00', '0.00', '0.00', ' 0']
[' PER', ' 0.35', '0.28', '0.31', ' 283']
['TIME', ' 0.00', '0.00', '0.00', ' 180']
[' micro avg', ' 0.29', '0.29', '0.29', ' 775']
[' macro avg', ' 0.17', '0.15', '0.16', ' 775']
['weighted avg', ' 0.32', '0.29', '0.30', ' 775']


In [18]:
dct.values()

dict_values([Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 11104
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 2761
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 51
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 114
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 111
})])

In [19]:
import pandas as pd
import io   

dfs = []
for name, path in model_names.items():
    res = evaluate_with_trainer(path)
    column_names = ['model_name','dataset', 'label','precision','recall','f1-score','support']
    df = pd.DataFrame(columns=column_names, dtype=object)
    for test in tests: 
        for line in res[test]['test_classification_report'].split('\n')[1:]: # the first one is '\n\n'
            row = [item for item in line.split('  ') if item]
            if not row:
                continue
            df.loc[len(df)] = [name, test]+row
#     df = pd.read_csv(io.StringIO(res['test_classification_report']), sep="\t")
#     df['model_name'] = name
    dfs.append(df)

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6
  6 6 2 5 5 5 5 6 6 6 6 6 6 6 2 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 4
  4 4 6 6 2 5 5 5 5 5 5 6 0 0 0 6 6 6 1 4 4 4 1 6 6 6 6 6 6 6 1 4 4 1 6 6
  0 0 0 0 6 0 4 4 1 4 4 4 6 6 0 3 0 0 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 1 4 4 4 6 6 6 6 6 6 1 4 1 4 4 6 4 6 6 6 6 6 6 1 4 6 4 6 6 4 6
  6 6 6 6 6 6 6 6 4 6 6 6 1 4 4 6 6 1 1 4 4 4 4 6 6 6 6 1 4 0 6 6 6 6 6 6
  0 0 4 6 6 6 6 6 6 4 6 6 6 1 4 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 4 0 6 6 2 5 5 5 5 5 5 5 6 4]
 [6 6 1 4 4 4 4 1 4 6 6 6 6 6 6 6 0 3 6 6 0 0 6 6 0 6 6 6 6 0 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 6 0 6 6 6 6 6 6 6 6 6 2 2 5 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 1 4 4 4 1 6 6 6 6 0 0 6 6 6 6 6 6 6 6 6 1 4 4 1 6 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 0 0 6 6 0 0 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 4 4 6 6 6 6 6 6 1

[[6 1 4 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 2 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 4 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 ... 5 5 6]
 [6 6 1 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 1 4 ... 6 6 6]
 [6 6 6 ... 4 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 4 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 5 2 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 ... 6 6 6]
 [6 6 1 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 1 4 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 2 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 6]
 [0 0 0 ... 0 0 0]]


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 5 ... 0 0 0]]


[[0 5 0 ... 0 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
df_all = pd.concat(dfs)
df_all

Unnamed: 0,model_name,dataset,label,precision,recall,f1-score,support
0,GysBERT,test_NHA.txt,LOC,0.80,0.77,0.78,252
1,GysBERT,test_NHA.txt,PER,0.78,0.74,0.76,355
2,GysBERT,test_NHA.txt,TIME,0.32,0.27,0.29,109
3,GysBERT,test_NHA.txt,micro avg,0.73,0.68,0.70,716
4,GysBERT,test_NHA.txt,macro avg,0.63,0.59,0.61,716
...,...,...,...,...,...,...,...
26,WikiNEuRal,test_VOC.txt,PER,0.35,0.28,0.31,283
27,WikiNEuRal,test_VOC.txt,TIME,0.00,0.00,0.00,180
28,WikiNEuRal,test_VOC.txt,micro avg,0.29,0.29,0.29,775
29,WikiNEuRal,test_VOC.txt,macro avg,0.17,0.15,0.16,775


In [21]:
df_all.to_csv('NER_eval_results_alles.tsv',sep='\t')

In [7]:
evaluate_with_trainer(model_names['BERTje'])

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


{'test_loss': 0.07860732823610306,
 'test_precision': 0.695364238410596,
 'test_recall': 0.6844850065189049,
 'test_f1': 0.6898817345597898,
 'test_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.78      0.78      0.78       309\n         PER       0.62      0.59      0.60       278\n        TIME       0.66      0.67      0.66       180\n\n   micro avg       0.70      0.68      0.69       767\n   macro avg       0.69      0.68      0.68       767\nweighted avg       0.69      0.68      0.69       767\n',
 'test_runtime': 3.7048,
 'test_samples_per_second': 29.961,
 'test_steps_per_second': 3.779}

In [8]:
evaluate_with_trainer(model_names['GysBERT'])

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


{'test_loss': 0.11065434664487839,
 'test_precision': 0.6722797927461139,
 'test_recall': 0.6688144329896907,
 'test_f1': 0.6705426356589146,
 'test_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.75      0.78      0.76       313\n         PER       0.57      0.53      0.55       283\n        TIME       0.69      0.69      0.69       180\n\n   micro avg       0.67      0.67      0.67       776\n   macro avg       0.67      0.67      0.67       776\nweighted avg       0.67      0.67      0.67       776\n',
 'test_runtime': 3.7261,
 'test_samples_per_second': 29.79,
 'test_steps_per_second': 3.757}

In [11]:
preds

PredictionOutput(predictions=array([[[-0.8846255 , -0.20053461, -2.723233  , ..., -1.5204369 ,
         -3.548008  ,  9.511862  ],
        [ 5.6675086 , -0.5285765 , -2.26615   , ..., -1.6218034 ,
         -2.404584  ,  0.69367194],
        [-0.5783778 , -2.9816043 , -2.8909168 , ..., -1.5484601 ,
         -1.4718972 ,  6.1818666 ],
        ...,
        [ 2.3493235 , -1.2068609 , -3.5420794 , ..., -1.4829181 ,
         -3.1683383 ,  4.7888618 ],
        [-0.72117543, -0.9307784 , -2.2060997 , ..., -1.0914435 ,
         -3.1423402 ,  9.487557  ],
        [-1.569152  , -1.2889202 , -2.636608  , ...,  0.15824848,
         -2.236063  ,  7.421688  ]],

       [[-1.5841635 , -0.7333175 , -2.3293574 , ..., -1.359286  ,
         -2.8213515 ,  9.910175  ],
        [-2.312314  , -1.5694518 , -1.9815735 , ..., -0.73888   ,
         -1.6240867 ,  9.370352  ],
        [-2.071714  , -0.81145626, -2.708225  , ..., -0.68706286,
         -1.5662781 ,  8.74781   ],
        ...,
        [-1.3483778 , -0.

In [17]:
def evaluate(model_name="/ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2"):
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
    global id2label
    id2label = model.config.id2label
    global label2id
    label2id = model.config.label2id
    
    toks = data_collator(tokenized_datasets['test'])
    preds = model(toks['input_ids'])
    
#     print(preds.logits[0]) # todo remove

    return compute_metrics((preds.logits.detach().numpy(), tokenized_datasets['test']['labels']))

In [None]:
model_name=model_names["WikiNEuRal"]
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name)

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
id2label = model.config.id2label
label2id = model.config.label2id
    
toks = data_collator(tokenized_datasets['test'])
preds = model(toks['input_ids'])


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

In [None]:
preds.logits[0][3]

In [None]:
compute_metrics((preds.logits.detach().numpy(), tokenized_datasets['test']['labels'])

In [38]:
text = "Amsterdam is mooi, Timor is nog mooier"
toks_new = tokenizer(text,return_tensors='pt')
toks_new

{'input_ids': tensor([[    2,  1945,  1508,  4626,    16, 24360,  1508,  1658, 21885,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
collated = data_collator(tokenized_datasets['test'])
collated

{'input_ids': tensor([[    2, 24360,  1456,  ...,     0,     0,     0],
         [    2,  2763,    18,  ...,     0,     0,     0],
         [    2, 13600,  2942,  ...,     0,     0,     0],
         ...,
         [    2,  7213,  9280,  ...,     0,     0,     0],
         [    2,  1486,  2821,  ...,     0,     0,     0],
         [    2,  8635,  5744,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[-100,    0,    6,  ..., -100, -100, -100],
         [-100,    6,    6,  ..., -100, -100, -100],
         [-100,    6,    6,  ..., -100, -100, -100],
         ...,
         [-100,    6, -100,  ..., -100, -100, -100],
         [-100,    6,    6,  ..., -100, -100, -100],
         [-100,    1,    4,  ..., -100, -100, -100]])}

In [10]:
model(collated['input_ids'][0])

ValueError: not enough values to unpack (expected 2, got 1)

In [41]:
model(toks_new['input_ids'])

TokenClassifierOutput(loss=None, logits=tensor([[[-0.5427,  0.6064, -0.9945, -2.6852, -2.6771, -1.9216,  7.4962],
         [ 7.5385, -2.3297, -1.9815,  2.2804, -3.7618, -3.9279,  1.5158],
         [-2.7782, -1.5884, -3.4236, -1.4898, -1.4373, -2.4613, 10.2054],
         [ 1.0658, -0.6538, -2.0952, -2.4094, -3.1959, -3.2366,  7.6695],
         [-2.5549, -2.0135, -3.2106, -0.1299, -1.0148, -2.7319,  9.3680],
         [-2.1994, -0.4250, -2.8736, -1.6528, -1.0364, -3.7920,  9.7953],
         [ 8.4146, -1.4474,  1.5184, -1.6112, -3.6887, -1.9291, -0.6003],
         [ 7.2142, -2.1879, -0.0754,  2.4800, -3.8119, -1.2636, -1.2760],
         [-2.3924, -1.8151, -2.9774, -1.0514, -1.6791, -2.8544, 10.5226],
         [ 8.8415, -1.3115, -0.3948, -0.2439, -4.4734, -2.5845, -0.8180],
         [ 3.9327, -3.4434, -3.3043,  7.1931, -2.8903, -1.3326, -0.2706],
         [-2.7918, -1.4254, -2.2763, -2.7268, -2.1909, -2.0980, 11.1556],
         [-2.5494, -1.6920, -2.1994, -3.0338, -2.1286, -2.1863, 11.3971]

In [42]:
tokenized_datasets['test']['tokens'][0]

['Timor',
 'en',
 'Passant',
 'over',
 'sirbon',
 'en',
 'Japara',
 'brieven',
 'daar',
 'mede',
 'afgegaan',
 'de',
 'geschillen',
 'tusschen',
 'de',
 'amarassiers',
 'en',
 'Coepanse',
 'bontgenoten;',
 'Continueeren',
 'nog',
 'naden',
 'onden',
 '133',
 '8.',
 'lasten',
 'rijs',
 '100',
 '.',
 'Jatij',
 'en',
 '20.',
 'affuijt',
 'planken',
 'En',
 'bedrage',
 'van',
 '�5532:',
 '14',
 ':',
 'waarom',
 'bij',
 'ons',
 'goetgedagt',
 'is,',
 'dat',
 'bodempje',
 'vermits',
 'het',
 'vroeg',
 'genoeg',
 'in',
 'den',
 'tijt',
 'was',
 'en',
 'passant',
 'ook',
 'sirbon',
 'en',
 'Japara',
 'te',
 'laten',
 'aangieren,',
 'om',
 'met',
 'eenen',
 'een',
 'partij',
 'Coopmansz',
 ':',
 'en',
 'provisien',
 'vor',
 'die',
 'comptoiren',
 'over',
 'te',
 'brengen,',
 'met',
 'alsulken',
 'principaal',
 'antwoort',
 'schrijven',
 'aan',
 'den',
 'onder',
 'coopman',
 'Willem',
 'moerman',
 'en',
 'raad,',
 'als',
 'nevens',
 'het',
 'vorige',
 'praeadvis,',
 'in',
 '�t',
 'afgaande',
 'g

In [43]:
tokenized_datasets['test']['labels'][0]

[-100,
 0,
 6,
 6,
 -100,
 6,
 0,
 -100,
 6,
 0,
 -100,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 -100,
 -100,
 6,
 6,
 -100,
 -100,
 6,
 -100,
 -100,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 6,
 6,
 -100,
 -100,
 6,
 -100,
 6,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 0,
 -100,
 6,
 0,
 -100,
 6,
 6,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 6,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 -100,
 6,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 1,
 1,
 -100,
 6,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 -100,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 -100,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 -100,
 -100,
 6,
 6,
 -100,
 -100,
 -100,
 6,
 -100,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 6,
 -100,
 -100,
 6,

In [28]:
toks['labels']

tensor([[-100,    0,    6,  ..., -100, -100, -100],
        [-100,    6,    6,  ..., -100, -100, -100],
        [-100,    6,    6,  ..., -100, -100, -100],
        ...,
        [-100,    6, -100,  ..., -100, -100, -100],
        [-100,    6,    6,  ..., -100, -100, -100],
        [-100,    1,    4,  ..., -100, -100, -100]])

In [30]:
preds.logits[0][1]

tensor([-0.9815, -2.1087, -2.7903, -1.9487, -2.5880, -2.3849, 10.7188],
       grad_fn=<SelectBackward0>)

In [26]:
id2label

{0: 'B-LOC',
 1: 'B-PER',
 2: 'B-TIME',
 3: 'I-LOC',
 4: 'I-PER',
 5: 'I-TIME',
 6: 'O'}

In [None]:
compute_metrics((preds.logits.detach().numpy(), tokenized_datasets['test']['labels']))

In [18]:
evaluate(model_names['GysBERT'])

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

TokenClassifierOutput(loss=None, logits=tensor([[[-1.1056, -2.0907, -2.6797,  ..., -2.4105, -2.3044, 10.5855],
         [-0.9815, -2.1087, -2.7903,  ..., -2.5880, -2.3849, 10.7188],
         [-1.2446, -2.2667, -2.6471,  ..., -2.5930, -2.2541, 10.8799],
         ...,
         [-1.1830, -1.8927, -2.7616,  ..., -2.4504, -2.4548, 10.6490],
         [-1.1991, -1.9389, -2.7844,  ..., -2.3972, -2.4331, 10.6495],
         [-1.1722, -1.9827, -2.7269,  ..., -2.4549, -2.4222, 10.6663]],

        [[-1.0231, -1.9746, -2.4849,  ..., -1.9830, -2.3571,  9.8590],
         [-1.1887, -2.3329, -2.3563,  ..., -2.5579, -2.3397, 10.5914],
         [-1.2042, -2.5399, -2.5790,  ..., -2.5207, -2.2575, 10.8027],
         ...,
         [-1.1108, -1.6888, -2.7046,  ..., -2.1600, -2.7299, 10.1716],
         [-1.0869, -1.6983, -2.7277,  ..., -2.0823, -2.7267, 10.1372],
         [-1.0483, -1.7725, -2.6648,  ..., -2.1668, -2.7070, 10.2009]],

        [[-1.0625, -2.1565, -2.7420,  ..., -2.5423, -2.1257, 10.5858],
     

{'precision': 0.6419753086419753,
 'recall': 0.06701030927835051,
 'f1': 0.12135355892648775,
 'classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.85      0.11      0.19       313\n         PER       0.41      0.03      0.06       283\n        TIME       0.50      0.06      0.10       180\n\n   micro avg       0.64      0.07      0.12       776\n   macro avg       0.59      0.06      0.12       776\nweighted avg       0.61      0.07      0.12       776\n'}

In [9]:
eval_results = {}

In [11]:
for model_name in model_names.keys():
    model_path = model_names[model_name]
    eval_results[model_name] = evaluate(model_path)

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
eval_results