In [1]:
tests = 'test_NHA.txt  test_RHC.txt  test_SA.txt  test_VOC.txt'.split()
tests

['test_NHA.txt', 'test_RHC.txt', 'test_SA.txt', 'test_VOC.txt']

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from data_utils import prepare_data, convert_to_dataset

# train and val are redundant but we need the labels, sooo
train = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt')
val = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt')

tests_prepared = [
     prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/'+test) for test in tests
]

label_list = sorted(list(set([token_data[1] for sentence in train for token_data in sentence if token_data])))
label_map = {label: i for i, label in enumerate(label_list)}
id2label_original = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

train_data = convert_to_dataset(train, label_map)
val_data = convert_to_dataset(val, label_map)
test_data = [convert_to_dataset(test, label_map)
             for test in tests_prepared]

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

In [3]:
dct = {
    "train": train_data,
    "validation": val_data}
for i, test in enumerate(tests):
    dct[test.split('.')[0]] = test_data[i]

datasets = DatasetDict(dct)

In [4]:
def extract_wrong_predictions(eval_prediction): # experimental, will be improved
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)
    
    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label_original[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    res = []
    for i in range(len(true_predictions)):
        if true_predictions[i] != true_labels[i]:
            res.append((i, true_predictions[i],true_labels[i]))
    return res


def get_wrong_predictions(model_name, dataset_name):
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    global model
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
    
    global id2label
    id2label = model.config.id2label
#     global label2id
#     label2id = model.config.label2id
    
    trainer = Trainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    

    preds = trainer.predict(tokenized_datasets[dataset_name])
    wrong_preds = extract_wrong_predictions((preds.predictions,
                                   preds.label_ids)
                                   )
    return wrong_preds


In [13]:
from sklearn.metrics import confusion_matrix
def get_predictions(model_name, dataset_name):
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    global model
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
    
    global id2label
    id2label = model.config.id2label
#     global label2id
#     label2id = model.config.label2id
    
    trainer = Trainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    

    preds = trainer.predict(tokenized_datasets[dataset_name])
    return preds
    
#     matrix = confusion_matrix(preds.label_ids, preds.predictions, label_list)
#     return matrix

In [5]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)
#     print(predictions) # todo remove

    # for evaluating wikineural, which has a different number of labels:
    # set the extra labels to 'O' and hope for the best
#     filtered_predictions = [
#         [p if p < len(label_list) else label2id['O'] for p in prediction]
#         for prediction in predictions
#     ]
    
#     predictions = filtered_predictions

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label_original[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # todo: remove debug output
#     print(true_predictions[0])
#     print(true_labels[0])

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True, return_tensors='pt'
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [6]:
model_names = {'GysBERT': "/ivi/ilps/personal/vprovat/KB/models/GysBERT-NER-v2",
              'BERTje': "/ivi/ilps/personal/vprovat/KB/models/BERTje-NER-v2",
              'BERT-multi-cased': "/ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2",
              'WikiNEuRal': "Babelscape/wikineural-multilingual-ner"}

In [7]:
def evaluate_with_trainer(model_name):
# if True:
#     model_name = model_names['BERTje']
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    global model
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
    
    global id2label
    id2label = model.config.id2label
    global label2id
    label2id = model.config.label2id
    
    trainer = Trainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    res = {}
    for test in tests:
        dataset_name = test.split('.')[0]
        preds = trainer.predict(tokenized_datasets[dataset_name])
        res[test] = preds.metrics
    return res

In [8]:
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 11104
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2761
    })
    test_NHA: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 51
    })
    test_RHC: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3
    })
    test_SA: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 114
    })
    test_VOC: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 111
    })
})

In [9]:
'mistakes_on_{0}.p'.format('test')

'mistakes_on_test.p'

In [10]:
# error analysis

import pickle

for test in tests:
    wrong_preds_dict = {}
    test_name = test.split('.')[0]
    for name, path in model_names.items():
        wrong_preds = get_wrong_predictions(path, test_name)
        wrong_preds_dict[name] = wrong_preds
        
    pickle.dump(wrong_preds_dict, open('mistakes_on_{0}.p'.format(test_name),'wb'))

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

In [14]:
# Build confusion matrices

preds = get_predictions(model_names['WikiNEuRal'], 'test_RHC')

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
predictions = np.argmax(preds.predictions, axis=2)
labels = preds.label_ids
    
# Remove ignored index (special tokens)
true_predictions = [
    [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2label_original[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

In [24]:
true_labels

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-TIME',
  'I-TIME',
  'I-TIME',
  'I-TIME',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-TIME',
  'I-TIME',
  'I-TIME',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'B-PER',
  'I-PER',
  'I-P

In [21]:
label_list

['B-LOC', 'B-PER', 'B-TIME', 'I-LOC', 'I-PER', 'I-TIME', 'O']

In [33]:
flattened_preds = [pred for true_preds in true_predictions for pred in true_preds ]
flattened_labels = [lab for true_labs in true_labels for lab in true_labs ]

In [36]:
matrix = confusion_matrix(flattened_labels, flattened_preds, label_list)



In [37]:
matrix

array([[ 13,   0,   0,   3,   0,   0,   1],
       [  0,  14,   0,   0,   4,   0,   6],
       [  0,   0,   0,   0,   0,   0,   5],
       [  0,   0,   0,   1,   0,   0,   0],
       [  0,   1,   0,   0,  32,   0,   3],
       [  0,   0,   0,   0,   0,   0,  12],
       [ 11,   0,   0,   4,   5,   0, 391]])

In [6]:
evaluate_with_trainer(model_names['WikiNEuRal'])

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[0 5 0 ... 0 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 11.0033540725708,
 'test_precision': 0.2912371134020619,
 'test_recall': 0.29161290322580646,
 'test_f1': 0.29142488716956805,
 'test_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.48      0.47      0.48       312\n        MISC       0.00      0.00      0.00         0\n         ORG       0.00      0.00      0.00         0\n         PER       0.35      0.28      0.31       283\n        TIME       0.00      0.00      0.00       180\n\n   micro avg       0.29      0.29      0.29       775\n   macro avg       0.17      0.15      0.16       775\nweighted avg       0.32      0.29      0.30       775\n',
 'test_runtime': 3.7133,
 'test_samples_per_second': 29.892,
 'test_steps_per_second': 3.77}

In [10]:
for name, path in model_names.items():
    print(name, path)

GysBERT /ivi/ilps/personal/vprovat/KB/models/GysBERT-NER-v2
BERTje /ivi/ilps/personal/vprovat/KB/models/BERTje-NER-v2
BERT-multi-cased /ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2
WikiNEuRal Babelscape/wikineural-multilingual-ner


In [19]:
test = '              precision    recall  f1-score   support\n\n         LOC       0.48      0.47      0.48       312\n        MISC       0.00      0.00      0.00         0\n         ORG       0.00      0.00      0.00         0\n         PER       0.35      0.28      0.31       283\n        TIME       0.00      0.00      0.00       180\n\n   micro avg       0.29      0.29      0.29       775\n   macro avg       0.17      0.15      0.16       775\nweighted avg       0.32      0.29      0.30       775\n'

In [26]:
for line in test.split('\n')[1:]:
#     if len(line.split()) < 2:
#         continue
#     print(line)
#     print(line.split('  '))
    row = [item for item in line.split('  ') if item]
    if not row:
        continue
    print(row)

[' LOC', ' 0.48', '0.47', '0.48', ' 312']
['MISC', ' 0.00', '0.00', '0.00', ' 0']
[' ORG', ' 0.00', '0.00', '0.00', ' 0']
[' PER', ' 0.35', '0.28', '0.31', ' 283']
['TIME', ' 0.00', '0.00', '0.00', ' 180']
[' micro avg', ' 0.29', '0.29', '0.29', ' 775']
[' macro avg', ' 0.17', '0.15', '0.16', ' 775']
['weighted avg', ' 0.32', '0.29', '0.30', ' 775']


In [18]:
dct.values()

dict_values([Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 11104
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 2761
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 51
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 114
}), Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 111
})])

In [19]:
import pandas as pd
import io   

dfs = []
for name, path in model_names.items():
    res = evaluate_with_trainer(path)
    column_names = ['model_name','dataset', 'label','precision','recall','f1-score','support']
    df = pd.DataFrame(columns=column_names, dtype=object)
    for test in tests: 
        for line in res[test]['test_classification_report'].split('\n')[1:]: # the first one is '\n\n'
            row = [item for item in line.split('  ') if item]
            if not row:
                continue
            df.loc[len(df)] = [name, test]+row
#     df = pd.read_csv(io.StringIO(res['test_classification_report']), sep="\t")
#     df['model_name'] = name
    dfs.append(df)

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6
  6 6 2 5 5 5 5 6 6 6 6 6 6 6 2 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 4
  4 4 6 6 2 5 5 5 5 5 5 6 0 0 0 6 6 6 1 4 4 4 1 6 6 6 6 6 6 6 1 4 4 1 6 6
  0 0 0 0 6 0 4 4 1 4 4 4 6 6 0 3 0 0 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 1 4 4 4 6 6 6 6 6 6 1 4 1 4 4 6 4 6 6 6 6 6 6 1 4 6 4 6 6 4 6
  6 6 6 6 6 6 6 6 4 6 6 6 1 4 4 6 6 1 1 4 4 4 4 6 6 6 6 1 4 0 6 6 6 6 6 6
  0 0 4 6 6 6 6 6 6 4 6 6 6 1 4 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 4 0 6 6 2 5 5 5 5 5 5 5 6 4]
 [6 6 1 4 4 4 4 1 4 6 6 6 6 6 6 6 0 3 6 6 0 0 6 6 0 6 6 6 6 0 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 6 0 6 6 6 6 6 6 6 6 6 2 2 5 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 1 4 4 4 1 6 6 6 6 0 0 6 6 6 6 6 6 6 6 6 1 4 4 1 6 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 0 0 6 6 0 0 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
  6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 4 4 6 6 6 6 6 6 1

[[6 1 4 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 2 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 4 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 ... 5 5 6]
 [6 6 1 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 1 4 ... 6 6 6]
 [6 6 6 ... 4 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 4 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 5 2 6]
 [6 6 6 ... 6 6 6]]


[[6 6 6 ... 6 6 6]
 [6 6 1 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 1 4 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 2 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]]


[[6 0 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 ...
 [6 6 6 ... 6 6 6]
 [6 6 6 ... 6 6 6]
 [6 1 4 ... 6 6 6]]


Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 6]
 [0 0 0 ... 0 0 0]]


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 5 ... 0 0 0]]


[[0 5 0 ... 0 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
df_all = pd.concat(dfs)
df_all

Unnamed: 0,model_name,dataset,label,precision,recall,f1-score,support
0,GysBERT,test_NHA.txt,LOC,0.80,0.77,0.78,252
1,GysBERT,test_NHA.txt,PER,0.78,0.74,0.76,355
2,GysBERT,test_NHA.txt,TIME,0.32,0.27,0.29,109
3,GysBERT,test_NHA.txt,micro avg,0.73,0.68,0.70,716
4,GysBERT,test_NHA.txt,macro avg,0.63,0.59,0.61,716
...,...,...,...,...,...,...,...
26,WikiNEuRal,test_VOC.txt,PER,0.35,0.28,0.31,283
27,WikiNEuRal,test_VOC.txt,TIME,0.00,0.00,0.00,180
28,WikiNEuRal,test_VOC.txt,micro avg,0.29,0.29,0.29,775
29,WikiNEuRal,test_VOC.txt,macro avg,0.17,0.15,0.16,775


In [21]:
df_all.to_csv('NER_eval_results_alles.tsv',sep='\t')