In [1]:
# File names of the four test splits that are read from the same directory as
# train.txt / validation.txt.
tests_old = ['test_NHA.txt', 'test_RHC.txt', 'test_SA.txt', 'test_VOC.txt']

In [2]:
# All evaluation file names: the four splits above plus ned.testb (names only, no data).
tests = [*tests_old, 'ned.testb']

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from data_utils import prepare_data, convert_to_dataset

# The train/validation splits are not evaluated in this notebook; they are loaded
# because the label inventory is derived from the training data.
train = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt')
val = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt')

# Test sets, in the same order as `tests`: the four splits listed in `tests_old`
# first, followed by ned.testb from the Dutch_conll directory.
tests_prepared = [
    prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/'+test)
    for test in tests_old
] + [
    prepare_data('/ivi/ilps/personal/vprovat/KB/data/Dutch_conll/ned.testb'),
]

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

  0%|          | 0/5076 [00:00<?, ?it/s]

In [4]:
# Label inventory: every distinct value found at index 1 of the non-empty token
# entries in the training split, sorted so that the label -> id assignment is stable.
label_list = sorted({token_data[1] for sentence in train for token_data in sentence if token_data})

# id <-> label mappings for the gold annotations. `id2label_original` is kept under
# its own name because `id2label` is rebound to each model's mapping later on.
id2label_original = dict(enumerate(label_list))
label2id = {label: i for i, label in enumerate(label_list)}

# `label_map` is what convert_to_dataset() receives: the training labels plus the
# ORG / MISC tags, which are folded into 'O'.
# NOTE(review): presumably needed because some test files carry ORG/MISC tags that
# are absent from the training label set -- confirm against the data.
label_map = dict(label2id)
label_map.update({
    'B-ORG': label2id['O'],
    'B-MISC': label2id['O'],
    'I-ORG': label2id['O'],
    'I-MISC': label2id['O'],
})

train_data = convert_to_dataset(train, label_map)
val_data = convert_to_dataset(val, label_map)
test_data = [convert_to_dataset(test, label_map) for test in tests_prepared]

In [5]:
# Display the label inventory to check the tag set and the order that defines the ids.
label_list

['B-LOC', 'B-PER', 'B-TIME', 'I-LOC', 'I-PER', 'I-TIME', 'O']

In [6]:
# Initial prediction-side mapping: the training label mapping. get_predictions() and
# the wrong-prediction loop further down rebind `id2label` to a model's
# config.id2label, while `id2label_original` keeps decoding the gold labels.
id2label = id2label_original

In [7]:
# Collect every split in one mapping: the two fixed splits first, then one entry
# per test file, keyed by the file name up to its first dot.
dct = {"train": train_data, "validation": val_data}
dct.update(
    (test_file.partition('.')[0], test_data[position])
    for position, test_file in enumerate(tests)
)

datasets = DatasetDict(dct)

In [8]:
def get_predictions(model_name, dataset_name):
    """Run a token-classification checkpoint over one split of ``datasets``.

    Args:
        model_name: Path or identifier accepted by ``from_pretrained`` for both
            the tokenizer and the model.
        dataset_name: Key of the split in the module-level ``datasets`` to predict on.

    Returns:
        The object returned by ``Trainer.predict`` for that split.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``datasets``.

    Side effects:
        Rebinds the module-level ``tokenizer``, ``model`` and ``id2label``.
        ``tokenize_and_align_labels``, ``data_collator`` and ``clean_predictions``
        read these globals, so they must be set before tokenizing and predicting.
        ``label2id`` is not rebound here.
    """
    global tokenizer, model, id2label

    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    # Predictions are decoded with the model's own label inventory.
    id2label = model.config.id2label

    # Tokenize only the split that is actually predicted; mapping the whole
    # DatasetDict would re-tokenize train/validation and every other test set
    # on each call without using the result.
    tokenized_split = datasets[dataset_name].map(tokenize_and_align_labels, batched=True)

    # No train/eval datasets are passed: the trainer is used purely for inference.
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    return trainer.predict(tokenized_split)

In [9]:
def clean_predictions(preds, pred_id2label=None, gold_id2label=None):
    """Decode model output into per-sentence predicted and gold tag sequences.

    Args:
        preds: Object with a ``predictions`` attribute (scores that are arg-maxed
            over axis 2) and a ``label_ids`` attribute (gold ids, with -100 at
            positions to ignore).
        pred_id2label: Mapping from predicted class id to tag. Defaults to the
            module-level ``id2label`` at call time.
        gold_id2label: Mapping from gold label id to tag. Defaults to the
            module-level ``id2label_original`` at call time.

    Returns:
        ``(true_predictions, true_labels)``: two lists of tag lists, one inner list
        per sequence, containing only positions whose gold id is not -100.
    """
    # Fall back to the notebook globals so existing one-argument calls behave as
    # before; passing the mappings explicitly avoids depending on hidden state.
    if pred_id2label is None:
        pred_id2label = id2label
    if gold_id2label is None:
        gold_id2label = id2label_original

    predictions = np.argmax(preds.predictions, axis=2)
    labels = preds.label_ids

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Drop ignored positions (gold id -100) once, then decode both sides.
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([pred_id2label[p] for p, _ in kept])
        true_labels.append([gold_id2label[l] for _, l in kept])

    return true_predictions, true_labels

In [10]:
def extract_wrong_predictions(preds): # experimental, will be improved
    """Return the sequences whose predicted tags differ from the gold tags.

    Each entry is ``(index, predicted_tags, gold_tags)``, where ``index`` is the
    position of the sequence in the output of ``clean_predictions``. A sequence is
    reported if its two tag lists are not equal as a whole.
    """
    predicted, gold = clean_predictions(preds)
    return [
        (index, predicted_tags, gold_tags)
        for index, (predicted_tags, gold_tags) in enumerate(zip(predicted, gold))
        if predicted_tags != gold_tags
    ]


In [12]:
def compute_metrics(eval_prediction):
    """Score decoded predictions against the gold tags.

    The raw prediction object is decoded with ``clean_predictions`` and the
    resulting tag sequences are passed to the imported seqeval scorers. Returns
    a dict with the keys "precision", "recall", "f1" and "classification_report".
    """
    predicted_tags, gold_tags = clean_predictions(eval_prediction)

    # Every scorer is called as scorer(gold, predicted), in this order.
    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "classification_report": classification_report,
    }
    return {name: scorer(gold_tags, predicted_tags) for name, scorer in scorers.items()}


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and spread the word-level tags over the sub-tokens.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]`` and uses the
    module-level ``tokenizer``. For every sequence, the first sub-token of each
    word receives that word's tag; positions without a word id and the remaining
    sub-tokens of a word receive -100. The result is stored under "labels" in the
    tokenizer output, which is returned.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors='pt',
    )

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for current_word in tokenized_inputs.word_ids(batch_index=row):
            # A tag is emitted only when a new word starts at this position.
            starts_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_word else -100)
            last_word = current_word
        aligned.append(row_labels)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

def data_collator(data):
    """Pad a list of examples into one batch of tensors.

    Each item must provide "input_ids", "attention_mask" and "labels". Every field
    is converted to a tensor per item and padded (batch first) to the longest
    sequence: input ids with the module-level tokenizer's pad token id, the
    attention mask with 0 and the labels with -100.
    """
    def _pad_field(key, fill):
        # Convert one field of every item to a tensor and pad to a common length.
        tensors = [torch.tensor(item[key]) for item in data]
        return torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True, padding_value=fill)

    return {
        "input_ids": _pad_field("input_ids", tokenizer.pad_token_id),
        "attention_mask": _pad_field("attention_mask", 0),
        "labels": _pad_field("labels", -100),
    }


In [11]:
# Display name -> checkpoint location, as passed to from_pretrained().
# The first three entries are local "-NER-v2" checkpoints.
model_names = {
    'GysBERT': "/ivi/ilps/personal/vprovat/KB/models/GysBERT-NER-v2",
    'BERTje': "/ivi/ilps/personal/vprovat/KB/models/BERTje-NER-v2",
    'BERT-multi-cased': "/ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2",
}
# NOTE(review): this one is not a filesystem path -- presumably a model-hub identifier.
model_names['WikiNEuRal'] = "Babelscape/wikineural-multilingual-ner"

In [12]:
import pickle

# Load the stored predictions; they are indexed below as all_preds[model_name][test].
# The context manager closes the file handle, which the bare open() call never did.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('all_predictions.p', 'rb') as predictions_file:
    all_preds = pickle.load(predictions_file)

In [13]:
# Result layout: all_wrong_preds[model_name][test] -> output of extract_wrong_predictions().
all_wrong_preds = {}

for model_name, model_path in model_names.items():
    # The checkpoint is loaded just to get its label inventory (needed for WikiNEuRal).
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    # clean_predictions() reads the module-level `id2label`, so it has to be
    # rebound before this model's predictions are decoded.
    id2label = model.config.id2label

    wrong_preds = {}
    for test in tests:
        preds = all_preds[model_name][test]
        wrong_preds[test] = extract_wrong_predictions(preds)
    all_wrong_preds[model_name] = wrong_preds

In [14]:
# Persist the collected errors. The context manager flushes and closes the file
# deterministically; the bare open() call left the write handle open.
with open('all_wrong_preds.p', 'wb') as wrong_preds_file:
    pickle.dump(all_wrong_preds, wrong_preds_file)

In [15]:
# Display the test-file names; these are the inner keys of all_wrong_preds.
tests

['test_NHA.txt', 'test_RHC.txt', 'test_SA.txt', 'test_VOC.txt', 'ned.testb']