In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [2]:
## The data

In [3]:
from data_utils import prepare_data, convert_to_dataset

In [4]:

train = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt')
val = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt')
test_VOC = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_VOC.txt')

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

In [5]:
label_list = sorted(list(set([token_data[1] for sentence in train for token_data in sentence if token_data])))
label_map = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [6]:
train_data = convert_to_dataset(train, label_map)
val_data = convert_to_dataset(val, label_map)
test_data = convert_to_dataset(test_VOC, label_map)

datasets = DatasetDict({
    "train": train_data,
    "validation": val_data,
    "test": test_data,
})


In [7]:
## Evaluation

In [8]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [9]:
# model_name = "/ivi/ilps/personal/vprovat/KB/GysBERT"
# model_name = "GroNLP/bert-base-dutch-cased"
model_name = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name,num_labels=len(label_list))

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [10]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

In [11]:
from transformers import EarlyStoppingCallback, IntervalStrategy

training_args = TrainingArguments(
    output_dir="/ivi/ilps/personal/vprovat/KB/NER_logs_BERT-multi-cased", #change here
    evaluation_strategy=IntervalStrategy.STEPS, #"steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    save_total_limit = 50,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# dir(training_args)

In [13]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]


    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [15]:
trainer.train(resume_from_checkpoint=True)

[34m[1mwandb[0m: Currently logged in as: [33mvsprovatorova[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
500,0.1334,0.132026,0.583432,0.615314,0.598949,precision recall f1-score support  LOC 0.56 0.73 0.64 6988  PER 0.62 0.58 0.60 13330  TIME 0.52 0.53 0.52 4588  micro avg 0.58 0.62 0.60 24906  macro avg 0.57 0.61 0.59 24906 weighted avg 0.59 0.62 0.60 24906
1000,0.1223,0.114149,0.625261,0.649161,0.636987,precision recall f1-score support  LOC 0.73 0.71 0.72 6988  PER 0.59 0.64 0.61 13330  TIME 0.58 0.58 0.58 4588  micro avg 0.63 0.65 0.64 24906  macro avg 0.63 0.64 0.64 24906 weighted avg 0.63 0.65 0.64 24906
1500,0.0995,0.109193,0.645037,0.665342,0.655032,precision recall f1-score support  LOC 0.71 0.75 0.73 6988  PER 0.64 0.65 0.64 13330  TIME 0.55 0.59 0.57 4588  micro avg 0.65 0.67 0.66 24906  macro avg 0.64 0.66 0.65 24906 weighted avg 0.65 0.67 0.66 24906
2000,0.0954,0.101876,0.669386,0.66229,0.665819,precision recall f1-score support  LOC 0.72 0.76 0.74 6988  PER 0.66 0.63 0.65 13330  TIME 0.60 0.61 0.61 4588  micro avg 0.67 0.66 0.67 24906  macro avg 0.66 0.67 0.66 24906 weighted avg 0.67 0.66 0.67 24906
2500,0.092,0.097988,0.674522,0.676905,0.675711,precision recall f1-score support  LOC 0.71 0.77 0.74 6988  PER 0.68 0.65 0.66 13330  TIME 0.61 0.62 0.61 4588  micro avg 0.67 0.68 0.68 24906  macro avg 0.67 0.68 0.67 24906 weighted avg 0.67 0.68 0.67 24906
3000,0.0772,0.103555,0.656686,0.693046,0.674376,precision recall f1-score support  LOC 0.72 0.77 0.74 6988  PER 0.64 0.67 0.66 13330  TIME 0.60 0.63 0.62 4588  micro avg 0.66 0.69 0.67 24906  macro avg 0.65 0.69 0.67 24906 weighted avg 0.66 0.69 0.67 24906
3500,0.0802,0.095889,0.697104,0.67558,0.686173,precision recall f1-score support  LOC 0.76 0.75 0.76 6988  PER 0.70 0.65 0.67 13330  TIME 0.61 0.63 0.62 4588  micro avg 0.70 0.68 0.69 24906  macro avg 0.69 0.68 0.68 24906 weighted avg 0.70 0.68 0.69 24906
4000,0.0756,0.094798,0.70115,0.67297,0.686771,precision recall f1-score support  LOC 0.79 0.73 0.76 6988  PER 0.69 0.66 0.68 13330  TIME 0.61 0.62 0.62 4588  micro avg 0.70 0.67 0.69 24906  macro avg 0.70 0.67 0.68 24906 weighted avg 0.70 0.67 0.69 24906
4500,0.0644,0.100108,0.70875,0.679354,0.693741,precision recall f1-score support  LOC 0.78 0.75 0.76 6988  PER 0.70 0.66 0.68 13330  TIME 0.62 0.63 0.63 4588  micro avg 0.71 0.68 0.69 24906  macro avg 0.70 0.68 0.69 24906 weighted avg 0.71 0.68 0.69 24906
5000,0.0595,0.096935,0.714927,0.684413,0.699337,precision recall f1-score support  LOC 0.78 0.75 0.76 6988  PER 0.71 0.67 0.69 13330  TIME 0.64 0.63 0.63 4588  micro avg 0.71 0.68 0.70 24906  macro avg 0.71 0.68 0.69 24906 weighted avg 0.72 0.68 0.70 24906


Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.56      0.73      0.64      6988
         PER       0.62      0.58      0.60     13330
        TIME       0.52      0.53      0.52      4588

   micro avg       0.58      0.62      0.60     24906
   macro avg       0.57      0.61      0.59     24906
weighted avg       0.59      0.62      0.60     24906
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.73      0.71      0.72      6988
         PER       0.59      0.64      0.61     13330
        TIME       0.58      0.58      0.58      4588

   micro avg       0.63      0.65      0.64     24906
   macro avg       0.63      0.64      0.64     24906
weighted avg       0.63      0.65      0.6

TrainOutput(global_step=6500, training_loss=0.08687260246276855, metrics={'train_runtime': 5857.2696, 'train_samples_per_second': 28.436, 'train_steps_per_second': 3.555, 'total_flos': 1.3588045565952e+16, 'train_loss': 0.08687260246276855, 'epoch': 4.68})

In [16]:
trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id

In [17]:
trainer.save_model('/ivi/ilps/personal/vprovat/KB/models/BERT-multi-cased-NER-v2')

In [18]:
preds = trainer.predict(tokenized_datasets["test"])

In [21]:
preds

PredictionOutput(predictions=array([[[-0.11308523,  0.28306374, -2.5450878 , ..., -0.8496628 ,
         -2.0836978 ,  4.238312  ],
        [ 5.108827  ,  0.13123973, -3.0826669 , ..., -2.337807  ,
         -3.535562  ,  2.442111  ],
        [-0.7368386 , -3.0711036 , -2.941254  , ..., -1.3434206 ,
         -2.6588874 ,  6.7481556 ],
        ...,
        [-1.2727859 , -1.6499017 , -2.8777916 , ..., -2.0274026 ,
         -2.2507133 ,  9.080972  ],
        [-1.2646996 , -1.7404342 , -2.8056834 , ..., -2.0401525 ,
         -2.2040095 ,  9.123766  ],
        [-0.23187871, -0.14783058, -2.9348915 , ..., -1.9613186 ,
         -2.44029   ,  7.005603  ]],

       [[ 0.27685595, -0.21969439, -2.2987916 , ..., -0.6269867 ,
         -1.7885745 ,  2.8498516 ],
        [-2.8790243 , -0.8633581 , -1.3185526 , ..., -0.717562  ,
         -0.34523863,  5.5737166 ],
        [-3.0382226 , -1.6483521 , -2.8082068 , ..., -0.6817659 ,
          0.8414026 ,  5.481147  ],
        ...,
        [ 0.6043987 , -0.

In [1]:
preds['metrics']

NameError: name 'preds' is not defined