In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [2]:
## The data

In [3]:
from data_utils import prepare_data, convert_to_dataset

In [4]:

train = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt')
val = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt')
test_VOC = prepare_data('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_VOC.txt')

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

In [5]:
label_list = sorted(list(set([token_data[1] for sentence in train for token_data in sentence if token_data])))
label_map = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [6]:
train_data = convert_to_dataset(train, label_map)
val_data = convert_to_dataset(val, label_map)
test_data = convert_to_dataset(test_VOC, label_map)

datasets = DatasetDict({
    "train": train_data,
    "validation": val_data,
    "test": test_data,
})


In [7]:
## Evaluation

In [8]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [9]:
# model_name = "/ivi/ilps/personal/vprovat/KB/GysBERT"
model_name = "GroNLP/bert-base-dutch-cased"
# model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name,num_labels=len(label_list))

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased

In [10]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

In [11]:
label_list

['B-LOC', 'B-PER', 'B-TIME', 'I-LOC', 'I-PER', 'I-TIME', 'O']

In [12]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to('cuda')
        # forward pass
        outputs = model(**inputs)
#         print('ran the model')
        logits = outputs.get('logits').to('cuda')
#         print('got the logits')
        labels = labels.to(logits.device)
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.2, 0.2, 0.05, 0.2,0.2,0.05,0.1]).to('cuda')) 
        # ['B-LOC', 'B-PER', 'B-TIME', 'I-LOC', 'I-PER', 'I-TIME', 'O']
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [13]:
from transformers import EarlyStoppingCallback, IntervalStrategy

training_args = TrainingArguments(
    output_dir="/ivi/ilps/personal/vprovat/KB/NER_logs_BERTje_recall", #change here
    evaluation_strategy=IntervalStrategy.STEPS, #"steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    save_total_limit = 50,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="recall",
) # todo check dropout

In [20]:
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
igno

In [15]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]


    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

In [16]:
import gc
torch.cuda.empty_cache()
gc.collect()


4034

In [17]:
trainer.train(resume_from_checkpoint=False)


[34m[1mwandb[0m: Currently logged in as: [33mvsprovatorova[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
500,0.1218,0.124091,0.618142,0.66482,0.640632,precision recall f1-score support  LOC 0.61 0.79 0.69 7311  PER 0.64 0.66 0.65 14102  TIME 0.54 0.48 0.51 4755  micro avg 0.62 0.66 0.64 26168  macro avg 0.60 0.64 0.62 26168 weighted avg 0.62 0.66 0.64 26168
1000,0.1198,0.113099,0.658163,0.700971,0.678893,precision recall f1-score support  LOC 0.75 0.78 0.76 7311  PER 0.63 0.71 0.67 14102  TIME 0.59 0.55 0.57 4755  micro avg 0.66 0.70 0.68 26168  macro avg 0.66 0.68 0.67 26168 weighted avg 0.66 0.70 0.68 26168
1500,0.0904,0.108253,0.681083,0.710104,0.695291,precision recall f1-score support  LOC 0.71 0.83 0.76 7311  PER 0.69 0.71 0.70 14102  TIME 0.61 0.54 0.58 4755  micro avg 0.68 0.71 0.70 26168  macro avg 0.67 0.69 0.68 26168 weighted avg 0.68 0.71 0.69 26168
2000,0.088,0.103323,0.717213,0.710333,0.713756,precision recall f1-score support  LOC 0.73 0.82 0.77 7311  PER 0.74 0.70 0.72 14102  TIME 0.62 0.57 0.60 4755  micro avg 0.72 0.71 0.71 26168  macro avg 0.70 0.70 0.70 26168 weighted avg 0.72 0.71 0.71 26168
2500,0.0843,0.105049,0.722551,0.713964,0.718232,precision recall f1-score support  LOC 0.72 0.82 0.76 7311  PER 0.75 0.70 0.72 14102  TIME 0.64 0.60 0.62 4755  micro avg 0.72 0.71 0.72 26168  macro avg 0.70 0.71 0.70 26168 weighted avg 0.72 0.71 0.72 26168
3000,0.0682,0.109432,0.735317,0.71767,0.726387,precision recall f1-score support  LOC 0.77 0.81 0.79 7311  PER 0.74 0.71 0.73 14102  TIME 0.65 0.60 0.62 4755  micro avg 0.74 0.72 0.73 26168  macro avg 0.72 0.71 0.71 26168 weighted avg 0.73 0.72 0.73 26168
3500,0.0727,0.107375,0.711751,0.717327,0.714528,precision recall f1-score support  LOC 0.73 0.83 0.78 7311  PER 0.72 0.70 0.71 14102  TIME 0.64 0.61 0.63 4755  micro avg 0.71 0.72 0.71 26168  macro avg 0.70 0.71 0.70 26168 weighted avg 0.71 0.72 0.71 26168
4000,0.0666,0.109783,0.71908,0.709875,0.714448,precision recall f1-score support  LOC 0.73 0.79 0.76 7311  PER 0.73 0.70 0.72 14102  TIME 0.66 0.61 0.63 4755  micro avg 0.72 0.71 0.71 26168  macro avg 0.71 0.70 0.70 26168 weighted avg 0.72 0.71 0.71 26168
4500,0.0568,0.115946,0.726422,0.721148,0.723776,precision recall f1-score support  LOC 0.74 0.81 0.78 7311  PER 0.74 0.71 0.73 14102  TIME 0.65 0.61 0.63 4755  micro avg 0.73 0.72 0.72 26168  macro avg 0.71 0.71 0.71 26168 weighted avg 0.73 0.72 0.72 26168
5000,0.0533,0.113551,0.720174,0.716486,0.718325,precision recall f1-score support  LOC 0.74 0.80 0.77 7311  PER 0.73 0.71 0.72 14102  TIME 0.64 0.61 0.63 4755  micro avg 0.72 0.72 0.72 26168  macro avg 0.71 0.71 0.71 26168 weighted avg 0.72 0.72 0.72 26168


Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.61      0.79      0.69      7311
         PER       0.64      0.66      0.65     14102
        TIME       0.54      0.48      0.51      4755

   micro avg       0.62      0.66      0.64     26168
   macro avg       0.60      0.64      0.62     26168
weighted avg       0.62      0.66      0.64     26168
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.75      0.78      0.76      7311
         PER       0.63      0.71      0.67     14102
        TIME       0.59      0.55      0.57      4755

   micro avg       0.66      0.70      0.68     26168
   macro avg       0.66      0.68      0.67     26168
weighted avg       0.66      0.70      0.6

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.76      0.80      0.78      7311
         PER       0.73      0.70      0.71     14102
        TIME       0.64      0.59      0.61      4755

   micro avg       0.72      0.71      0.71     26168
   macro avg       0.71      0.70      0.70     26168
weighted avg       0.72      0.71      0.71     26168
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.75      0.81      0.78      7311
         PER       0.74      0.71      0.72     14102
        TIME       0.64      0.61      0.63      4755

   micro avg       0.73      0.72      0.72     26168
   macro avg       0.71      0.71      0.71     26168
weighted avg       0.72      0.72      0.7

TrainOutput(global_step=8000, training_loss=0.07176743483543396, metrics={'train_runtime': 7496.1126, 'train_samples_per_second': 22.22, 'train_steps_per_second': 2.777, 'total_flos': 1.6723748388864e+16, 'train_loss': 0.07176743483543396, 'epoch': 5.76})

In [18]:
trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id

In [19]:
trainer.save_model('/ivi/ilps/personal/vprovat/KB/models/BERTje-NER-v4')