In [1]:
import os

import numpy as np
import torch
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from transformers.trainer_utils import get_last_checkpoint

In [2]:
## The data

In [3]:
from data_utils import prepare_data, convert_to_dataset

In [4]:

# Parse the three annotated splits in order (train, validation, VOC test);
# each call goes through the project helper `prepare_data`.
train, val, test_VOC = (
    prepare_data(split_path)
    for split_path in (
        '/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt',
        '/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt',
        '/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_VOC.txt',
    )
)

  0%|          | 0/8040 [00:00<?, ?it/s]

  0%|          | 0/2150 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

In [5]:
# Collect every distinct tag found in the training split (element 1 of each
# token record), skipping falsy records, and sort for a stable ordering.
label_list = sorted(
    {token_data[1] for sentence in train for token_data in sentence if token_data}
)
# A tag's integer id is its position in the sorted list.
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}
# `label_map` holds the same tag -> id mapping as `label2id`.
label_map = dict(label2id)

In [6]:
# Show the sorted tag inventory; a tag's position in this list is its integer id.
label_list

['B-LOC', 'B-PER', 'B-TIME', 'I-LOC', 'I-PER', 'I-TIME', 'O']

In [6]:
# Convert each parsed split with the shared tag -> id mapping
# (`convert_to_dataset` is a project helper from data_utils).
train_data, val_data, test_data = (
    convert_to_dataset(split, label_map) for split in (train, val, test_VOC)
)

# Bundle the splits under the keys used later for training and evaluation.
datasets = DatasetDict(
    {"train": train_data, "validation": val_data, "test": test_data}
)


In [7]:
## Evaluation metrics and label alignment

In [8]:
def compute_metrics(eval_prediction):
    """Score predictions against gold tags with seqeval.

    Args:
        eval_prediction: a pair that unpacks into ``(predictions, labels)``.
            The predicted class is the argmax over axis 2 of ``predictions``
            (presumably per-token logits of shape (batch, seq, num_labels) --
            confirm against the caller). ``labels`` holds gold label ids, with
            -100 marking positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and a text
        "classification_report", computed over the non-ignored positions.
    """
    scores, gold_ids = eval_prediction
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the ignore index (-100).
        kept_pairs = [
            (pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100
        ]
        true_predictions.append([label_list[pred] for pred, _ in kept_pairs])
        true_labels.append([label_list[gold] for _, gold in kept_pairs])

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Only the first sub-token of each word receives the word's tag; special
    tokens (word id ``None``) and continuation sub-tokens receive -100, the
    value ``compute_metrics`` filters out.

    No padding is applied here. With ``batched=True`` a ``padding=True`` call
    would pad every example to the longest sequence in the whole map batch,
    inflating the stored dataset and every training batch drawn from it.
    Padding is left to the data collator, which pads per training batch.

    Args:
        examples: a batch with "tokens" (list of word lists) and "ner_tags"
            (list of per-word label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry holding one
        label-id list per example, the same length as its token sequence.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries that word's tag.
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [9]:
# Checkpoint to fine-tune (a local GysBERT directory); alternatives kept below.
model_name = "/ivi/ilps/personal/vprovat/KB/GysBERT"
# model_name = "bert-base-multilingual-cased"
# model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
# Pass the label maps at load time so the model config, and therefore every
# checkpoint saved during training, carries the real tag names instead of
# generic LABEL_i placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at /ivi/ilps/personal/vprovat/KB/GysBERT were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not 

In [10]:
# Tokenize every split and attach aligned labels; batched=True hands the
# function a batch of examples per call.
tokenized_datasets = datasets.map(function=tokenize_and_align_labels, batched=True)




Map:   0%|          | 0/11104 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

In [12]:
from transformers import EarlyStoppingCallback, IntervalStrategy

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/ivi/ilps/personal/vprovat/KB/NER_logs_GysBERT",  # change here
    # Optimisation settings.
    num_train_epochs=15,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 500-step cadence; log every 100 steps.
    evaluation_strategy=IntervalStrategy.STEPS,  # "steps"
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    save_total_limit=15,
    # After training, reload the checkpoint ranked best by the "f1" metric.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [28]:
# dir(training_args)

In [13]:
def data_collator(data):
    """Stack a list of tokenized examples into one right-padded batch.

    Args:
        data: examples, each providing "input_ids", "attention_mask" and
            "labels" as integer sequences.

    Returns:
        dict of batch-first tensors under the same three keys. Shorter
        sequences are padded with the tokenizer's pad token id (input_ids),
        0 (attention_mask) and -100 (labels).
    """

    def _pad_field(field, fill_value):
        # One tensor per example, then pad to the longest one in this batch.
        sequences = [torch.tensor(example[field]) for example in data]
        return torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=fill_value
        )

    return {
        "input_ids": _pad_field("input_ids", tokenizer.pad_token_id),
        "attention_mask": _pad_field("attention_mask", 0),
        "labels": _pad_field("labels", -100),
    }


# Stop training once the tracked metric has failed to improve for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # Train on the training split; evaluate on the validation split.
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [17]:
# Resume from the newest checkpoint in output_dir when one exists; otherwise
# train from scratch. Passing resume_from_checkpoint=True unconditionally
# raises when the output directory holds no checkpoint, which breaks a fresh
# Restart & Run All.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

  0%|          | 0/120 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
14500,0.0136,0.144989,0.679414,0.697818,0.688493,precision recall f1-score support  LOC 0.74 0.80 0.77 7403  PER 0.67 0.67 0.67 14386  TIME 0.61 0.63 0.62 4791  micro avg 0.68 0.70 0.69 26580  macro avg 0.67 0.70 0.69 26580 weighted avg 0.68 0.70 0.69 26580


Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.74      0.80      0.77      7403
         PER       0.67      0.67      0.67     14386
        TIME       0.61      0.63      0.62      4791

   micro avg       0.68      0.70      0.69     26580
   macro avg       0.67      0.70      0.69     26580
weighted avg       0.68      0.70      0.69     26580
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=14500, training_loss=0.0004465937861080827, metrics={'train_runtime': 293.1253, 'train_samples_per_second': 568.221, 'train_steps_per_second': 71.028, 'total_flos': 3.0311793954816e+16, 'train_loss': 0.0004465937861080827, 'epoch': 10.45})

In [18]:
# Record the tag <-> id mappings on the trained model's config so they are
# written out together with the model.
ner_config = trainer.model.config
ner_config.label2id = label2id
ner_config.id2label = id2label

In [19]:
# Write the final fine-tuned NER model to its release directory.
trainer.save_model(output_dir='/ivi/ilps/personal/vprovat/KB/models/GysBERT-NER-v2')