In [6]:
# Kernel: Python (transformers)

In [1]:
import glob

In [2]:
# !pip install seqeval
# !pip install transformers
# !pip install datasets
# import sys  
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [3]:
import sys
# !{sys.executable} -m pip install transformers==4.28.0
# !{sys.executable} -m pip install seqeval

In [4]:
# import sys
# !{sys.executable} -m pip install wandb

In [5]:
import transformers

transformers.__version__

'4.28.0'

In [6]:
! wandb login

[34m[1mwandb[0m: Currently logged in as: [33mvsprovatorova[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# import wandb
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="kb-ner",
    
# #     # track hyperparameters and run metadata
# #     config={
# #     "learning_rate": 0.02,
# #     "architecture": "CNN",
# #     "dataset": "CIFAR-100",
# #     "epochs": 10,
# #     }
# )

In [8]:
for filename in glob.glob('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/*'):
    print(filename)

/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/transkribus
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_VOC.txt
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_NHA.txt
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_SA.txt
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_RHC.txt
/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/vocab.txt


In [9]:
def read_conll_file(file_path):
    with open(file_path, "rb") as f:
        content = f.read().decode(errors='replace').strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

In [10]:
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence if token_data]
        ner_tags = [label_map[token_data[1]] for token_data in sentence if token_data]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)

In [11]:

train = read_conll_file('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/train.txt')
val = read_conll_file('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/validation.txt')
test_VOC = read_conll_file('/ivi/ilps/personal/vprovat/KB/data/AITrainingset/Data/test_VOC.txt')

In [12]:
label_list = sorted(list(set([token_data[1] for sentence in train for token_data in sentence if token_data])))
label_map = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [13]:
label_map

{'B-LOC': 0,
 'B-PER': 1,
 'B-TIME': 2,
 'I-LOC': 3,
 'I-PER': 4,
 'I-TIME': 5,
 'O': 6}

In [14]:
train_data = convert_to_dataset(train, label_map)
val_data = convert_to_dataset(val, label_map)
test_data = convert_to_dataset(test_VOC, label_map)

In [15]:
# model_name = "/ivi/ilps/personal/vprovat/KB/GysBERT"
model_name = "bert-base-multilingual-cased"
# model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name,num_labels=len(label_list))
# model.num_labels = len(label_list)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [16]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [17]:
datasets = DatasetDict({
    "train": train_data,
    "validation": val_data,
    "test": test_data,
})


In [18]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)


training_args = TrainingArguments(
    output_dir="/ivi/ilps/personal/vprovat/KB/NER_logs",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

Map:   0%|          | 0/8040 [00:00<?, ? examples/s]

Map:   0%|          | 0/2150 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [19]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]


    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [21]:
trainer.train(resume_from_checkpoint = False)

[34m[1mwandb[0m: Currently logged in as: [33mvsprovatorova[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
500,0.1383,0.116527,0.630705,0.571337,0.599555,precision recall f1-score support  LOC 0.72 0.61 0.66 6323  PER 0.61 0.56 0.59 11586  TIME 0.56 0.54 0.55 4029  micro avg 0.63 0.57 0.60 21938  macro avg 0.63 0.57 0.60 21938 weighted avg 0.63 0.57 0.60 21938
1000,0.1085,0.109948,0.617807,0.625946,0.62185,precision recall f1-score support  LOC 0.66 0.73 0.69 6323  PER 0.61 0.60 0.60 11586  TIME 0.56 0.55 0.55 4029  micro avg 0.62 0.63 0.62 21938  macro avg 0.61 0.63 0.62 21938 weighted avg 0.62 0.63 0.62 21938
1500,0.0865,0.10266,0.643369,0.659176,0.651176,precision recall f1-score support  LOC 0.67 0.76 0.71 6323  PER 0.64 0.63 0.63 11586  TIME 0.61 0.60 0.60 4029  micro avg 0.64 0.66 0.65 21938  macro avg 0.64 0.66 0.65 21938 weighted avg 0.64 0.66 0.65 21938
2000,0.0836,0.086926,0.719388,0.658264,0.68747,precision recall f1-score support  LOC 0.77 0.73 0.75 6323  PER 0.71 0.64 0.67 11586  TIME 0.66 0.61 0.64 4029  micro avg 0.72 0.66 0.69 21938  macro avg 0.72 0.66 0.69 21938 weighted avg 0.72 0.66 0.69 21938
2500,0.0639,0.091851,0.701999,0.683472,0.692611,precision recall f1-score support  LOC 0.74 0.76 0.75 6323  PER 0.69 0.66 0.67 11586  TIME 0.66 0.63 0.65 4029  micro avg 0.70 0.68 0.69 21938  macro avg 0.70 0.68 0.69 21938 weighted avg 0.70 0.68 0.69 21938
3000,0.0667,0.089597,0.704582,0.682742,0.69349,precision recall f1-score support  LOC 0.72 0.76 0.74 6323  PER 0.71 0.66 0.68 11586  TIME 0.67 0.63 0.65 4029  micro avg 0.70 0.68 0.69 21938  macro avg 0.70 0.68 0.69 21938 weighted avg 0.70 0.68 0.69 21938
3500,0.0542,0.093565,0.693991,0.693865,0.693928,precision recall f1-score support  LOC 0.72 0.77 0.74 6323  PER 0.69 0.67 0.68 11586  TIME 0.65 0.64 0.65 4029  micro avg 0.69 0.69 0.69 21938  macro avg 0.69 0.69 0.69 21938 weighted avg 0.69 0.69 0.69 21938
4000,0.0559,0.092818,0.703551,0.690081,0.696751,precision recall f1-score support  LOC 0.74 0.77 0.76 6323  PER 0.69 0.67 0.68 11586  TIME 0.66 0.62 0.64 4029  micro avg 0.70 0.69 0.70 21938  macro avg 0.70 0.69 0.69 21938 weighted avg 0.70 0.69 0.70 21938
4500,0.0431,0.100574,0.704125,0.690902,0.697451,precision recall f1-score support  LOC 0.73 0.78 0.76 6323  PER 0.70 0.66 0.68 11586  TIME 0.66 0.63 0.64 4029  micro avg 0.70 0.69 0.70 21938  macro avg 0.70 0.69 0.69 21938 weighted avg 0.70 0.69 0.70 21938
5000,0.0456,0.099842,0.71468,0.685295,0.699679,precision recall f1-score support  LOC 0.73 0.78 0.75 6323  PER 0.73 0.66 0.69 11586  TIME 0.66 0.62 0.64 4029  micro avg 0.71 0.69 0.70 21938  macro avg 0.70 0.69 0.69 21938 weighted avg 0.71 0.69 0.70 21938


Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.72      0.61      0.66      6323
         PER       0.61      0.56      0.59     11586
        TIME       0.56      0.54      0.55      4029

   micro avg       0.63      0.57      0.60     21938
   macro avg       0.63      0.57      0.60     21938
weighted avg       0.63      0.57      0.60     21938
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.66      0.73      0.69      6323
         PER       0.61      0.60      0.60     11586
        TIME       0.56      0.55      0.55      4029

   micro avg       0.62      0.63      0.62     21938
   macro avg       0.61      0.63      0.62     21938
weighted avg       0.62      0.63      0.6

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.73      0.77      0.75      6323
         PER       0.67      0.67      0.67     11586
        TIME       0.63      0.63      0.63      4029

   micro avg       0.68      0.69      0.69     21938
   macro avg       0.68      0.69      0.68     21938
weighted avg       0.68      0.69      0.69     21938
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.75      0.77      0.76      6323
         PER       0.70      0.65      0.68     11586
        TIME       0.63      0.63      0.63      4029

   micro avg       0.70      0.68      0.69     21938
   macro avg       0.69      0.69      0.69     21938
weighted avg       0.70      0.68      0.6

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.75      0.78      0.77      6323
         PER       0.68      0.65      0.66     11586
        TIME       0.63      0.63      0.63      4029

   micro avg       0.69      0.68      0.69     21938
   macro avg       0.69      0.69      0.69     21938
weighted avg       0.69      0.68      0.69     21938
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.76      0.77      0.77      6323
         PER       0.68      0.65      0.67     11586
        TIME       0.63      0.63      0.63      4029

   micro avg       0.69      0.68      0.69     21938
   macro avg       0.69      0.69      0.69     21938
weighted avg       0.69      0.68      0.6

TrainOutput(global_step=15075, training_loss=0.040053038440928924, metrics={'train_runtime': 13317.2331, 'train_samples_per_second': 9.056, 'train_steps_per_second': 1.132, 'total_flos': 3.15138133702656e+16, 'train_loss': 0.040053038440928924, 'epoch': 15.0})

In [22]:
trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id

In [23]:
id2label

{0: 'B-LOC',
 1: 'B-PER',
 2: 'B-TIME',
 3: 'I-LOC',
 4: 'I-PER',
 5: 'I-TIME',
 6: 'O'}

In [24]:
sentence = "Wilhelmina van Oosten woont in Den Haag, en ik woon in Bussum"


tokenized_input = tokenizer(sentence, return_tensors="pt").to(model.device)

print(tokenized_input)
outputs = model(**tokenized_input)

# print(outputs)
# print( outputs.logits.argmax(-1))
predicted_labels = outputs.logits.argmax(-1)[0]
print(predicted_labels)


named_entities = [(tokenizer.decode([token]), 
                   id2label[int(label)]
                  ) 
                  for token, label in zip(tokenized_input["input_ids"][0], predicted_labels)]


print("Named Entities - Example 1:", named_entities)



{'input_ids': tensor([[  101, 96074, 10145, 85174, 12796, 18999, 10106, 10235, 30724,   117,
         10110, 47458, 12796, 10263, 10106, 29485, 31417,   102]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}
tensor([6, 6, 6, 4, 6, 6, 6, 6, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6], device='cuda:0')
Named Entities - Example 1: [('[CLS]', 'O'), ('Wilhelmina', 'O'), ('van', 'O'), ('Oosten', 'I-PER'), ('wo', 'O'), ('##ont', 'O'), ('in', 'O'), ('Den', 'O'), ('Haag', 'B-LOC'), (',', 'O'), ('en', 'O'), ('ik', 'O'), ('wo', 'O'), ('##on', 'O'), ('in', 'O'), ('Bus', 'O'), ('##sum', 'O'), ('[SEP]', 'O')]


In [25]:
len(datasets['train'])

8040

In [26]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_auto_class',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_convert_head_mask_to_5d',
 '_create_repo',
 '_expand_inputs_for_generation',
 '_extract_past_from_model_output',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_from_config',
 '_get_backward_hooks',
 '_get_decoder_start_token_id',
 '_get_files_timestamps',
 '_get_logits_processor',
 '_get_logits_warper',
 '_get_name',
 '_get_resized_embeddings',
 '_get_resized_lm_head',
 '_get_stopping_criteria',
 '_hook_rss_memory_p

In [27]:
dir(trainer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_sm_patterns_to_gitignore',
 '_gather_and_numpify',
 '_get_collator_with_removed_columns',
 '_get_eval_sampler',
 '_get_learning_rate',
 '_get_output_dir',
 '_get_train_sampler',
 '_globalstep_last_logged',
 '_hp_search_setup',
 '_inner_training_loop',
 '_load_best_model',
 '_load_from_checkpoint',
 '_load_optimizer_and_scheduler',
 '_load_rng_state',
 '_loggers_initialized',
 '_maybe_log_save_evaluate',
 '_memory_tracker',
 '_move_model_to_device',
 '_nested_gather',
 '_pad_across_processes',
 '_prepare_input',
 '_prepare_inputs',
 '_push_from_checkpoint',
 '_remove_unused_columns',
 '_report_to_hp_search',
 '_rotat

In [28]:
model.label_map = id2label

In [29]:
id2label

{0: 'B-LOC',
 1: 'B-PER',
 2: 'B-TIME',
 3: 'I-LOC',
 4: 'I-PER',
 5: 'I-TIME',
 6: 'O'}

In [30]:
trainer.save_model('/ivi/ilps/personal/vprovat/KB/models/BERT-multi-NER-v1')

In [3]:
from transformers import pipeline


tokenizer = AutoTokenizer.from_pretrained("/ivi/ilps/personal/vprovat/KB/models/BERT-multi-NER-v1")
model = AutoModelForTokenClassification.from_pretrained("/ivi/ilps/personal/vprovat/KB/models/BERT-multi-NER-v1")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Ik ben Maurits van Bussum en ik woon in Zandvoort aan zee sinds 1987"
ner_results = nlp(example)

res =[post_process(item) for item in ner_results]
res

NameError: name 'AutoTokenizer' is not defined