Starting from a 
* pre-trained model with a 
* pre-trained tokenizer

we perform finetuning on a negation task

In [1]:
%load_ext autoreload
%autoreload 2

import os 
import numpy as np
import random
from tqdm import tqdm

from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from torch import device, cuda, version

import apex

import dcc_splitter as splitter
import ner_training as trainer
import pandas as pd

import seaborn
import matplotlib.pyplot as plt

from collections import namedtuple

from transformers import AutoTokenizer, RobertaTokenizer, AutoModelForTokenClassification

In [2]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [3]:
device = device("cuda:0") if cuda.is_available() else device("cpu")

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
device

device(type='cpu')

In [5]:
dcc_dir = None
output_dir = None
skip_file = None
n_splits = 10
random_state = None
base_folder = "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels"
output_folder = "fine_tuned_token_classification"
mod_name = "robbert-v2-dutch-base" # "robbert-v2-dutch-base" # belabBERT_115k # bert-base-dutch
 

args = namedtuple
args.task = "negation" # experiencer, temporality
args.model_path = os.path.join(base_folder, mod_name)
args.model_type = "roberta" # bertje 
args.output_dir = os.path.join(base_folder, output_folder)
args.num_epochs = 2
args.eval_steps = 10 
args.lr = 5e-5
args.batch_size= 16
args.gradient_accumulation_steps=1
args.block_size = 32 # block size determines inclusion 
args.save_model=False
args.bio=True
args.do_eval=False
args.do_write=False
args.bootstrap=False
args.do_print_class_report=False

random.seed(77)

In [6]:
# args.block_size determines how many text snippets are  used for training, see ner_training.py lines 118--141
# obviously this is a code-design flaw that should be mended.
# the dataset loader should  include the id_begin_end in the output

In [7]:
# dcc-splitter for folds
dcc_splitter = splitter.DCCSplitter(dcc_dir, output_dir, skip_file, n_splits, random_state, write_to_file=False)
splits = dcc_splitter.split()

In [8]:
# load NER DCC set
dcc = pd.read_csv("../data/RobBERT/DCC_df.csv", 
                  sep="\t", 
                  skip_blank_lines=True, 
                  engine="python", 
                  encoding="latin-1",
                  on_bad_lines="warn", 
                  keep_default_na=False)

In [9]:
dcc.Negation.value_counts()

O             146395
NotNegated     15713
Negated         3017
Name: Negation, dtype: int64

In [10]:
dcc.loc[dcc.BIO!='O'][['Id', 'Begin', 'End']].apply(lambda x: "_".join(x), axis=1).nunique()

11882

In [11]:
Texts = dcc.groupby('Id').Word.apply(lambda x: " ".join(x))

In [12]:
tag_ids = {'negation':{'B-Negated':0,'B-NotNegated':1,'I-Negated':2,'I-NotNegated':3},
          'temporality':{'B-Recent':0,'B-Historical':1,'B-Hypothetical':2,'I-Recent':3,
                         'I-Historical':4,'I-Hypothetical':5},
          'experiencer':{'B-Patient':0,'B-Other':1,'I-Patient':2,'I-Other':3}}

tag2id = tag_ids[args.task]
tokenizer = AutoTokenizer.from_pretrained(args.model_path)

## Over all document sources

improvement: only output best model based on validation scores

In [13]:
# cycle through folds
scores = []
predlist = []
test_lists = []
loss_history = {}
for idx, fold in tqdm(enumerate(splits)):
    # re-init model for each fold, otherwise it keeps on training the same throughout all folds..
    token_model = AutoModelForTokenClassification.from_pretrained(args.model_path, num_labels = len(tag2id))
    
    train_list, test_list = fold['train'], fold['test']
    
    ## eval is optional (to gauge the best number of steps/epochs)
    eval_list = random.choices(train_list,k=int(len(train_list)/10)) if args.do_eval else []
    eval_dcc = dcc.loc[dcc.Id.isin(eval_list)]
    test_dcc = dcc.loc[dcc.Id.isin(test_list)]
    train_dcc = dcc.loc[(dcc.Id.isin(train_list)) & (~dcc.Id.isin(eval_list))]
    
    test_list = test_dcc.Id.tolist()
    eval_list = eval_dcc.Id.tolist()

    ###
    train_dataset = trainer.TextDatasetFromDataFrame(train_dcc, tokenizer, args) 
    test_dataset = trainer.TextDatasetFromDataFrame(test_dcc, tokenizer, args)
    eval_dataset = trainer.TextDatasetFromDataFrame(eval_dcc, tokenizer, args)
    
    args.do_print_class_report=False
    # Train on all document sources
    trained_model, eval_loss_history = trainer.train_model(model=token_model.to(device), 
                                                            tokenizer=tokenizer, 
                                                            train_dataset=train_dataset, 
                                                            eval_dataset=eval_dataset, 
                                                            tag2id=tag2id,
                                                            device=device, 
                                                            args=args,
                                                            max_grad_norm=1.0,
                                                            amp=False)
    args.do_print_class_report=True
    # Evaluate on all document sources
    f1, prec, rec, preds, truth, test_ids = trainer.eval_model(model=trained_model, 
                                       tokenizer=tokenizer, 
                                       eval_dataset=test_dataset, 
                                       tag2id=tag2id, 
                                       device=device, 
                                       args=args, 
                                       return_pred=True)
    
    loss_history[idx]=eval_loss_history
    
    #test_ids = ["_".join(t) for t in zip(test_dcc.Id, test_dcc.Begin, test_dcc.End)]
    scores.append({'fold': idx, 'f1': f1, 'precision': prec, 'recall': rec})
    predlist.append({'fold': idx, 'prediction': preds, 'truth': truth, 'ids': test_ids})
    test_lists.append(test_ids)
    

0it [00:00, ?it/s]Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [06:29<00:00,  1.29s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [06:37<00:00,  1.32s/it]


Training finished, best model f = 0.000


1it [14:11, 851.36s/it]

F1: 0.895 
              precision    recall  f1-score   support

     Negated       0.86      0.88      0.87       108
  NotNegated       0.87      0.92      0.90       573

   micro avg       0.87      0.92      0.89       681
   macro avg       0.87      0.90      0.89       681
weighted avg       0.87      0.92      0.89       681



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [06:41<00:00,  1.33s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [06:37<00:00,  1.32s/it]


Training finished, best model f = 0.000


2it [28:28, 854.83s/it]

F1: 0.906 
              precision    recall  f1-score   support

     Negated       0.88      0.93      0.90        83
  NotNegated       0.90      0.92      0.91       555

   micro avg       0.89      0.92      0.91       638
   macro avg       0.89      0.92      0.90       638
weighted avg       0.89      0.92      0.91       638



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [06:43<00:00,  1.34s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:53<00:00,  1.17s/it]


Training finished, best model f = 0.000


3it [42:05, 837.29s/it]

F1: 0.908 
              precision    recall  f1-score   support

     Negated       0.95      0.93      0.94       107
  NotNegated       0.88      0.93      0.90       575

   micro avg       0.89      0.93      0.91       682
   macro avg       0.92      0.93      0.92       682
weighted avg       0.89      0.93      0.91       682



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:49<00:00,  1.16s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:49<00:00,  1.16s/it]


Training finished, best model f = 0.000


4it [54:36, 803.51s/it]

F1: 0.894 
              precision    recall  f1-score   support

     Negated       0.86      0.84      0.85        90
  NotNegated       0.88      0.92      0.90       569

   micro avg       0.88      0.91      0.89       659
   macro avg       0.87      0.88      0.88       659
weighted avg       0.88      0.91      0.89       659



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:50<00:00,  1.16s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:43<00:00,  1.14s/it]


Training finished, best model f = 0.000


5it [1:07:03, 783.13s/it]

F1: 0.899 
              precision    recall  f1-score   support

     Negated       0.92      0.89      0.90        90
  NotNegated       0.88      0.92      0.90       566

   micro avg       0.88      0.92      0.90       656
   macro avg       0.90      0.91      0.90       656
weighted avg       0.88      0.92      0.90       656



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:45<00:00,  1.14s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:48<00:00,  1.15s/it]


Training finished, best model f = 0.000


6it [1:19:31, 770.96s/it]

F1: 0.917 
              precision    recall  f1-score   support

     Negated       0.81      0.83      0.82        90
  NotNegated       0.92      0.95      0.93       585

   micro avg       0.90      0.93      0.92       675
   macro avg       0.86      0.89      0.88       675
weighted avg       0.90      0.93      0.92       675



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:50<00:00,  1.16s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:44<00:00,  1.14s/it]


Training finished, best model f = 0.000


7it [1:31:59, 763.61s/it]

F1: 0.915 
              precision    recall  f1-score   support

     Negated       0.89      0.92      0.90       101
  NotNegated       0.91      0.93      0.92       583

   micro avg       0.90      0.93      0.91       684
   macro avg       0.90      0.92      0.91       684
weighted avg       0.90      0.93      0.91       684



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:47<00:00,  1.15s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:42<00:00,  1.13s/it]


Training finished, best model f = 0.000


8it [1:44:22, 756.89s/it]

F1: 0.912 
              precision    recall  f1-score   support

     Negated       0.93      0.86      0.89       102
  NotNegated       0.90      0.93      0.92       579

   micro avg       0.90      0.92      0.91       681
   macro avg       0.91      0.90      0.90       681
weighted avg       0.90      0.92      0.91       681



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:45<00:00,  1.14s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:46<00:00,  1.15s/it]


Training finished, best model f = 0.000


9it [1:56:47, 753.28s/it]

F1: 0.904 
              precision    recall  f1-score   support

     Negated       0.87      0.86      0.87        88
  NotNegated       0.90      0.92      0.91       594

   micro avg       0.89      0.91      0.90       682
   macro avg       0.88      0.89      0.89       682
weighted avg       0.89      0.91      0.90       682



Some weights of the model checkpoint at /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /media/koekiem

RobertaConfig {
  "_name_or_path": "/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "v

Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:43<00:00,  1.14s/it]
Epoch 2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [05:46<00:00,  1.15s/it]


Training finished, best model f = 0.000


10it [2:09:09, 774.99s/it]

F1: 0.885 
              precision    recall  f1-score   support

     Negated       0.88      0.94      0.91        98
  NotNegated       0.87      0.89      0.88       546

   micro avg       0.87      0.90      0.89       644
   macro avg       0.88      0.91      0.90       644
weighted avg       0.87      0.90      0.89       644






In [14]:
predlist_prep = []
robbert_col_name = 'robbert_'+str(args.block_size)+'_'+str(args.num_epochs)

for foldnum, foldres in enumerate(predlist):
    ids = foldres['ids']
    for prs, trs, ids in zip(foldres['prediction'], foldres['truth'], foldres['ids']):
        for pr, tr, _id in zip(prs, trs, ids):
            tmp_dict={}
            if len(pr)==len(tr)==0:
                tmp_dict['fold'] = foldnum
                tmp_dict['entity_id'] = _id
                tmp_dict['label'] = "n/a"
                tmp_dict[robbert_col_name] = "n/a"
            elif len(pr)>0:
                tmp_dict['fold'] = foldnum
                tmp_dict['entity_id'] = _id
                tmp_dict['label'] = tr
                tmp_dict[robbert_col_name] = pr                
            else:
                raise ValueError("predictions are empty while truth is not")    
            predlist_prep.append(tmp_dict)
predlist_df = pd.DataFrame(predlist_prep)
predlist_df['bio_label'] = predlist_df['label'].str.replace(r"([BI])\-[A-z]+", "\\1", 
                                                        regex=True, case=True).str.strip()
predlist_df['bio_robbert'] = predlist_df[robbert_col_name].str.replace(r"([BI])\-[A-z]+", "\\1", 
                                                        regex=True, case=True).str.strip()

bio_pred = predlist_df[['entity_id', 'bio_label', 'bio_robbert']]
predlist_df = predlist_df.loc[predlist_df.bio_label=='B']
predlist_df.drop(['bio_label', 'bio_robbert', 'fold'], axis=1, inplace=True)

In [15]:
predlist_df['label'] = predlist_df.label.map({'B-NotNegated': 'not negated', 'B-Negated': 'negated'})
predlist_df[robbert_col_name] = predlist_df[robbert_col_name].map({'B-NotNegated': 'not negated', 'B-Negated': 'negated'})

In [16]:
predlist_df

Unnamed: 0,entity_id,label,robbert_32_2
0,DL1667_66_74,not negated,not negated
1,DL1866_88_99,not negated,not negated
2,DL1614_5_12,negated,negated
4,DL1614_41_51,negated,negated
6,DL1614_62_71,negated,negated
...,...,...,...
10951,SP1751_18_33,not negated,not negated
10952,SP1751_60_78,not negated,not negated
10954,SP2037_49_73,negated,negated
10957,SP1321_23_30,negated,negated


In [17]:
test_scores = pd.DataFrame(scores)

In [18]:
if args.do_eval:
    dfl = []
    for i in range(2):
        df = pd.DataFrame(loss_history[i])
        df['fold']=i
        dfl.append(df)
    eval_history = pd.concat(dfl).reset_index()
    eval_history['step'] = eval_history['step'].astype(int)
    eval_history['fold'] = eval_history['fold'].astype(int)

    fig, ax = plt.subplots(ncols=3, figsize=(18,5))
    seaborn.lineplot(data=eval_history, x='step', y='f1', hue='epoch', ax=ax[0])
    seaborn.lineplot(data=eval_history, x='step', y='recall', hue='epoch', ax=ax[1])
    seaborn.lineplot(data=eval_history, x='step', y='precision', hue='epoch', ax=ax[2])

In [19]:
# micro-averaged scores
test_scores

Unnamed: 0,fold,f1,precision,recall
0,0,0.894775,0.872905,0.917768
1,1,0.905573,0.894495,0.916928
2,2,0.908178,0.889045,0.928152
3,3,0.894188,0.878477,0.91047
4,4,0.899178,0.881406,0.917683
5,5,0.917458,0.904899,0.93037
6,6,0.91474,0.904286,0.925439
7,7,0.912255,0.901146,0.923642
8,8,0.903693,0.892704,0.914956
9,9,0.885145,0.873112,0.897516


In [20]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [21]:
# B-NotNegated, B-Negated
labmap = {'B-NotNegated': False, 'B-Negated': True, 'I-NotNegated': False, 'I-Negated': True}

manual_scores = []
confusion_matrix = []
for i in range(len(predlist)):
    # accuracy over all documents, flattened
    _predlist = [(labmap[t], 'B' if 'B-' in t else 'I') for l in predlist[i]['prediction'] for t in l if len(l)>0]
    _truthlist = [(labmap[t], 'B' if 'B-' in t else 'I') for l in predlist[i]['truth'] for t in l if len(l)>0]

    tr_c = []
    pr_c = []
    tr_r = []
    pr_r = []

    b_truth = []
    b_pred = []
    for _t,_p in zip(_truthlist, _predlist):
        if _t[1]==_p[1]=='B':
            tr_c.append(_t[0])
            pr_c.append(_p[0])
        tr_r.append(_t[0])
        pr_r.append(_p[0])

        b_truth.append(_t[1]=='B')
        b_pred.append(_p[1]=='B')

    tr_c, pr_c, tr_r, pr_r = np.array(tr_c), np.array(pr_c), np.array(tr_r), np.array(pr_r)
    b_truth, b_pred = np.array(b_truth), np.array(b_pred)

    TN_c = np.sum((pr_c==tr_c) & (pr_c==False))
    TP_c = np.sum((pr_c==tr_c) & (pr_c==True))
    FP_c = np.sum((pr_c!=tr_c) & (pr_c==True))
    FN_c = np.sum((pr_c!=tr_c) & (pr_c==False))

    TN_r = np.sum((pr_r==tr_r) & (pr_r==False))
    TP_r = np.sum((pr_r==tr_r) & (pr_r==True))
    FP_r = np.sum((pr_r!=tr_r) & (pr_r==True))
    FN_r = np.sum((pr_r!=tr_r) & (pr_r==False))

    TN_b = np.sum((b_pred==b_truth) & (b_pred==False))
    TP_b = np.sum((b_pred==b_truth) & (b_pred==True))
    FP_b = np.sum((b_pred!=b_truth) & (b_pred==True))
    FN_b = np.sum((b_pred!=b_truth) & (b_pred==False))


    # micro
    f1 = f1_score(tr_r, pr_r, average='micro')
    precision = precision_score(tr_r, pr_r, average='micro')
    recall = recall_score(tr_r, pr_r, average='micro')
    manual_scores.append({'list': 'raw', 
                          'fold': i, 
                          'focus': 'micro', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # macro
    f1 = f1_score(tr_r, pr_r, average='macro')
    precision = precision_score(tr_r, pr_r, average='macro')
    recall = recall_score(tr_r, pr_r, average='macro')
    manual_scores.append({'list': 'raw', 
                          'fold': i, 
                          'focus': 'macro', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # Negated
    f1 = f1_score(tr_r, pr_r)
    precision = precision_score(tr_r, pr_r)
    recall = recall_score(tr_r, pr_r)
    manual_scores.append({'list': 'raw', 
                          'fold': i, 
                          'focus': 'negated', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # NotNegated
    f1 = f1_score(~tr_r, ~pr_r)
    precision = precision_score(~tr_r, ~pr_r)
    recall = recall_score(~tr_r, ~pr_r)
    manual_scores.append({'list': 'raw',
                          'fold': i,
                          'focus': 'notnegated', 
                          'f1': f1,
                          'precision': precision,
                          'recall': recall})
    
    confusion_matrix.append({'list': 'raw', 'fold': i, 'TN': TN_r, 'TP': TP_r, 'FN': FN_r, 'FP': FP_r})    
    ######################################
    # micro
    f1 = f1_score(tr_c, pr_c, average='micro')
    precision = precision_score(tr_c, pr_c, average='micro')
    recall = recall_score(tr_c, pr_c, average='micro')
    manual_scores.append({'list': 'clean', 
                          'fold': i, 
                          'focus': 'micro', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # macro
    f1 = f1_score(tr_c, pr_c, average='macro')
    precision = precision_score(tr_c, pr_c, average='macro')
    recall = recall_score(tr_c, pr_c, average='macro')
    manual_scores.append({'list': 'clean', 
                          'fold': i, 
                          'focus': 'macro', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # Negated
    f1 = f1_score(tr_c, pr_c)
    precision = precision_score(tr_c, pr_c)
    recall = recall_score(tr_c, pr_c)
    manual_scores.append({'list': 'clean', 
                          'fold': i, 
                          'focus': 'negated', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # NotNegated
    f1 = f1_score(~tr_c, ~pr_c)
    precision = precision_score(~tr_c, ~pr_c)
    recall = recall_score(~tr_c, ~pr_c)
    manual_scores.append({'list': 'clean',
                          'fold': i,
                          'focus': 'notnegated', 
                          'f1': f1,
                          'precision': precision,
                          'recall': recall})
    
    confusion_matrix.append({'list': 'clean', 'fold': i, 'TN': TN_c, 'TP': TP_c, 'FN': FN_c, 'FP': FP_c})    
    ######################################
    # micro
    f1 = f1_score(b_truth, b_pred, average='micro')
    precision = precision_score(b_truth, b_pred, average='micro')
    recall = recall_score(b_truth, b_pred, average='micro')
    manual_scores.append({'list': 'B_I', 
                          'fold': i, 
                          'focus': 'micro', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # macro
    f1 = f1_score(b_truth, b_pred, average='macro')
    precision = precision_score(b_truth, b_pred, average='macro')
    recall = recall_score(b_truth, b_pred, average='macro')
    manual_scores.append({'list': 'B_I', 
                          'fold': i, 
                          'focus': 'macro', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # Negated
    f1 = f1_score(b_truth, b_pred)
    precision = precision_score(b_truth, b_pred)
    recall = recall_score(b_truth, b_pred)
    manual_scores.append({'list': 'B_I', 
                          'fold': i, 
                          'focus': 'negated', 
                          'f1': f1, 
                          'precision': precision, 
                          'recall': recall})

    # NotNegated
    f1 = f1_score(~b_truth, ~b_pred)
    precision = precision_score(~b_truth, ~b_pred)
    recall = recall_score(~b_truth, ~b_pred)
    manual_scores.append({'list': 'B_I',
                          'fold': i,
                          'focus': 'notnegated', 
                          'f1': f1,
                          'precision': precision,
                          'recall': recall})
    
    confusion_matrix.append({'list': 'B_I', 'fold': i, 'TN': TN_b, 'TP': TP_b, 'FN': FN_b, 'FP': FP_b})    
    
manual_scores_df = pd.DataFrame(data=manual_scores)
confusion_matrix_df = pd.DataFrame(data=confusion_matrix)

In [22]:
manual_scores_df.groupby(['list', 'focus']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,fold,f1,precision,recall
list,focus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B_I,macro,4.5,0.950734,0.951559,0.950061
B_I,micro,4.5,0.952914,0.952914,0.952914
B_I,negated,4.5,0.961062,0.957913,0.964303
B_I,notnegated,4.5,0.940406,0.945204,0.935819
clean,macro,4.5,0.963929,0.971918,0.956746
clean,micro,4.5,0.982372,0.982372,0.982372
clean,negated,4.5,0.938151,0.957107,0.920764
clean,notnegated,4.5,0.989706,0.986728,0.992728
raw,macro,4.5,0.964603,0.969298,0.960343
raw,micro,4.5,0.98228,0.98228,0.98228


## Append to other results 

In [23]:
merged_result_file = '../results/merged_results_new.csv.gz'
results = pd.read_csv(merged_result_file)

In [24]:
entities = results.entity_id.str.replace(r"\_[0-9]+\_[0-9]+", "").unique()
len(entities)

  entities = results.entity_id.str.replace(r"\_[0-9]+\_[0-9]+", "").unique()


5365

In [25]:
sanity_check = predlist_df[['entity_id', 'label']].set_index('entity_id').join(results[['entity_id', 'label']].set_index('entity_id'),
                                                                how='inner',rsuffix='_or')

(sanity_check['label'] == sanity_check['label_or']).sum()==sanity_check.shape[0]

True

In [26]:
predlist_df.set_index('entity_id', inplace=True)
results.set_index('entity_id', inplace=True)

In [27]:
predlist_df[[robbert_col_name]]

Unnamed: 0_level_0,robbert_32_2
entity_id,Unnamed: 1_level_1
DL1667_66_74,not negated
DL1866_88_99,not negated
DL1614_5_12,negated
DL1614_41_51,negated
DL1614_62_71,negated
...,...
SP1751_18_33,not negated
SP1751_60_78,not negated
SP2037_49_73,negated
SP1321_23_30,negated


In [28]:
total_results = results.join(predlist_df[[robbert_col_name]], how='left')

In [32]:
total_results.to_csv("../results/merged_results_new.csv.gz", index=True, compression='gzip')

In [33]:
robberts = [c for c in total_results.columns if 'robbert' in c]
unanimous = total_results.dropna()[['label', 'bilstm_cv', 'rule_based']+robberts].apply(lambda x: x[0]==x[1]==x[2]==x[3]==x[4], 
                                                                         axis=1)

In [34]:
sum(unanimous)/total_results.loc[total_results[robbert_col_name].isna()==False].shape[0]

0.9362907031618688

In [39]:
def number_of_dissenters(x):
    return int(x[0] != x[1])+\
           int(x[0] != x[2])+\
           int(x[0] != x[3])+\
           int(x[0] != x[4])+\
           int(x[0] != x[5])+\
           int(x[0] != x[6])

In [40]:
dissenters = total_results.dropna()[['label','bilstm', 'bilstm_cv','rule_based']+robberts]\
                            .apply(number_of_dissenters, axis=1)

total_results.dissenters = np.nan
total_results.loc[dissenters.index, 'dissenters'] = dissenters.astype(int)

In [37]:
# 80*2*10/60 hours for 2 epochs and block size 512
# 16*2*10/60 hours for 2 epochs and block size 128
# 6*2*10/60 hours for 2 epochs and block size 32