In [1]:
# !pip install --upgrade --upgrade-strategy only-if-needed evaluate seqeval pytorch-crf

In [2]:
# seed everything for reproductibility
import random
random.seed(0)
import numpy as np
np.random.seed(0)
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x78b73622d570>

In [3]:
import json
from IPython.display import display, HTML
import pandas as pd
pd.options.display.float_format = "{:,.3f}".format

import torch.nn as nn

from datasets.features.features import Features, Sequence, ClassLabel, Value
from datasets import load_dataset, concatenate_datasets
import transformers
from transformers import DataCollatorForTokenClassification, DefaultDataCollator
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import pipeline
import evaluate

from torchcrf import CRF


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

# I. Dataset Analysis

In [5]:
ds = load_dataset("DrBenchmark/QUAERO", "medline", trust_remote_code=True)
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'tokens', 'ner_tags'],
        num_rows: 833
    })
    validation: Dataset({
        features: ['id', 'document_id', 'tokens', 'ner_tags'],
        num_rows: 832
    })
    test: Dataset({
        features: ['id', 'document_id', 'tokens', 'ner_tags'],
        num_rows: 833
    })
})

In [6]:
label_names = ds['train'].features["ner_tags"].feature.names
tags_names = [l_.replace('B-', '') for l_ in label_names if 'B-' in l_]
id2label= {id_: tag_ for id_,tag_ in enumerate(label_names)}
label2id = {tag_: id_ for id_,tag_ in enumerate(label_names)}

In [7]:
ds_train = ds['train']
ds_val = ds['validation'].shuffle(seed=0)
ds_test = ds['test']
ds_train = concatenate_datasets([ds_train, ds_val.select(range(700))])
ds_val = ds_val.select(range(700, 832))

In [8]:
def compute_tags_statistics(ds_):
  nb_examples_per_label = {}
  for label in label_names:
    if label.startswith('B-'):
      nb_examples_per_label[label.replace('B-', '')] = ds_.filter(lambda elem: label in [label_names[tag] for tag in elem['ner_tags']]).num_rows
  ds = pd.Series(nb_examples_per_label)
  #ds = ds/ds.sum()
  return ds

In [9]:
pd.concat((compute_tags_statistics(ds_train).rename('train'),
           compute_tags_statistics(ds_val).rename('val'),
           compute_tags_statistics(ds_test).rename('test')), axis=1)

Unnamed: 0,train,val,test
LIVB,367,39,204
PROC,683,60,381
ANAT,375,33,195
DEVI,60,5,25
CHEM,348,21,191
GEOG,62,8,47
PHYS,183,22,96
PHEN,73,4,35
DISO,897,78,513
OBJC,45,2,28


# II. Modelisation

## 1. Data

### 1.1. Prepare Data

In [10]:
# domain_model = 'quinten-datalab/AliBERT-7GB'
# domain_model = 'Dr-BERT/DrBERT-7GB'
domain_model = 'numind/NuNER-multilingual-v0.1'
general_model = 'camembert/camembert-base' # only for tokenization comparison purpose

In [11]:
tokenizer = AutoTokenizer.from_pretrained(domain_model)
general_tokenizer = AutoTokenizer.from_pretrained(general_model)



In [12]:
# ds_train = ds_train.map(lambda elem: {'#tokens': len(tokenizer([tok for tok, tag in zip(elem['tokens'], elem['ner_tags']) if tag != 0],
#                                                                is_split_into_words=True)['input_ids'])})
# ds_train = ds_train.map(lambda elem: {'#tokens_general': len(general_tokenizer([tok for tok, tag in zip(elem['tokens'], elem['ner_tags']) if tag != 0],
#                                                                                is_split_into_words=True)['input_ids'])})

Domain-specific tokenizer preserve domain words (no oversplitting)

In [13]:
example = 'leucodystrophie métachromatique'
print(" | ".join(general_tokenizer.convert_ids_to_tokens(general_tokenizer(example)['input_ids'])[1:-1]))
print(" | ".join(tokenizer.convert_ids_to_tokens(tokenizer(example)['input_ids'])[1:-1]))

▁le | uc | ody | s | trophi | e | ▁méta | ch | ro | matique
le | ##uco | ##dy | ##stro | ##phie | mét | ##ach | ##roma | ##tique


In [14]:
def get_ner_spans(bio_tags_names, tokens):
    '''Remove BIO to re-construct raw ner spans
    
    >> bio_tags_names = ['B-DISO', 'B-DISO','I-DISO','B-ANAT', 'O','B-DISO','I-DISO','O','B-DISO']
    >> tokens = [str(i).zfill(3) for i,_ in enumerate(bio_tags_names)]
    >> get_ner_spans(bio_tags_names, tokens)
    
    {'ANAT': ['tok_003'],
     'DISO': ['tok_000', 'tok_001 tok_002', 'tok_005 tok_006', 'tok_008']}
    '''
    bio_tags_names.append('O')
    tokens.append('')

    ner_tags_names = [t_.replace('B-','').replace('I-','') for t_ in bio_tags_names]

    ner_spans = {}

    for _tag_name in tags_names:
        a1 = np.equal(ner_tags_names, _tag_name)
        a1_rshifted = np.roll(a1, 1)
        starts = a1 & ~a1_rshifted 
        ends = ~a1 & a1_rshifted
        for start, end in zip(np.nonzero(starts)[0], np.nonzero(ends)[0]):
            _spans = bio_tags_names[start: end]
            _tokens = tokens[start:end]
            sub_tokens = []
            sub_spans = []
            for _tag, _token in zip(_spans, _tokens):
                if _tag.startswith('B-'):
                    sub_tokens.append([_token])
                else:
                    sub_tokens[-1].append(_token)
            ner_spans.setdefault(_tag_name, []).extend([" ".join(l_) for l_ in sub_tokens])
            
    return ner_spans

def align_labels_with_tokens(labels, word_ids, ignore_subwords=False, ignore_loss_label=-100):
    new_labels = []
    spans = {}
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = ignore_loss_label if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(ignore_loss_label)
        else:
            if not ignore_subwords:
                label = labels[word_id]
                # If the label is B-XXX we change it to I-XXX
                # if label % 2 == 1:
                #     label += 1
            else:
                label = ignore_loss_label
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples, ignore_subwords=False, ignore_loss_label=-100):
    '''if ignore_subwords, then all non-begining subwords will have a label of -100. Otherwise, they will get label I-xxx
    '''
    # tokenize words    
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True,
    )
    
    # align labels with tokenizer subwords
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids, ignore_subwords, ignore_loss_label))

    tokenized_inputs["string_tokens"] = [tokenizer.convert_ids_to_tokens(elem) for elem in tokenized_inputs['input_ids']]
    tokenized_inputs["#string_tokens"] = [len(elem) for elem in tokenized_inputs['string_tokens']]
    if ignore_subwords:
      tokenized_inputs["labels_ignore_subtokens"] = new_labels
    else:
      tokenized_inputs["labels"] = new_labels
    
    # get ner spans
    ner_spans = []
    for bio_tags_names, tokens in zip(examples['ner_tags'], examples['tokens']):
        ner_spans.append(get_ner_spans([id2label[t_] for t_ in bio_tags_names], tokens))
    tokenized_inputs['ner_spans'] = ner_spans
    return tokenized_inputs




### 1.2. Compute labels statistics

In [15]:
ds_train = ds_train.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False), batched=True)
ds_val = ds_val.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False), batched=True)
ds_test = ds_test.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False), batched=True)
ds_train = ds_train.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)
ds_val = ds_val.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)
ds_test = ds_test.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)

In [16]:
(ds_train.sort(column_names=["#string_tokens"])[-1]["#string_tokens"],
ds_val.sort(column_names=["#string_tokens"])[-1]["#string_tokens"],
ds_test.sort(column_names=["#string_tokens"])[-1]["#string_tokens"])

(91, 70, 102)

In [17]:
ind_ = 9
print(ds_train[ind_]['tokens'])
print(ds_train[ind_]['ner_tags'])
print(ds_train[ind_]['string_tokens'])
print(ds_train[ind_]['labels'])
print(ds_train[ind_]['labels_ignore_subtokens'])

['Migration', 'et', 'douleur', '.', 'Nécessité', "d'", 'une', 'recherche', 'épidémiologique', '.', '', '']
[13, 0, 17, 0, 0, 0, 0, 3, 4, 0]
['[CLS]', 'Migration', 'et', 'do', '##uleur', '.', 'Né', '##ces', '##sit', '##é', 'd', "'", 'une', 'recherche', 'é', '##pid', '##ém', '##iol', '##ogique', '.', '[SEP]']
[-100, 13, 0, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, -100]
[-100, 13, 0, 17, -100, 0, 0, -100, -100, -100, 0, -100, 0, 3, 4, -100, -100, -100, -100, 0, -100]


In [18]:
def compute_bio_tags_statistics(ds_, label_column='labels'):
  label_names = ds_.features["ner_tags"].feature.names
  nb_examples_per_label = {}
  for label in label_names:
    nb_examples_per_label[label] = 0
    for elem in ds_:
      nb_examples_per_label[label] += len([tag for tag in elem[label_column] if tag != -100 and label_names[tag] == label])
  ds = pd.Series(nb_examples_per_label)
  #ds = ds/ds.sum()
  return ds

pd.concat((compute_bio_tags_statistics(ds_train, label_column='labels').rename('train'),
           compute_bio_tags_statistics(ds_val, label_column='labels').rename('val'),
           compute_bio_tags_statistics(ds_test, label_column='labels').rename('test'),
           compute_bio_tags_statistics(ds_train, label_column='labels_ignore_subtokens').rename('train (ignore subtokens)'),
           compute_bio_tags_statistics(ds_val, label_column='labels_ignore_subtokens').rename('val (ignore subtokens)'),
           compute_bio_tags_statistics(ds_test, label_column='labels_ignore_subtokens').rename('test (ignore subtokens)')), axis=1)

Unnamed: 0,train,val,test,train (ignore subtokens),val (ignore subtokens),test (ignore subtokens)
O,19393,1661,10705,13139,1084,7246
B-LIVB,1004,127,610,463,51,263
I-LIVB,312,32,130,156,13,58
B-PROC,2391,217,1394,921,77,531
I-PROC,1012,81,568,426,32,226
B-ANAT,1223,93,695,461,38,245
I-ANAT,515,45,270,185,22,107
B-DEVI,160,13,73,62,5,30
I-DEVI,122,13,45,54,5,20
B-CHEM,1791,102,942,521,29,269


## 2. Training

EVALUATION TOOLBOX

In [19]:
seqeval = evaluate.load("seqeval")

def compute_global_ner_metrics(predictions, labels):
  true_predictions = [
      [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)
  ]
  true_labels = [
      [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)
  ]
  results = seqeval.compute(predictions=true_predictions, references=true_labels)
  return results

def get_set_metrics(_predictions, _labels):
  _metrics = compute_global_ner_metrics(_predictions, _labels)
  _number = int(np.sum([_metrics[tag_name]['number'] for tag_name in _metrics if 'overall_' not in tag_name]))
  _metrics['overall'] = {}
  for overall_m in ['overall_precision', 'overall_recall', 'overall_f1']:
    _, _m = overall_m.split('_')
    _metrics["overall"][_m] = _metrics[overall_m]
  _metrics["overall"]["number"] = _number
  for overall_m in ['overall_precision', 'overall_recall', 'overall_f1'] + ['overall_accuracy']:
    del _metrics[overall_m]
  for tag_name in _metrics:
    _metrics[tag_name]['number'] = int(_metrics[tag_name]['number'])
  _metrics['balanced'] = {}
  for _m in list(_metrics['overall'].keys()):
    _metrics["balanced"][_m] = np.mean([_metrics[_t][_m] for _t in _metrics if _t not in ["overall", "balanced"]])
  return _metrics

def get_ner_dataframe_metrics(model_):
  '''Compute true NER F1 score using seqeval
  '''
  tr_predictions , tr_labels, _ = trainer.predict(ds_train_val)
  tr_predictions = np.argmax(tr_predictions, axis=2)

  te_predictions , te_labels, _ = trainer.predict(ds_test)
  te_predictions = np.argmax(te_predictions, axis=2)

  train_metrics = get_set_metrics(tr_predictions, tr_labels)
  test_metrics = get_set_metrics(te_predictions, te_labels)

  train_metrics = pd.DataFrame.from_dict(train_metrics).T
  train_metrics.columns = pd.MultiIndex.from_product([['train+val'], train_metrics.columns])

  test_metrics = pd.DataFrame.from_dict(test_metrics).T
  test_metrics.columns = pd.MultiIndex.from_product([['test'], test_metrics.columns])

  return pd.concat([train_metrics, test_metrics], axis=1)

def compute_ner_metrics(eval_preds):
    logits, labels = eval_preds
    if logits.ndim == 3:
        predictions = np.argmax(logits, axis=-1)
    else: # case model + crf
        predictions = logits.copy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[int(p)] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

### 2.1. Finetuning model

TRANING

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
learning_rate=1e-5
epochs=20

args = TrainingArguments(
    output_dir='/tmp',
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy='epoch',
    logging_steps = 1,
    learning_rate=learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    weight_decay=0.01,
    seed=0,
    data_seed=0,
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True
)

In [21]:
def model_init():
  '''Mandatory to make the run reproductible
  '''
  return AutoModelForTokenClassification.from_pretrained(domain_model, num_labels=len(label_names), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_ner_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at numind/NuNER-multilingual-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at numind/NuNER-multilingual-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.4181,1.075914,0.477949,0.461843,0.469758,0.683259
2,0.9258,0.877179,0.631391,0.526264,0.574054,0.742549
3,0.7468,0.841983,0.584869,0.597621,0.591176,0.735891
4,0.6111,0.802969,0.653107,0.572844,0.610348,0.765694
5,0.504,0.826502,0.602109,0.622398,0.612086,0.762207
6,0.4255,0.863201,0.601338,0.623389,0.612165,0.761256
7,0.3502,0.889573,0.593957,0.623389,0.608317,0.752695
8,0.3011,0.929437,0.621595,0.6333,0.627393,0.768865
9,0.2481,0.945615,0.623762,0.624381,0.624071,0.767914
10,0.2188,0.995443,0.612903,0.640238,0.626272,0.762207


TrainOutput(global_step=2112, training_loss=0.5393862146319766, metrics={'train_runtime': 318.6477, 'train_samples_per_second': 96.219, 'train_steps_per_second': 12.051, 'total_flos': 392513197261014.0, 'train_loss': 0.5393862146319766, 'epoch': 11.0})

In [24]:
# trainer.save_model(f'/content/drive/finetuning_quaero_ner_{domain_model}')

EVALUATION

In [26]:
ds_train_val = concatenate_datasets((ds_train, ds_val))

In [27]:
pd.DataFrame.from_dict(trainer.evaluate(ds_test), orient='index').T

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.007,0.619,0.61,0.615,0.757,1.957,425.701,53.66,11.0


In [29]:
df_metrics = get_ner_dataframe_metrics(trainer)
df_metrics

Unnamed: 0_level_0,train+val,train+val,train+val,train+val,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1,number,precision,recall,f1,number
ANAT,0.866,0.853,0.859,1316.0,0.371,0.38,0.376,695.0
CHEM,0.894,0.94,0.917,1893.0,0.598,0.749,0.665,942.0
DEVI,0.59,0.665,0.625,173.0,0.283,0.178,0.218,73.0
DISO,0.931,0.932,0.932,4384.0,0.708,0.717,0.713,2336.0
GEOG,0.782,0.866,0.822,112.0,0.706,0.741,0.723,81.0
LIVB,0.912,0.918,0.915,1131.0,0.684,0.662,0.673,610.0
OBJC,0.6,0.124,0.205,97.0,0.75,0.044,0.083,68.0
PHEN,0.667,0.01,0.019,203.0,1.0,0.01,0.02,99.0
PHYS,0.68,0.693,0.686,625.0,0.371,0.337,0.353,332.0
PROC,0.89,0.918,0.904,2608.0,0.649,0.577,0.611,1394.0


In [30]:
# df_metrics.to_markdown(floatfmt=".3f")

### 2.2 finetuning Model + CRF

TRAINING

In [20]:
# CRF doesn't accepts labels with values -100
ds_train = ds_train.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False, ignore_loss_label=0), batched=True)
ds_val = ds_val.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False, ignore_loss_label=0), batched=True)
ds_test = ds_test.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False, ignore_loss_label=0), batched=True)
ds_train = ds_train.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)
ds_val = ds_val.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)
ds_test = ds_test.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)

In [21]:
class AutoModelCrfForTokenClassification(nn.Module):
    def __init__(self, num_labels):
        super(AutoModelCrfForTokenClassification, self).__init__()
    
        self.encoder =  AutoModel.from_pretrained(domain_model, num_labels=len(label_names), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)
    
        self.config = self.encoder.config
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        self.crf = CRF(num_tags=num_labels, batch_first=True)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
          Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
          1]``.
        """
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            log_likelihood, tags = self.crf(logits, labels), self.crf.decode(logits)
            loss = 0 - log_likelihood
        else:
            tags = self.crf.decode(logits)
        tags = torch.Tensor(tags)

        output = (tags,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output


In [22]:
data_collator_crf = DataCollatorForTokenClassification(tokenizer=tokenizer, label_pad_token_id=0)

In [24]:
learning_rate=1e-5
epochs=20

args_crf = TrainingArguments(
    output_dir='/tmp',
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy='epoch',
    logging_steps = 1,
    learning_rate=learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    weight_decay=0.01,
    seed=0,
    data_seed=0,
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True
)

In [25]:
def model_crf_init():
    return AutoModelCrfForTokenClassification(len(label_names))

trainer_crf = Trainer(
    model_init=model_crf_init,
    args=args_crf,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator_crf,
    tokenizer=tokenizer,
    compute_metrics=compute_ner_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)



In [30]:
trainer_crf.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,288.6824,200.243103,0.480663,0.517344,0.498329,0.842088
2,178.1168,164.017761,0.589852,0.553023,0.570844,0.859816
3,144.0702,162.735519,0.568392,0.609514,0.588235,0.862607
4,116.9928,153.440659,0.628964,0.589693,0.608696,0.877544
5,96.2169,160.576218,0.592912,0.613479,0.60302,0.874754
6,80.229,172.875412,0.614328,0.620416,0.617357,0.874754
7,67.344,177.156937,0.616667,0.623389,0.62001,0.876067
8,57.1713,177.341797,0.617476,0.630327,0.623835,0.880007
9,46.9443,182.875031,0.630285,0.635282,0.632774,0.879186
10,40.4684,196.191849,0.604303,0.640238,0.621752,0.878365


TrainOutput(global_step=2880, training_loss=83.53738216824001, metrics={'train_runtime': 591.7201, 'train_samples_per_second': 51.815, 'train_steps_per_second': 6.49, 'total_flos': 0.0, 'train_loss': 83.53738216824001, 'epoch': 15.0})

In [31]:
pd.DataFrame.from_dict(trainer_crf.evaluate(ds_test), orient='index').T

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,241.853,0.62,0.624,0.622,0.868,5.099,163.364,20.592,15.0


In [31]:
# df_metrics = get_ner_dataframe_metrics(trainer_crf) # NOT WORKING WITH MODEL CRF
# df_metrics

### 2.3 finetuning Model + BiLSTM + CRF

TRAINING

In [28]:
# CRF doesn't accepts labels with values -100
ds_train = ds_train.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False, ignore_loss_label=0), batched=True)
ds_val = ds_val.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False, ignore_loss_label=0), batched=True)
ds_test = ds_test.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=False, ignore_loss_label=0), batched=True)
ds_train = ds_train.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)
ds_val = ds_val.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)
ds_test = ds_test.map(lambda elem: tokenize_and_align_labels(elem, ignore_subwords=True), batched=True)

Map: 100%|██████████| 132/132 [00:00<00:00, 1101.36 examples/s]
Map: 100%|██████████| 132/132 [00:00<00:00, 1588.78 examples/s]


In [58]:
class AutoModelBiLstmCrfForTokenClassification(nn.Module):
    def __init__(self, num_labels):
        super(AutoModelBiLstmCrfForTokenClassification, self).__init__()
    
        self.encoder =  AutoModel.from_pretrained(domain_model, num_labels=len(label_names), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)
    
        self.config = self.encoder.config
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, dropout=self.config.hidden_dropout_prob, batch_first=True,
                          bidirectional=True)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        self.crf = CRF(num_tags=num_labels, batch_first=True)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
  ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        lstm_output, hc = self.bilstm(sequence_output)
        logits = self.classifier(lstm_output)

        loss = None
        if labels is not None:
            log_likelihood, tags = self.crf(logits, labels), self.crf.decode(logits)
            loss = 0 - log_likelihood
        else:
            tags = self.crf.decode(logits)
        tags = torch.Tensor(tags)

        output = (tags,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output


In [59]:
data_collator_bilstm_crf = DataCollatorForTokenClassification(tokenizer=tokenizer, label_pad_token_id=0)

In [60]:
learning_rate=2e-5
epochs=20

args_bilstm_crf = TrainingArguments(
    output_dir='/tmp',
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy='epoch',
    logging_steps = 1,
    learning_rate=learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    weight_decay=0.01,
    seed=0,
    data_seed=0,
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True
)

In [61]:
def model_bilstm_crf_init():
    return AutoModelBiLstmCrfForTokenClassification(len(label_names))

trainer_bilstm_crf = Trainer(
    model_init=model_bilstm_crf_init,
    args=args_bilstm_crf,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator_bilstm_crf,
    tokenizer=tokenizer,
    compute_metrics=compute_ner_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)



In [62]:
trainer_bilstm_crf.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,362.5781,267.435913,0.309091,0.252725,0.278081,0.793336
2,229.8138,201.198227,0.457753,0.488603,0.472675,0.839954
3,172.3894,187.655716,0.509305,0.515362,0.512315,0.852265
4,136.5337,168.650467,0.598712,0.553023,0.574961,0.869009
5,107.5308,188.087982,0.548723,0.574827,0.561471,0.863592
6,83.5902,181.787582,0.577423,0.572844,0.575124,0.866546
7,67.2535,186.208298,0.573614,0.594648,0.583942,0.870158
8,55.0789,198.313004,0.571568,0.581764,0.576621,0.867859
9,44.2126,202.399261,0.593598,0.606541,0.6,0.87065
10,37.8942,212.305817,0.598425,0.602577,0.600494,0.872456


TrainOutput(global_step=3072, training_loss=89.09236637751262, metrics={'train_runtime': 649.1458, 'train_samples_per_second': 47.231, 'train_steps_per_second': 5.915, 'total_flos': 0.0, 'train_loss': 89.09236637751262, 'epoch': 16.0})

In [63]:
pd.DataFrame.from_dict(trainer_bilstm_crf.evaluate(ds_test), orient='index').T

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,257.13,0.63,0.622,0.626,0.87,5.203,160.096,20.18,16.0


In [32]:
# df_metrics = get_ner_dataframe_metrics(trainer_crf) # NOT WORKING WITH MODEL CRF
# df_metrics

### Analyze Erros (PHEN and OBJC)

In [30]:
ds_test_low_f1_score = ds_test.filter(lambda elem: elem['ner_spans']['PHEN'] or elem['ner_spans']['OBJC'])

In [31]:
ds_train_low_f1_score = ds_train.filter(lambda elem: elem['ner_spans']['PHEN'] or elem['ner_spans']['OBJC'])

In [32]:
tagger = pipeline(task='ner', aggregation_strategy='first', model=trainer.model, tokenizer=tokenizer)

test_example_index = 5

text = " ".join(ds_test_low_f1_score[test_example_index]['tokens'])


predictions = tagger(text)
for pred in predictions:
    pred['type'] = 'pred'
    del pred["start"]
    del pred["end"]
    del pred["score"]

for entity_group, entity_values in ds_test_low_f1_score[test_example_index]['ner_spans'].items():
    if entity_values:
        for entity_value in entity_values:
            predictions.append({"entity_group": entity_group,
                                "word": entity_value,
                                "type": "ground truth"})
                            


print('text: ' + text)
pd.DataFrame(predictions)
# pd.DataFrame(predictions).to_markdown()

text: A propos de l' évolution et de la situation épidémiologique actuelle de la lèpre à la Guadeloupe : analyse des données du fichier central du département .  


Unnamed: 0,entity_group,word,type
0,DISO,lèpre,pred
1,GEOG,Guadeloupe,pred
2,DISO,lèpre,ground truth
3,GEOG,Guadeloupe,ground truth
4,PHEN,épidémiologique,ground truth
5,PROC,analyse,ground truth


### Model Generation with Few-Shot prompting

#### Load model embedder

In [33]:
# load model used to compute embeddings of train text

encoder_model = 'Dr-BERT/DrBERT-7GB'
model = transformers.AutoModel.from_pretrained(
    encoder_model,
    output_hidden_states=True,
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(encoder_model)

Some weights of CamembertModel were not initialized from the model checkpoint at Dr-BERT/DrBERT-7GB and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 1. Encode all training examples to get text embeddings as vectors

In [34]:
def get_embeddings(examples):
    '''Compute text embeddings to later build an indexer for retrieval purpose
    '''
    encoded_input = tokenizer(
        " ".join(examples['tokens']),
        return_tensors='pt',
        padding=True,
        truncation=True
    )
    output = model(**encoded_input)

    # Get the CLS token embedding
    emb = output.hidden_states[-1][0][0]
    
    examples['embeddings'] = emb
    return examples

In [35]:
ds_train = ds_train.map(get_embeddings)
ds_train_low_f1_score = ds_train_low_f1_score.map(get_embeddings)

Map: 100%|██████████| 1533/1533 [01:21<00:00, 18.79 examples/s]
Map: 100%|██████████| 113/113 [00:05<00:00, 19.69 examples/s]


#### 2. Create an index using all training vectors

In [36]:
# Compute indexer for easy and fast retrieving of closest examples
ds_train = ds_train.add_faiss_index(column='embeddings')
ds_train_low_f1_score = ds_train_low_f1_score.add_faiss_index(column='embeddings')

100%|██████████| 2/2 [00:00<00:00, 123.19it/s]
100%|██████████| 1/1 [00:00<00:00, 999.12it/s]


#### 3. Use a text from tesing set and query the indexer to get the K (K=5) closest training examples

In [37]:
# Give a try to indexer
text = " ".join(ds_test_low_f1_score[test_example_index]["tokens"])
query_text_embeddings = get_embeddings(ds_test[test_example_index])['embeddings'].detach().numpy()
# query_text_embeddings = get_embeddings({"tokens":'peut-on utiliser des tests sérologiques pour détecter le VIH?'.split()})['embeddings'].detach().numpy()
scores, retrieved_examples = ds_train_low_f1_score.get_nearest_examples('embeddings', query_text_embeddings, k=5)

In [38]:
# displated similar training examples
print("QUERY:")
print(text)
print('\nSIMILAR EXAMPLES:')
for ex in retrieved_examples["tokens"]:
    print(" ".join(ex))

QUERY:
A propos de l' évolution et de la situation épidémiologique actuelle de la lèpre à la Guadeloupe : analyse des données du fichier central du département .  

SIMILAR EXAMPLES:
Purification de la réductase déhydroascorbique du chou-fleur ( Brassica oleracea L .)  
Résultat des essais de provocation bronchodynamiques réalisés avec la bradykinine et la sérotonine  
La synchronisation de l' oestrus chez les ruminants  
Examen electromyographique du muscle strie chez le cobaye apres irradation au Co - 60 .  
Modifications du spectre d' amplitudes de l' électroencéphalogramme provoquées par l' activité mentale et détectées par le miniordinateur .  


#### 4. Format the prompt with the K training examples and their respective labels + the output schema expected + the testing text

In [39]:
# Define output schema
schema = """{
    "Anatomy": [],
    "Chemical and Drugs": [],
    "Devices": [],
    "Disorders": [],
    "Geographic Areas": [],
    "Living Beings": [],
    "Objects": [],
    "Phenomena": [],
    "Physiology": [],
    "Procedures": []
}"""

In [40]:
# Formate examples
names_mapping = {'ANAT': "Anatomy",
 'CHEM': "Chemical and Drugs",
 'DEVI': "Devices",
 'DISO': "Disorders",
 'GEOG': "Geographic Areas",
 'LIVB': "Living Beings",
 'OBJC': "Objects",
 'PHEN': "Phenomena",
 'PHYS': "Physiology",
 'PROC': "Procedures"
}

formatted_examples = []
for tokens, ner_spans in zip(retrieved_examples['tokens'], retrieved_examples['ner_spans']):
    formatted_example = {}
    formatted_example['Text'] = " ".join(tokens)
    formatted_example['Schema'] = {}
    for entity_group, entity_values in ner_spans.items():
        if entity_values:
            formatted_example['Schema'][names_mapping[entity_group]] = entity_values
        else:
            formatted_example['Schema'][names_mapping[entity_group]] = []
    formatted_examples.append(json.dumps(formatted_example, indent=4, ensure_ascii=False))

#### 5. Execute Generative Model with previous prompt as input

[link](https://huggingface.co/numind/NuExtract)

In [41]:
# load generative model
gen_model = AutoModelForCausalLM.from_pretrained("numind/NuExtract", torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract", trust_remote_code=True)

gen_model.to("cuda")

gen_model.eval()

`flash-attention` package not found, consider installing for better performance: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN2at4_ops5zeros4callEN3c108ArrayRefINS2_6SymIntEEENS2_8optionalINS2_10ScalarTypeEEENS6_INS2_6LayoutEEENS6_INS2_6DeviceEEENS6_IbEE.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.42s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=3206

In [42]:
# define generation ner function
def predict_NuExtract(gen_model, tokenizer, text, schema, example=["", "", ""]):
    
    schema = json.dumps(json.loads(schema), indent=4, ensure_ascii=False)
    input_llm =  "<|input|>\n### Template:\n" +  schema + "\n"
    for i in example:
      if i != "":
          input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4, ensure_ascii=False)+"\n"
    
    input_llm +=  "### Text:\n"+text +"\n<|output|>\n"
    #print(input_llm)
    input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length=4000).to("cuda")

    output = tokenizer.decode(gen_model.generate(**input_ids)[0], skip_special_tokens=True)
    return output.split("<|output|>")[1].split("<|end-output|>")[0]

In [43]:
# launch using test text and formatted_examples
output = predict_NuExtract(gen_model, tokenizer, text, schema, example=formatted_examples)

You are not running the flash-attention implementation, expect numerical differences.


#### 6. get a formatted output of labels found inside the query

In [44]:
print(f'query: {text}\n')
print(json.dumps(json.loads(output), indent=4, ensure_ascii=False))

query: A propos de l' évolution et de la situation épidémiologique actuelle de la lèpre à la Guadeloupe : analyse des données du fichier central du département .  

{
    "Anatomy": [],
    "Chemical and Drugs": [],
    "Devices": [],
    "Disorders": [
        "lépre"
    ],
    "Geographic Areas": [
        "Guadeloupe"
    ],
    "Living Beings": [],
    "Objects": [],
    "Phenomena": [],
    "Physiology": [],
    "Procedures": []
}


In [47]:
for ex_ in formatted_examples:
    print(ex_)

{
    "Text": "Purification de la réductase déhydroascorbique du chou-fleur ( Brassica oleracea L .)  ",
    "Schema": {
        "Anatomy": [],
        "Chemical and Drugs": [],
        "Devices": [],
        "Disorders": [],
        "Geographic Areas": [],
        "Living Beings": [
            "Brassica oleracea"
        ],
        "Objects": [
            "chou-fleur"
        ],
        "Phenomena": [],
        "Physiology": [
            "réductase déhydroascorbique"
        ],
        "Procedures": []
    }
}
{
    "Text": "Résultat des essais de provocation bronchodynamiques réalisés avec la bradykinine et la sérotonine  ",
    "Schema": {
        "Anatomy": [],
        "Chemical and Drugs": [
            "bradykinine",
            "sérotonine"
        ],
        "Devices": [],
        "Disorders": [],
        "Geographic Areas": [],
        "Living Beings": [],
        "Objects": [],
        "Phenomena": [
            "provocation"
        ],
        "Physiology": [],
        "P

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 5.83k/5.83k [00:00<00:00, 26.9MB/s]
Downloading readme: 100%|██████████| 9.70k/9.70k [00:00<00:00, 38.3MB/s]
Downloading data: 1.14MB [00:00, 23.1MB/s]                  
Downloading data: 200kB [00:00, 9.02MB/s]                    
Downloading data: 206kB [00:00, 10.1MB/s]                    
Generating train split: 100%|██████████| 5433/5433 [00:00<00:00, 6273.32 examples/s]
Generating validation split: 100%|██████████| 924/924 [00:00<00:00, 5914.73 examples/s]
Generating test split: 100%|██████████| 941/941 [00:00<00:00, 5875.01 examples/s]
