In [15]:
import logging
import os
import sys
import json

import datasets
import numpy as np
from datasets import ClassLabel, load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    DistilBertConfig,
    AutoModelForTokenClassification,
    DistilBertForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    HfArgumentParser,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    set_seed,
)



In [2]:
# load preprocessed data
raw_datasets = datasets.load_from_disk('chifir_hf/')


In [3]:
# get ner info
column_names = ['id', 'tokens', 'ner_tags']
text_column_name = "tokens"
label_column_name = "ner_tags"

padding = False
label_all_tokens = False
max_seq_length = 128


features = raw_datasets["train"].features
label_list = features[label_column_name].feature.names
# No need to convert the labels since they are already ints.
label_to_id = {i: i for i in range(len(label_list))}
num_labels = len(label_list)

# Map that sends B-Xxx label to its I-Xxx counterpart
b_to_i_label = []
for idx, label in enumerate(label_list):
    if label.startswith("B-") and label.replace("B-", "I-") in label_list:
        b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
    else:
        b_to_i_label.append(idx)

In [4]:

def prepare_datasets(raw_datasets):

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            max_length=max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    if label_all_tokens:
                        label_ids.append(b_to_i_label[label_to_id[label[word_idx]]])
                    else:
                        label_ids.append(-100)
                previous_word_idx = word_idx
    
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    train_dataset = raw_datasets["train"]
    eval_dataset = raw_datasets["validation"]
    test_dataset = raw_datasets["test"]
    
    
    train_dataset = train_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=1
    )
    
    eval_dataset = eval_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=1
    )
    
    test_dataset = test_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=1
    )

    return train_dataset, eval_dataset, test_dataset

In [5]:
metric = load_metric("seqeval")

# Evaluation
return_entity_level_metrics=True

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }


  metric = load_metric("seqeval")


In [6]:
def train_and_validate(tokenizer, model, training_args, train_dataset, eval_dataset):

    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=None)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    train_result = trainer.train(resume_from_checkpoint=None)
    metrics = train_result.metrics

    # val perf
    eval_metrics = trainer.evaluate()

    return eval_metrics, trainer


def tune_hparam_over_val_set(tokenizer, model, train_dataset, eval_dataset, metric='overall_f1'):

    scores, hparam = [], []

    best_score = 0
    best_hparam, best_eval_metrics = None, None

    epochs = [3, 5, 10]
    batch_sizes = [32]
    lrs = [2e-5, 3e-5, 5e-5]

    # # for debug
    # epochs = [1]
    # batch_sizes = [32]
    # lrs = [2e-5]
    
    for epoch in epochs:
        for batch_size in batch_sizes:
            for lr in lrs:
                training_args = TrainingArguments(
                    per_device_train_batch_size=batch_size, 
                    learning_rate=lr, 
                    num_train_epochs=epoch,
                    warmup_ratio=0.1, 
                    output_dir='tmp'
                )

                eval_metrics, _ = train_and_validate(tokenizer, model, training_args, train_dataset, eval_dataset)
                    
                score = eval_metrics[f'eval_{metric}']

                if score > best_score:
                    best_score = score
                    best_hparam = [epoch, batch_size, lr]
                    best_eval_metrics = eval_metrics

    return best_eval_metrics, best_hparam

    

In [7]:
def evaluate_model(tokenizer, model, raw_datasets):
    # everything in one go
    
    train_dataset, eval_dataset, test_dataset = prepare_datasets(raw_datasets)

    best_eval_metrics, best_hparam = tune_hparam_over_val_set(
        tokenizer, model, train_dataset, eval_dataset
    )

    epoch, batch_size, lr = best_hparam

    # retrain w/ best hparam using all train data
    training_args = TrainingArguments(
        per_device_train_batch_size=batch_size, 
        learning_rate=lr, 
        num_train_epochs=epoch,
        warmup_ratio=0.1, 
        output_dir='tmp'
    )

    full_train_dataset = datasets.concatenate_datasets([train_dataset, eval_dataset])
    _, trainer = train_and_validate(tokenizer, model, training_args, full_train_dataset, eval_dataset)

    # test 
    predictions, labels, metrics = trainer.predict(test_dataset, metric_key_prefix="test")

    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return metrics, true_predictions


In [8]:
# load BERT model

def load_bert(model_name, cache_dir='cache'):

    config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        label2id=label_to_id,
        id2label={i: l for l, i in label_to_id.items()}
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_name, config=config, cache_dir=cache_dir)

    return tokenizer, model

## BERT

In [9]:
# bert base

model_name='bert-base-uncased'

tokenizer, model = load_bert(model_name)

test_metrics, pred = evaluate_model(tokenizer, model, raw_datasets)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss


Step,Training Loss


Step,Training Loss
500,0.0031


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0025


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0028


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.001
1000,0.0006


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0015
1000,0.0008


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0029
1000,0.0006


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0054


In [10]:
print('overall f1:', test_metrics['test_overall_f1'])
test_metrics

overall f1: 0.8235294117647058


{'test_loss': 0.03779539838433266,
 'test_ClinicalQuery_precision': 0.25,
 'test_ClinicalQuery_recall': 0.14285714285714285,
 'test_ClinicalQuery_f1': 0.18181818181818182,
 'test_ClinicalQuery_number': 7,
 'test_FungalDescriptor_precision': 0.88,
 'test_FungalDescriptor_recall': 0.88,
 'test_FungalDescriptor_f1': 0.88,
 'test_FungalDescriptor_number': 50,
 'test_Fungus_precision': 1.0,
 'test_Fungus_recall': 1.0,
 'test_Fungus_f1': 1.0,
 'test_Fungus_number': 18,
 'test_Invasiveness_precision': 1.0,
 'test_Invasiveness_recall': 0.3333333333333333,
 'test_Invasiveness_f1': 0.5,
 'test_Invasiveness_number': 6,
 'test_SampleType_precision': 0.6388888888888888,
 'test_SampleType_recall': 0.5348837209302325,
 'test_SampleType_f1': 0.5822784810126581,
 'test_SampleType_number': 43,
 'test_Stain_precision': 0.9705882352941176,
 'test_Stain_recall': 0.9705882352941176,
 'test_Stain_f1': 0.9705882352941176,
 'test_Stain_number': 34,
 'test_negative_precision': 0.9230769230769231,
 'test_negativ

## ClinicalBERT

In [11]:
# ClinicalBERT

model_name='emilyalsentzer/Bio_ClinicalBERT'

tokenizer, model = load_bert(model_name)

test_metrics, pred = evaluate_model(tokenizer, model, raw_datasets)

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3346 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss


Step,Training Loss


Step,Training Loss
500,0.0029


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0019


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0027


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0009
1000,0.0009


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0012
1000,0.0004


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0023
1000,0.0007


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0055
1000,0.001


In [12]:
print('overall f1:', test_metrics['test_overall_f1'])
test_metrics

overall f1: 0.8238341968911918


{'test_loss': 0.044489480555057526,
 'test_ClinicalQuery_precision': 0.6666666666666666,
 'test_ClinicalQuery_recall': 0.2857142857142857,
 'test_ClinicalQuery_f1': 0.4,
 'test_ClinicalQuery_number': 7,
 'test_FungalDescriptor_precision': 0.8653846153846154,
 'test_FungalDescriptor_recall': 0.9,
 'test_FungalDescriptor_f1': 0.8823529411764707,
 'test_FungalDescriptor_number': 50,
 'test_Fungus_precision': 0.9473684210526315,
 'test_Fungus_recall': 1.0,
 'test_Fungus_f1': 0.972972972972973,
 'test_Fungus_number': 18,
 'test_Invasiveness_precision': 1.0,
 'test_Invasiveness_recall': 0.3333333333333333,
 'test_Invasiveness_f1': 0.5,
 'test_Invasiveness_number': 6,
 'test_SampleType_precision': 0.6666666666666666,
 'test_SampleType_recall': 0.46511627906976744,
 'test_SampleType_f1': 0.547945205479452,
 'test_SampleType_number': 43,
 'test_Stain_precision': 0.9705882352941176,
 'test_Stain_recall': 0.9705882352941176,
 'test_Stain_f1': 0.9705882352941176,
 'test_Stain_number': 34,
 'test_n

## PubMedBERT

In [13]:
# PubMedBERT

model_name='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

tokenizer, model = load_bert(model_name)

test_metrics, pred = evaluate_model(tokenizer, model, raw_datasets)

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3346 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss


Step,Training Loss


Step,Training Loss
500,0.0096


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0044


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0046


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0011
1000,0.0011


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.002
1000,0.0006


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0035
1000,0.0011


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss
500,0.0059
1000,0.0012


In [14]:
print('overall f1:', test_metrics['test_overall_f1'])
test_metrics

overall f1: 0.8391959798994976


{'test_loss': 0.03437098115682602,
 'test_ClinicalQuery_precision': 0.375,
 'test_ClinicalQuery_recall': 0.42857142857142855,
 'test_ClinicalQuery_f1': 0.39999999999999997,
 'test_ClinicalQuery_number': 7,
 'test_FungalDescriptor_precision': 0.9574468085106383,
 'test_FungalDescriptor_recall': 0.9,
 'test_FungalDescriptor_f1': 0.9278350515463918,
 'test_FungalDescriptor_number': 50,
 'test_Fungus_precision': 0.9473684210526315,
 'test_Fungus_recall': 1.0,
 'test_Fungus_f1': 0.972972972972973,
 'test_Fungus_number': 18,
 'test_Invasiveness_precision': 1.0,
 'test_Invasiveness_recall': 0.5,
 'test_Invasiveness_f1': 0.6666666666666666,
 'test_Invasiveness_number': 6,
 'test_SampleType_precision': 0.6111111111111112,
 'test_SampleType_recall': 0.5116279069767442,
 'test_SampleType_f1': 0.5569620253164557,
 'test_SampleType_number': 43,
 'test_Stain_precision': 0.9705882352941176,
 'test_Stain_recall': 0.9705882352941176,
 'test_Stain_f1': 0.9705882352941176,
 'test_Stain_number': 34,
 'tes

In [19]:
with open("test_results.json", "w") as out:
    json.dump(test_metrics, out, indent=4)

with open("test_predictions.txt", "w") as writer:
    for p in pred:
        writer.write(" ".join(p) + "\n")