# compare different BERT models for the tissue and cell type NER task
Fine tune existing bioNER models for biomedical tissue and cell type prediction

In [1]:
from datasets import Dataset, Features, Sequence, Value, ClassLabel
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support
from iob_functions import *

import random

random.seed(6002)

In [2]:
BASE_DIR = "../data/"

training_f = process_tab_delim_iob(BASE_DIR + 'tags/fulltext_iob/fulltext_tissues_train.iob')
training_a = process_tab_delim_iob(BASE_DIR + 'tags/abstract_iob/abstract_tissues_train.iob')
training = {'sentences': training_f['sentences'] + training_a['sentences'], 'tags': training_f['tags'] + training_a['tags']}

valid_f = process_tab_delim_iob(BASE_DIR + 'tags/fulltext_iob/fulltext_tissues_validation.iob')
valid_a = process_tab_delim_iob(BASE_DIR + 'tags/abstract_iob/abstract_tissues_validation.iob')
validation = {'sentences': valid_f['sentences'] + valid_a['sentences'], 'tags': valid_f['tags'] + valid_a['tags']}

test_f = process_tab_delim_iob(BASE_DIR + 'tags/fulltext_iob/fulltext_tissues_test.iob')
test_a = process_tab_delim_iob(BASE_DIR + 'tags/abstract_iob/abstract_tissues_test.iob')
test = {'sentences': test_f['sentences'] + test_a['sentences'], 'tags': test_f['tags'] + test_a['tags']}

In [3]:
features = Features({"tokens": Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
                     "tags": Sequence(feature=ClassLabel(names=["O", "B-CELL_TYPE", "I-CELL_TYPE", "B-TISSUE", "I-TISSUE"]))})

In [4]:
training_ds = Dataset.from_dict({"tokens": training['sentences'], "tags": training['tags']}, features=features)
validation_ds = Dataset.from_dict({"tokens": validation['sentences'], "tags": validation['tags']}, features=features)
test_ds = Dataset.from_dict({"tokens": test['sentences'], "tags": test['tags']}, features=features)

In [5]:
all_tags = training_ds.features["tags"].feature
tag_list = training_ds.features["tags"].feature.names
id2tag = {idx: tag for idx, tag in enumerate(all_tags.names)}
tag2id = {tag: idx for idx, tag in enumerate(all_tags.names)}

In [6]:
# get the list of models to run
m_names = ['bert-base-uncased', 'google/electra-base-discriminator',
           'dmis-lab/biobert-base-cased-v1.2', 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12',
           'kamalkraj/bioelectra-base-discriminator-pubmed',
           'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract']
MAX_LENGTH = 256

In [7]:
# from https://huggingface.co/docs/transformers/tasks/token_classification
def tokenize_and_align_labels(data, tknzr, max_length=50):
    tokenized_inputs = tknzr(data['tokens'], truncation=True, is_split_into_words=True, max_length=max_length)

    labels = []
    for i, label in enumerate(data['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
seqeval = evaluate.load("seqeval")

def flatten(l):
    return [item for sublist in l for item in sublist]
    
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [tag_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tag_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    f1res = precision_recall_fscore_support(flatten(true_labels), flatten(true_predictions), labels=all_tags.names)

    df = list(zip(all_tags.names, f1res[2], f1res[0], f1res[1]))
    df = pd.DataFrame(df, columns = ['Level', 'F1-Score', 'Precision', 'Recall'])   
    print(df)

    return results

In [9]:
test_metrics = dict()

for m in m_names:
    print(m)
    tokenizer = AutoTokenizer.from_pretrained(m)
    # dynamically pad sentences to longest length in batch for efficiency
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    train_tokenized = training_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs={'tknzr': tokenizer, 'max_length': MAX_LENGTH})
    val_tokenized = validation_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs={'tknzr': tokenizer, 'max_length': MAX_LENGTH})
    test_tokenized = test_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs={'tknzr': tokenizer, 'max_length': MAX_LENGTH})

    model = AutoModelForTokenClassification.from_pretrained(
        m, num_labels=5, id2label=id2tag, label2id=tag2id
    )

    training_args = TrainingArguments(
        output_dir="model/" + m,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    test_preds = trainer.predict(test_tokenized)
    test_metrics[m] = test_preds.metrics


bert-base-uncased


Map:   0%|          | 0/20596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3499 [00:00<?, ? examples/s]

Map:   0%|          | 0/5753 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Cell Type,Tissue,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0169,0.011733,"{'precision': 0.6870026525198939, 'recall': 0.7617647058823529, 'f1': 0.7224546722454672, 'number': 340}","{'precision': 0.7286324786324786, 'recall': 0.8197115384615384, 'f1': 0.7714932126696833, 'number': 416}",0.710059,0.793651,0.749532,0.995312
2,0.0101,0.011885,"{'precision': 0.723463687150838, 'recall': 0.7617647058823529, 'f1': 0.7421203438395416, 'number': 340}","{'precision': 0.8055555555555556, 'recall': 0.8365384615384616, 'f1': 0.8207547169811321, 'number': 416}",0.768354,0.80291,0.785252,0.996181
3,0.005,0.012458,"{'precision': 0.7378917378917379, 'recall': 0.7617647058823529, 'f1': 0.7496382054992764, 'number': 340}","{'precision': 0.8211764705882353, 'recall': 0.8389423076923077, 'f1': 0.8299643281807373, 'number': 416}",0.783505,0.804233,0.793734,0.99634


         Level  F1-Score  Precision    Recall
0            O  0.998034   0.998535  0.997532
1  B-CELL_TYPE  0.767164   0.769461  0.764881
2  I-CELL_TYPE  0.807095   0.764706  0.854460
3     B-TISSUE  0.814469   0.789593  0.840964
4     I-TISSUE  0.772947   0.672269  0.909091




         Level  F1-Score  Precision    Recall
0            O  0.998431   0.998475  0.998388
1  B-CELL_TYPE  0.790087   0.774286  0.806548
2  I-CELL_TYPE  0.844125   0.862745  0.826291
3     B-TISSUE  0.846246   0.837264  0.855422
4     I-TISSUE  0.804734   0.839506  0.772727




         Level  F1-Score  Precision    Recall
0            O  0.998487   0.998364  0.998611
1  B-CELL_TYPE  0.795888   0.785507  0.806548
2  I-CELL_TYPE  0.837905   0.893617  0.788732
3     B-TISSUE  0.859206   0.858173  0.860241
4     I-TISSUE  0.807018   0.831325  0.784091




         Level  F1-Score  Precision    Recall
0            O  0.996751   0.998090  0.995414
1  B-CELL_TYPE  0.824847   0.776854  0.879161
2  I-CELL_TYPE  0.860260   0.820614  0.903930
3     B-TISSUE  0.860465   0.850150  0.871034
4     I-TISSUE  0.636842   0.550000  0.756250
google/electra-base-discriminator


Map:   0%|          | 0/20596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3499 [00:00<?, ? examples/s]

Map:   0%|          | 0/5753 [00:00<?, ? examples/s]

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Cell Type,Tissue,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0173,0.011833,"{'precision': 0.7254335260115607, 'recall': 0.7382352941176471, 'f1': 0.7317784256559767, 'number': 340}","{'precision': 0.7349896480331263, 'recall': 0.8533653846153846, 'f1': 0.789766407119021, 'number': 416}",0.731001,0.801587,0.764669,0.995397
2,0.011,0.012211,"{'precision': 0.6971279373368147, 'recall': 0.7852941176470588, 'f1': 0.7385892116182573, 'number': 340}","{'precision': 0.822429906542056, 'recall': 0.8461538461538461, 'f1': 0.8341232227488151, 'number': 416}",0.763255,0.818783,0.790045,0.996058
3,0.0065,0.011799,"{'precision': 0.6939313984168866, 'recall': 0.7735294117647059, 'f1': 0.7315716272600835, 'number': 340}","{'precision': 0.8136363636363636, 'recall': 0.8605769230769231, 'f1': 0.836448598130841, 'number': 416}",0.758242,0.821429,0.788571,0.996009


         Level  F1-Score  Precision    Recall
0            O  0.998059   0.998412  0.997706
1  B-CELL_TYPE  0.744395   0.747748  0.741071
2  I-CELL_TYPE  0.813559   0.840000  0.788732
3     B-TISSUE  0.826577   0.775899  0.884337
4     I-TISSUE  0.806283   0.747573  0.875000




         Level  F1-Score  Precision    Recall
0            O  0.998350   0.998610  0.998090
1  B-CELL_TYPE  0.771186   0.733871  0.812500
2  I-CELL_TYPE  0.838710   0.823529  0.854460
3     B-TISSUE  0.850356   0.838407  0.862651
4     I-TISSUE  0.876543   0.959459  0.806818




         Level  F1-Score  Precision    Recall
0            O  0.998338   0.998573  0.998103
1  B-CELL_TYPE  0.759207   0.724324  0.797619
2  I-CELL_TYPE  0.834146   0.868020  0.802817
3     B-TISSUE  0.857814   0.837156  0.879518
4     I-TISSUE  0.857143   0.862069  0.852273




         Level  F1-Score  Precision    Recall
0            O  0.997033   0.997600  0.996468
1  B-CELL_TYPE  0.825694   0.793725  0.860347
2  I-CELL_TYPE  0.875682   0.874728  0.876638
3     B-TISSUE  0.868902   0.862765  0.875128
4     I-TISSUE  0.700000   0.661111  0.743750
dmis-lab/biobert-base-cased-v1.2


Map:   0%|          | 0/20596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3499 [00:00<?, ? examples/s]

Map:   0%|          | 0/5753 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Cell Type,Tissue,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0159,0.010158,"{'precision': 0.7450424929178471, 'recall': 0.775811209439528, 'f1': 0.7601156069364162, 'number': 339}","{'precision': 0.7591397849462366, 'recall': 0.8485576923076923, 'f1': 0.8013620885357549, 'number': 416}",0.753056,0.815894,0.783217,0.996081
2,0.0092,0.011037,"{'precision': 0.6649616368286445, 'recall': 0.7669616519174042, 'f1': 0.7123287671232877, 'number': 339}","{'precision': 0.8087557603686636, 'recall': 0.84375, 'f1': 0.8258823529411765, 'number': 416}",0.740606,0.809272,0.773418,0.995922
3,0.0054,0.011515,"{'precision': 0.6857142857142857, 'recall': 0.7787610619469026, 'f1': 0.729281767955801, 'number': 339}","{'precision': 0.8175519630484989, 'recall': 0.8509615384615384, 'f1': 0.8339222614840989, 'number': 416}",0.755501,0.818543,0.78576,0.996253


         Level  F1-Score  Precision    Recall
0            O  0.998411   0.998746  0.998077
1  B-CELL_TYPE  0.792846   0.791667  0.794030
2  I-CELL_TYPE  0.845411   0.870647  0.821596
3     B-TISSUE  0.840779   0.801310  0.884337
4     I-TISSUE  0.787879   0.709091  0.886364




         Level  F1-Score  Precision    Recall
0            O  0.998331   0.998585  0.998077
1  B-CELL_TYPE  0.743590   0.711172  0.779104
2  I-CELL_TYPE  0.836879   0.842857  0.830986
3     B-TISSUE  0.852768   0.834101  0.872289
4     I-TISSUE  0.863905   0.901235  0.829545




         Level  F1-Score  Precision    Recall
0            O  0.998499   0.998722  0.998276
1  B-CELL_TYPE  0.765957   0.729730  0.805970
2  I-CELL_TYPE  0.845411   0.870647  0.821596
3     B-TISSUE  0.865248   0.849188  0.881928
4     I-TISSUE  0.843931   0.858824  0.829545




         Level  F1-Score  Precision    Recall
0            O  0.997079   0.998435  0.995727
1  B-CELL_TYPE  0.836401   0.790593  0.887844
2  I-CELL_TYPE  0.864453   0.862106  0.866812
3     B-TISSUE  0.868255   0.826852  0.914023
4     I-TISSUE  0.730864   0.604082  0.925000
bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12


Map:   0%|          | 0/20596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3499 [00:00<?, ? examples/s]

Map:   0%|          | 0/5753 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Cell Type,Tissue,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0145,0.010856,"{'precision': 0.6486486486486487, 'recall': 0.7764705882352941, 'f1': 0.7068273092369479, 'number': 340}","{'precision': 0.7680525164113785, 'recall': 0.84375, 'f1': 0.8041237113402061, 'number': 416}",0.711806,0.813492,0.759259,0.99541
2,0.0088,0.011042,"{'precision': 0.6537530266343826, 'recall': 0.7941176470588235, 'f1': 0.7171314741035856, 'number': 340}","{'precision': 0.8167053364269141, 'recall': 0.8461538461538461, 'f1': 0.8311688311688311, 'number': 416}",0.736967,0.822751,0.7775,0.995961
3,0.0045,0.011515,"{'precision': 0.7150684931506849, 'recall': 0.7676470588235295, 'f1': 0.7404255319148936, 'number': 340}","{'precision': 0.8177676537585421, 'recall': 0.8629807692307693, 'f1': 0.8397660818713449, 'number': 416}",0.771144,0.820106,0.794872,0.996413


         Level  F1-Score  Precision    Recall
0            O  0.998126   0.999068  0.997185
1  B-CELL_TYPE  0.755304   0.719677  0.794643
2  I-CELL_TYPE  0.788501   0.700730  0.901408
3     B-TISSUE  0.846154   0.819413  0.874699
4     I-TISSUE  0.803922   0.706897  0.931818




         Level  F1-Score  Precision    Recall
0            O  0.998350   0.998908  0.997793
1  B-CELL_TYPE  0.762689   0.707379  0.827381
2  I-CELL_TYPE  0.815145   0.775424  0.859155
3     B-TISSUE  0.859524   0.849412  0.869880
4     I-TISSUE  0.886364   0.886364  0.886364




         Level  F1-Score  Precision    Recall
0            O  0.998567   0.998759  0.998376
1  B-CELL_TYPE  0.779562   0.765043  0.794643
2  I-CELL_TYPE  0.838095   0.850242  0.826291
3     B-TISSUE  0.867612   0.851508  0.884337
4     I-TISSUE  0.869565   0.833333  0.909091




         Level  F1-Score  Precision    Recall
0            O  0.996756   0.998912  0.994608
1  B-CELL_TYPE  0.813526   0.744591  0.896527
2  I-CELL_TYPE  0.866094   0.806209  0.935590
3     B-TISSUE  0.869608   0.843269  0.897646
4     I-TISSUE  0.725926   0.600000  0.918750
kamalkraj/bioelectra-base-discriminator-pubmed


Map:   0%|          | 0/20596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3499 [00:00<?, ? examples/s]

Map:   0%|          | 0/5753 [00:00<?, ? examples/s]

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Cell Type,Tissue,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0145,0.010314,"{'precision': 0.6827956989247311, 'recall': 0.7448680351906158, 'f1': 0.7124824684431976, 'number': 341}","{'precision': 0.7565922920892495, 'recall': 0.8966346153846154, 'f1': 0.8206820682068207, 'number': 416}",0.724855,0.828269,0.77312,0.996001
2,0.0091,0.01009,"{'precision': 0.6585956416464891, 'recall': 0.7976539589442815, 'f1': 0.7214854111405835, 'number': 341}","{'precision': 0.8013245033112583, 'recall': 0.8725961538461539, 'f1': 0.8354430379746836, 'number': 416}",0.733256,0.838838,0.782502,0.99616
3,0.0057,0.010488,"{'precision': 0.7017994858611826, 'recall': 0.8005865102639296, 'f1': 0.7479452054794522, 'number': 341}","{'precision': 0.8314350797266514, 'recall': 0.8774038461538461, 'f1': 0.8538011695906431, 'number': 416}",0.770531,0.842801,0.805047,0.996539


         Level  F1-Score  Precision    Recall
0            O  0.998376   0.999057  0.997696
1  B-CELL_TYPE  0.775036   0.758523  0.792285
2  I-CELL_TYPE  0.825986   0.816514  0.835681
3     B-TISSUE  0.855556   0.793814  0.927711
4     I-TISSUE  0.836735   0.759259  0.931818




         Level  F1-Score  Precision    Recall
0            O  0.998438   0.999020  0.997857
1  B-CELL_TYPE  0.774629   0.710396  0.851632
2  I-CELL_TYPE  0.833724   0.831776  0.835681
3     B-TISSUE  0.870070   0.838926  0.903614
4     I-TISSUE  0.847059   0.878049  0.818182




         Level  F1-Score  Precision    Recall
0            O  0.998544   0.998958  0.998129
1  B-CELL_TYPE  0.794944   0.754667  0.839763
2  I-CELL_TYPE  0.846512   0.838710  0.854460
3     B-TISSUE  0.888104   0.868664  0.908434
4     I-TISSUE  0.868132   0.840426  0.897727




         Level  F1-Score  Precision    Recall
0            O  0.996909   0.998637  0.995187
1  B-CELL_TYPE  0.818656   0.745249  0.908104
2  I-CELL_TYPE  0.871550   0.848140  0.896288
3     B-TISSUE  0.869354   0.833333  0.908629
4     I-TISSUE  0.776471   0.733333  0.825000
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract


Map:   0%|          | 0/20596 [00:00<?, ? examples/s]

Map:   0%|          | 0/3499 [00:00<?, ? examples/s]

Map:   0%|          | 0/5753 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Cell Type,Tissue,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0144,0.009903,"{'precision': 0.6878306878306878, 'recall': 0.7624633431085044, 'f1': 0.7232267037552156, 'number': 341}","{'precision': 0.8080357142857143, 'recall': 0.8701923076923077, 'f1': 0.837962962962963, 'number': 416}",0.753027,0.821664,0.78585,0.996148
2,0.0083,0.009919,"{'precision': 0.7191601049868767, 'recall': 0.8035190615835777, 'f1': 0.7590027700831025, 'number': 341}","{'precision': 0.7986725663716814, 'recall': 0.8677884615384616, 'f1': 0.8317972350230414, 'number': 416}",0.762305,0.838838,0.798742,0.99649
3,0.0044,0.011077,"{'precision': 0.7342465753424657, 'recall': 0.7859237536656891, 'f1': 0.7592067988668555, 'number': 341}","{'precision': 0.8216704288939052, 'recall': 0.875, 'f1': 0.8474970896391152, 'number': 416}",0.782178,0.834875,0.807668,0.996612


         Level  F1-Score  Precision    Recall
0            O  0.998420   0.998822  0.998018
1  B-CELL_TYPE  0.766764   0.753582  0.780415
2  I-CELL_TYPE  0.825688   0.807175  0.845070
3     B-TISSUE  0.873832   0.848073  0.901205
4     I-TISSUE  0.839378   0.771429  0.920455




         Level  F1-Score  Precision    Recall
0            O  0.998637   0.999045  0.998228
1  B-CELL_TYPE  0.793201   0.758808  0.830861
2  I-CELL_TYPE  0.829493   0.814480  0.845070
3     B-TISSUE  0.875000   0.841871  0.910843
4     I-TISSUE  0.845238   0.887500  0.806818




         Level  F1-Score  Precision    Recall
0            O  0.998662   0.998884  0.998439
1  B-CELL_TYPE  0.790765   0.769663  0.813056
2  I-CELL_TYPE  0.847458   0.875000  0.821596
3     B-TISSUE  0.879250   0.856164  0.903614
4     I-TISSUE  0.852459   0.821053  0.886364




         Level  F1-Score  Precision    Recall
0            O  0.997243   0.998414  0.996075
1  B-CELL_TYPE  0.829135   0.782776  0.881331
2  I-CELL_TYPE  0.873091   0.843337  0.905022
3     B-TISSUE  0.892929   0.888442  0.897462
4     I-TISSUE  0.767624   0.659193  0.918750


In [10]:
test_metrics

{'bert-base-uncased': {'test_loss': 0.02012392319738865,
  'test_CELL_TYPE': {'precision': 0.7203182374541004,
   'recall': 0.8479827089337176,
   'f1': 0.7789543348775645,
   'number': 1388},
  'test_TISSUE': {'precision': 0.775096525096525,
   'recall': 0.8219037871033776,
   'f1': 0.7978142076502731,
   'number': 977},
  'test_overall_precision': 0.7415730337078652,
  'test_overall_recall': 0.8372093023255814,
  'test_overall_f1': 0.7864945382323734,
  'test_overall_accuracy': 0.9924709651581898,
  'test_runtime': 23.2553,
  'test_samples_per_second': 247.384,
  'test_steps_per_second': 30.961},
 'google/electra-base-discriminator': {'test_loss': 0.02117023430764675,
  'test_CELL_TYPE': {'precision': 0.7675033025099075,
   'recall': 0.8371757925072046,
   'f1': 0.8008270158511371,
   'number': 1388},
  'test_TISSUE': {'precision': 0.8149253731343283,
   'recall': 0.8382804503582395,
   'f1': 0.8264379414732594,
   'number': 977},
  'test_overall_precision': 0.7864231838030965,
  'te