In [1]:
from datasets import load_dataset

import numpy as np

dataset = load_dataset(
    "tner/bionlp2004", 
    cache_dir='./data_cache'
)

print(f'The dataset is a dictionary with {len(dataset)} splits: \n\n{dataset}')

Reusing dataset bio_nlp2004 (./data_cache\tner___bio_nlp2004\bionlp2004\1.0.0\9f41d3f0270b773c2762dee333ae36c29331e2216114a57081f77639fdb5e904)


  0%|          | 0/3 [00:00<?, ?it/s]

The dataset is a dictionary with 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 16619
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1927
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 3856
    })
})


In [2]:
train_sentences_ner = [item['tokens'] for item in dataset['train']]
train_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['train']]

val_sentences_ner = [item['tokens'] for item in dataset['validation']]
val_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['validation']]

test_sentences_ner = [item['tokens'] for item in dataset['test']]
test_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['test']]

In [3]:
print(f'Number of training sentences = {len(train_sentences_ner)}')
print(f'Number of validation sentences = {len(val_sentences_ner)}')
print(f'Number of test sentences = {len(test_sentences_ner)}')

Number of training sentences = 16619
Number of validation sentences = 1927
Number of test sentences = 3856


In [4]:
print(f'What does one instance look like from the training set? \n\n{train_sentences_ner[234]}')
print(f'...and here is its corresponding label \n\n{train_labels_ner[234]}')

What does one instance look like from the training set? 

['Hence', ',', 'PPAR', 'can', 'positively', 'or', 'negatively', 'influence', 'TH', 'action', 'depending', 'on', 'TRE', 'structure', 'and', 'THR', 'isotype', '.']
...and here is its corresponding label 

['0', '0', '3', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '3', '4', '0']


In [5]:
print(f'Number of unique labels: {np.unique(np.concatenate(train_labels_ner))}')

Number of unique labels: ['0' '1' '10' '2' '3' '4' '5' '6' '7' '8' '9']


In [30]:
# mapping from labels to the tags

id2label = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

mapping = {v:k for k, v in id2label.items()}
print(mapping[5])

B-cell_type


In [38]:
train_set = [list(zip(,[mapping[int(i)]for i in train_labels_ner[index]]))for index, sentence in enumerate(train_sentences_ner)]
test_set = [list(zip(test_labels_ner[index],test_sentences_ner[index]))for index, sentence in enumerate(test_sentences_ner)]
test_tokens = [tok for tok in test_sentences_ner]
test_tags = [[mapping[int(i)]for i in item] for item in test_labels_ner]
print(train_set[0])

[('O', 'Since'), ('B-cell_line', 'HUVECs'), ('O', 'released'), ('O', 'superoxide'), ('O', 'anions'), ('O', 'in'), ('O', 'response'), ('O', 'to'), ('O', 'TNF'), ('O', ','), ('O', 'and'), ('O', 'H2O2'), ('O', 'induces'), ('B-protein', 'VCAM-1'), ('O', ','), ('O', 'PDTC'), ('O', 'may'), ('O', 'act'), ('O', 'as'), ('O', 'a'), ('O', 'radical'), ('O', 'scavenger'), ('O', '.')]


In [8]:
from nltk.tag import CRFTagger

tagger = CRFTagger(verbose= True)
#tagger.train(train_set,'model.crf.my_tagger')
train_set

[[('0', 'Since'),
  ('7', 'HUVECs'),
  ('0', 'released'),
  ('0', 'superoxide'),
  ('0', 'anions'),
  ('0', 'in'),
  ('0', 'response'),
  ('0', 'to'),
  ('0', 'TNF'),
  ('0', ','),
  ('0', 'and'),
  ('0', 'H2O2'),
  ('0', 'induces'),
  ('3', 'VCAM-1'),
  ('0', ','),
  ('0', 'PDTC'),
  ('0', 'may'),
  ('0', 'act'),
  ('0', 'as'),
  ('0', 'a'),
  ('0', 'radical'),
  ('0', 'scavenger'),
  ('0', '.')],
 [('0', 'Although'),
  ('3', 'ICAM-1'),
  ('0', 'induction'),
  ('0', 'was'),
  ('0', 'unaffected'),
  ('0', ','),
  ('0', 'inhibitors'),
  ('0', 'of'),
  ('3', 'NADPH'),
  ('4', 'oxidase'),
  ('0', '('),
  ('0', 'apocynin'),
  ('0', ')'),
  ('0', 'or'),
  ('3', 'cytochrome'),
  ('4', 'P-450'),
  ('0', '('),
  ('0', 'SKF525a'),
  ('0', ')'),
  ('0', 'suppressed'),
  ('3', 'VCAM-1'),
  ('0', 'induction'),
  ('0', 'by'),
  ('0', 'TNF'),
  ('0', ','),
  ('0', 'revealing'),
  ('0', 'that'),
  ('0', 'several'),
  ('0', 'radical-generating'),
  ('0', 'systems'),
  ('0', 'are'),
  ('0', 'involved')

In [None]:
predicted_tags = tagger.tag_sents(test_tokens)
print(predicted_tags)

In [None]:
# Evaluate tagger model
def extract_spans(tagged_sents):
    """
    Extract a list of tagged spans for each named entity type, 
    where each span is represented by a tuple containing the 
    start token and end token indexes.
    
    returns: a dictionary containing a list of spans for each entity type.
    """
    spans = {}
        
    for sidx, sent in enumerate(tagged_sents):
        start = -1
        entity_type = None
        for i, (tok, lab) in enumerate(sent):
            if 'B-' in lab:
                start = i
                end = i + 1
                entity_type = lab[2:]
            elif 'I-' in lab:
                end = i + 1
            elif lab == 'O' and start >= 0:
                
                if entity_type not in spans:
                    spans[entity_type] = []
                
                spans[entity_type].append((start, end, sidx))
                start = -1      
        # Sometimes an I-token is the last token in the sentence, so we still have to add the span to the list
        if start >= 0:    
            if entity_type not in spans:
                spans[entity_type] = []
                
            spans[entity_type].append((start, end, sidx))
                
    return spans


def cal_span_level_f1(test_sents, test_sents_with_pred):
    # get a list of spans from the test set labels
    gold_spans = extract_spans(test_sents)

    # get a list of spans predicted by our tagger
    pred_spans = extract_spans(test_sents_with_pred)
    
    # compute the metrics for each class:
    f1_per_class = []
    
    ne_types = gold_spans.keys()  # get the list of named entity types (not the tags)
    
    for ne_type in ne_types:
        # compute the confusion matrix
        true_pos = 0
        false_pos = 0
        
        for span in pred_spans[ne_type]:
            if span in gold_spans[ne_type]:
                true_pos += 1
            else:
                false_pos += 1
                
        false_neg = 0
        for span in gold_spans[ne_type]:
            if span not in pred_spans[ne_type]:
                false_neg += 1
                
        if true_pos + false_pos == 0:
            precision = 0
        else:
            precision = true_pos / float(true_pos + false_pos)
            
        if true_pos + false_neg == 0:
            recall = 0
        else:
            recall = true_pos / float(true_pos + false_neg)
        
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = 2 * precision * recall / (precision + recall)
            
        f1_per_class.append(f1)
        print(f'F1 score for class {ne_type} = {f1}')
        
    print(f'Macro-average f1 score = {np.mean(f1_per_class)}')

cal_span_level_f1(test_set, predicted_tags)