In [None]:
from datasets import load_dataset

import numpy as np

dataset = load_dataset(
    "tner/bionlp2004", 
    cache_dir='./data_cache'
)

print(f'The dataset is a dictionary with {len(dataset)} splits: \n\n{dataset}')

In [None]:
train_sentences_ner = [item['tokens'] for item in dataset['train']]
train_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['train']]

val_sentences_ner = [item['tokens'] for item in dataset['validation']]
val_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['validation']]

test_sentences_ner = [item['tokens'] for item in dataset['test']]
test_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['test']]

In [None]:
print(f'Number of training sentences = {len(train_sentences_ner)}')
print(f'Number of validation sentences = {len(val_sentences_ner)}')
print(f'Number of test sentences = {len(test_sentences_ner)}')

In [None]:
print(f'What does one instance look like from the training set? \n\n{train_sentences_ner[234]}')
print(f'...and here is its corresponding label \n\n{train_labels_ner[234]}')

In [None]:
print(f'Number of unique labels: {np.unique(np.concatenate(train_labels_ner))}')

In [None]:
# mapping from labels to the tags

id2label = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

mapping = {v:k for k, v in id2label.items()}
print(mapping[5])

In [None]:
train_set = [list(zip(train_sentences_ner[index],[mapping[int(i)]for i in train_labels_ner[index]]))for index, sentence in enumerate(train_sentences_ner)]
test_set = [list(zip(test_sentences_ner[index],[mapping[int(i)]for i in test_labels_ner[index]]))for index, sentence in enumerate(test_sentences_ner)]
test_tokens = [tok for tok in test_sentences_ner]
test_tags = [[mapping[int(i)]for i in item] for item in test_labels_ner]
print(test_set[0])

In [None]:
from nltk.tag import CRFTagger

tagger = CRFTagger(verbose= True)
tagger.train(train_set,'model.crf.my_tagger')

In [None]:
predicted_tags = tagger.tag_sents(test_tokens)
print(predicted_tags)

In [39]:
def extract_spans(tagged_sents):
    """
    Extract a list of tagged spans for each named entity type, 
    where each span is represented by a tuple containing the 
    start token and end token indexes.
    
    returns: a dictionary containing a list of spans for each entity type.
    """
    spans = {}
        
    for sidx, sent in enumerate(tagged_sents):
        start = -1
        entity_type = None
        for i, (tok, lab) in enumerate(sent):
            if 'B-' in lab:
                start = i
                end = i + 1
                entity_type = lab[2:]
            elif 'I-' in lab:
                end = i + 1
            elif lab == 'O' and start >= 0:
                
                if entity_type not in spans:
                    spans[entity_type] = []
                
                spans[entity_type].append((start, end, sidx))
                start = -1      
        # Sometimes an I-token is the last token in the sentence, so we still have to add the span to the list
        if start >= 0:    
            if entity_type not in spans:
                spans[entity_type] = []
                
            spans[entity_type].append((start, end, sidx))
                
    return spans


def cal_span_level_f1(test_sents, test_sents_with_pred):
    # get a list of spans from the test set labels
    true_spans = extract_spans(test_sents)

    # get a list of spans predicted by our tagger
    predicted_spans = extract_spans(test_sents_with_pred)
    
    # compute the metrics for each class:
    f1_score_for_each_class = []
    
    named_entity_types = true_spans.keys()  # get the list of named entity types (not the tags)
    
    score_printer(named_entity_types, true_spans, predicted_spans, f1_score_for_each_class)
    
def score_printer(named_entity_types, true_spans, predicted_spans, f1_score_for_each_class):
    for named_entity_type in named_entity_types:
        # compute the confusion matrix
        true_positive = 0
        false_positive = 0
        
        for span in predicted_spans[named_entity_type]:
            if span in true_spans[named_entity_type]:
                true_positive += 1
            else:
                false_positive += 1
                
        false_negative = 0
        for span in true_spans[named_entity_type]:
            if span not in predicted_spans[named_entity_type]:
                false_negative += 1
                
        if false_positive + true_positive  == 0:
            precision = 0
        else:
            precision = true_positive / float(true_positive+ false_positive)
            
        if true_positive + false_negative== 0:
            recall = 0
        else:
            recall = true_positive / float(true_positive + false_negative)
        
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = 2 * precision * recall / (precision + recall)
            
        f1_score_for_each_class.append(f1)
        print(f'F1 score for class {named_entity_type} = {f1}')
        
    print(f'Macro-average f1 score = {np.mean(f1_score_for_each_class)}')

cal_span_level_f1(test_set, predicted_tags)

F1 score for class protein = 0.6906707201216036
F1 score for class cell_type = 0.7072816942268639
F1 score for class DNA = 0.6420369445831253
F1 score for class cell_line = 0.5381526104417671
F1 score for class RNA = 0.6233766233766234
Macro-average f1 score = 0.6403037185499967


In [35]:
import re, unicodedata

class CustomCRFTagger(CRFTagger):
    _current_tokens = None
    
    def _get_features(self, tokens, idx):
            """
            Extract basic features about this word including
                - Current word
                - is it capitalized?
                - Does it have punctuation?
                - Does it have a number?
                - Suffixes up to length 3

            Note that : we might include feature over previous word, next word etc.

            :return: a list which contains the features
            :rtype: list(str)
            """
            token = tokens[idx]

            feature_list = []

            if not token:
                return feature_list

            # Capitalization
            if token[0].isupper():
                feature_list.append("CAPITALIZATION")

            # Number
            if re.search(self._pattern, token) is not None:
                feature_list.append("HAS_NUM")

            # Punctuation
            punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
            if all(unicodedata.category(x) in punc_cat for x in token):
                feature_list.append("PUNCTUATION")

            # Suffix up to length 3
            if len(token) > 1:
                feature_list.append("SUF_" + token[-1:])
            if len(token) > 2:
                feature_list.append("SUF_" + token[-2:])
            if len(token) > 3:
                feature_list.append("SUF_" + token[-3:])

                
            # Current word
            feature_list.append("WORD_" + token)
            
            ### WRITE YOUR OWN CODE HERE ###
            if idx > 0:
                feature_list.append("PREVWORD_" + tokens[idx-1])
            if idx < len(tokens)-1:
                feature_list.append("NEXTWORD_" + tokens[idx+1])
                
            ####

            return feature_list
                

In [None]:
# Train a CRF NER tagger
def train_CustomCRF_NER_tagger(train_set):
    ### WRITE YOUR OWN CODE HERE
    tagger = CustomCRFTagger()
    tagger.train(train_set, 'model.crf.cust_tagger')
    return tagger  # return the trained model

tagger = train_CustomCRF_NER_tagger(train_set)

In [None]:
predicted_tags = tagger.tag_sents(test_tokens)
cal_span_level_f1(test_set, predicted_tags)

In [None]:
# *** Improve the CRF NER tagger using parts of speech (see lab 5) as additional features.
from nltk.tag import pos_tag
import nltk as nltk
nltk.download('averaged_perceptron_tagger')
class CRFTaggerWithPOS(CustomCRFTagger):
    _current_tokens = None
    
    def _get_features(self, tokens, index):
        """
        Extract the features for a token and append the POS tag as an additional feature.
        """
        basic_features = super()._get_features(tokens, index)
        
        # Get the pos tags for the current sentence and save it
        if tokens != self._current_tokens:
            self._pos_tagged_tokens = pos_tag(tokens)
            self._current_tokens = tokens
            
            
        ### WRITE YOUR OWN CODE HERE
        basic_features.append(self._pos_tagged_tokens[index][1])
        ###
        
        return basic_features

In [None]:
# Train a CRF NER tagger
def train_CRF_NER_tagger_with_POS(train_set):
    ### WRITE YOUR OWN CODE HERE
    tagger = CRFTaggerWithPOS()
    tagger.train(train_set, 'model.crf.tagger')
    return tagger  # return the trained model

tagger = train_CRF_NER_tagger_with_POS(train_set)

In [None]:
predicted_tags = tagger.tag_sents(test_tokens)
cal_span_level_f1(test_set, predicted_tags)