In [1]:
from datasets import load_dataset
import nltk as nltk
from nltk.tag import pos_tag
from nltk.tag import CRFTagger
import numpy as np
import re, unicodedata


nltk.download('averaged_perceptron_tagger')
dataset = load_dataset(
    "tner/bionlp2004", 
    cache_dir='./data_cache'
)

print(f'The dataset is a dictionary with {len(dataset)} splits: \n\n{dataset}')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Reusing dataset bio_nlp2004 (./data_cache\tner___bio_nlp2004\bionlp2004\1.0.0\9f41d3f0270b773c2762dee333ae36c29331e2216114a57081f77639fdb5e904)


  0%|          | 0/3 [00:00<?, ?it/s]

The dataset is a dictionary with 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 16619
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1927
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 3856
    })
})


In [3]:
# Formatting the dataset splits for the tagger.

train_sentences_ner = [item['tokens'] for item in dataset['train']]
train_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['train']]

val_sentences_ner = [item['tokens'] for item in dataset['validation']]
val_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['validation']]

test_sentences_ner = [item['tokens'] for item in dataset['test']]
test_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['test']]

In [4]:
print(f'Number of training sentences = {len(train_sentences_ner)}')
print(f'Number of validation sentences = {len(val_sentences_ner)}')
print(f'Number of test sentences = {len(test_sentences_ner)}')

Number of training sentences = 16619
Number of validation sentences = 1927
Number of test sentences = 3856


In [5]:
print(f'What does one instance look like from the training set? \n\n{train_sentences_ner[234]}')
print(f'...and here is its corresponding label \n\n{train_labels_ner[234]}')

What does one instance look like from the training set? 

['Hence', ',', 'PPAR', 'can', 'positively', 'or', 'negatively', 'influence', 'TH', 'action', 'depending', 'on', 'TRE', 'structure', 'and', 'THR', 'isotype', '.']
...and here is its corresponding label 

['0', '0', '3', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '3', '4', '0']


In [6]:
print(f'Number of unique labels: {np.unique(np.concatenate(train_labels_ner))}')

Number of unique labels: ['0' '1' '10' '2' '3' '4' '5' '6' '7' '8' '9']


In [7]:
# mapping from labels to the tags

all_labels = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

mapping = {v:k for k, v in all_labels.items()}
print(mapping)

{0: 'O', 1: 'B-DNA', 2: 'I-DNA', 3: 'B-protein', 4: 'I-protein', 5: 'B-cell_type', 6: 'I-cell_type', 7: 'B-cell_line', 8: 'I-cell_line', 9: 'B-RNA', 10: 'I-RNA'}


In [8]:
train_set = [list(zip(train_sentences_ner[index],[mapping[int(i)]for i in train_labels_ner[index]]))for index, sentence in enumerate(train_sentences_ner)]

val_set = [list(zip(val_sentences_ner[index],[mapping[int(i)]for i in val_labels_ner[index]]))for index, sentence in enumerate(val_sentences_ner)]
val_tokens = [tok for tok in val_sentences_ner]
val_tags = [[mapping[int(i)]for i in item] for item in val_labels_ner]

test_set = [list(zip(test_sentences_ner[index],[mapping[int(i)]for i in test_labels_ner[index]]))for index, sentence in enumerate(test_sentences_ner)]
test_tokens = [tok for tok in test_sentences_ner]

test_tags = [[mapping[int(i)]for i in item] for item in test_labels_ner]
print(val_set[0])

[('IL-2', 'B-DNA'), ('gene', 'I-DNA'), ('expression', 'O'), ('and', 'O'), ('NF-kappa', 'B-protein'), ('B', 'I-protein'), ('activation', 'O'), ('through', 'O'), ('CD28', 'B-protein'), ('requires', 'O'), ('reactive', 'O'), ('oxygen', 'O'), ('production', 'O'), ('by', 'O'), ('5-lipoxygenase', 'B-protein'), ('.', 'O')]


In [9]:
model = CRFTagger(verbose= True)
model.train(train_set,'model.crf.my_tagger')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 41166
Seconds required: 0.231

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 670701.292937
Feature norm: 5.000000
Error norm: 153439.508663
Active features: 41166
Line search trials: 2
Line search step: 0.000016
Seconds required for this iteration: 0.742

***** Iteration #2 *****
Loss: 450069.016166
Feature norm: 3.656978
Error norm: 128870.363823
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.251

***** Iteration #3 *****
Loss: 371186.909816
Feature norm: 3.071075
Error norm: 51633.075776
Active features: 41166
Line search trials: 2
Line search step: 0.120890
Seconds 

***** Iteration #44 *****
Loss: 79272.580763
Feature norm: 55.254515
Error norm: 2355.376604
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #45 *****
Loss: 78921.787259
Feature norm: 56.154482
Error norm: 6459.665318
Active features: 41166
Line search trials: 2
Line search step: 0.421550
Seconds required for this iteration: 0.498

***** Iteration #46 *****
Loss: 78460.630137
Feature norm: 56.704766
Error norm: 3746.629139
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.255

***** Iteration #47 *****
Loss: 78012.679385
Feature norm: 57.474117
Error norm: 2405.390143
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #48 *****
Loss: 77703.264361
Feature norm: 58.123588
Error norm: 2954.226033
Active features: 41166
Line search trials: 1
Line search step: 1.000000

***** Iteration #89 *****
Loss: 65878.439850
Feature norm: 86.952220
Error norm: 1672.292205
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.258

***** Iteration #90 *****
Loss: 65790.132987
Feature norm: 87.469363
Error norm: 3240.480993
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.274

***** Iteration #91 *****
Loss: 65498.824711
Feature norm: 88.505596
Error norm: 1473.856191
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.274

***** Iteration #92 *****
Loss: 65403.235706
Feature norm: 88.752375
Error norm: 1086.827478
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.264

***** Iteration #93 *****
Loss: 65192.893147
Feature norm: 89.695897
Error norm: 877.756609
Active features: 41166
Line search trials: 1
Line search step: 1.000000


***** Iteration #133 *****
Loss: 61887.345229
Feature norm: 95.029603
Error norm: 547.603326
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #134 *****
Loss: 61845.239357
Feature norm: 95.135241
Error norm: 524.488432
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.255

***** Iteration #135 *****
Loss: 61769.456042
Feature norm: 95.326148
Error norm: 481.065519
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.258

***** Iteration #136 *****
Loss: 61753.629992
Feature norm: 95.361149
Error norm: 1109.236315
Active features: 41166
Line search trials: 2
Line search step: 0.207128
Seconds required for this iteration: 0.507

***** Iteration #137 *****
Loss: 61717.559832
Feature norm: 95.411800
Error norm: 641.703506
Active features: 41166
Line search trials: 1
Line search step: 1.00000

***** Iteration #179 *****
Loss: 60761.266564
Feature norm: 98.625826
Error norm: 262.811699
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #180 *****
Loss: 60747.916512
Feature norm: 98.783004
Error norm: 858.351229
Active features: 41166
Line search trials: 2
Line search step: 0.412630
Seconds required for this iteration: 0.502

***** Iteration #181 *****
Loss: 60732.968479
Feature norm: 98.929606
Error norm: 230.201021
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.252

***** Iteration #182 *****
Loss: 60729.291124
Feature norm: 99.002135
Error norm: 238.177338
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.252

***** Iteration #183 *****
Loss: 60721.493531
Feature norm: 99.123863
Error norm: 258.612929
Active features: 41166
Line search trials: 1
Line search step: 1.000000

***** Iteration #224 *****
Loss: 60449.043434
Feature norm: 102.118008
Error norm: 406.216460
Active features: 41166
Line search trials: 2
Line search step: 0.336267
Seconds required for this iteration: 0.512

***** Iteration #225 *****
Loss: 60441.311281
Feature norm: 102.164310
Error norm: 227.261232
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #226 *****
Loss: 60436.957960
Feature norm: 102.189405
Error norm: 162.461860
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #227 *****
Loss: 60431.994141
Feature norm: 102.229023
Error norm: 200.825565
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #228 *****
Loss: 60430.781840
Feature norm: 102.237557
Error norm: 432.012674
Active features: 41166
Line search trials: 2
Line search step: 0.2

***** Iteration #267 *****
Loss: 60351.153783
Feature norm: 102.961499
Error norm: 95.071355
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.259

***** Iteration #268 *****
Loss: 60350.044737
Feature norm: 102.970749
Error norm: 79.054395
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.261

***** Iteration #269 *****
Loss: 60349.177344
Feature norm: 102.991250
Error norm: 161.893382
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.256

***** Iteration #270 *****
Loss: 60348.080772
Feature norm: 103.032833
Error norm: 122.698086
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #271 *****
Loss: 60347.248410
Feature norm: 103.055224
Error norm: 112.393458
Active features: 41166
Line search trials: 1
Line search step: 1.000

***** Iteration #312 *****
Loss: 60322.029045
Feature norm: 103.473678
Error norm: 50.150578
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.251

***** Iteration #313 *****
Loss: 60321.810351
Feature norm: 103.477890
Error norm: 60.050839
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #314 *****
Loss: 60321.483813
Feature norm: 103.485522
Error norm: 50.531721
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.257

***** Iteration #315 *****
Loss: 60321.229654
Feature norm: 103.493157
Error norm: 156.605374
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.256

***** Iteration #316 *****
Loss: 60320.755105
Feature norm: 103.496299
Error norm: 59.234207
Active features: 41166
Line search trials: 1
Line search step: 1.00000

***** Iteration #358 *****
Loss: 60313.491568
Feature norm: 103.607141
Error norm: 23.297918
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #359 *****
Loss: 60313.400680
Feature norm: 103.611081
Error norm: 24.094067
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #360 *****
Loss: 60313.265214
Feature norm: 103.616413
Error norm: 36.882572
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.252

***** Iteration #361 *****
Loss: 60313.143963
Feature norm: 103.620500
Error norm: 44.639552
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.252

***** Iteration #362 *****
Loss: 60313.042984
Feature norm: 103.620456
Error norm: 24.727746
Active features: 41166
Line search trials: 1
Line search step: 1.000000

In [10]:
predicted_tags = model.tag_sents(val_tokens)
print(predicted_tags)

[[('IL-2', 'B-DNA'), ('gene', 'I-DNA'), ('expression', 'O'), ('and', 'O'), ('NF-kappa', 'B-protein'), ('B', 'I-protein'), ('activation', 'O'), ('through', 'O'), ('CD28', 'B-protein'), ('requires', 'O'), ('reactive', 'O'), ('oxygen', 'O'), ('production', 'O'), ('by', 'O'), ('5-lipoxygenase', 'B-protein'), ('.', 'O')], [('Activation', 'O'), ('of', 'O'), ('the', 'O'), ('CD28', 'B-protein'), ('surface', 'I-protein'), ('receptor', 'I-protein'), ('provides', 'O'), ('a', 'O'), ('major', 'O'), ('costimulatory', 'O'), ('signal', 'O'), ('for', 'O'), ('T', 'O'), ('cell', 'O'), ('activation', 'O'), ('resulting', 'O'), ('in', 'O'), ('enhanced', 'O'), ('production', 'O'), ('of', 'O'), ('interleukin-2', 'B-protein'), ('(', 'O'), ('IL-2', 'B-protein'), (')', 'O'), ('and', 'O'), ('cell', 'O'), ('proliferation', 'O'), ('.', 'O')], [('In', 'O'), ('primary', 'B-cell_type'), ('T', 'I-cell_type'), ('lymphocytes', 'I-cell_type'), ('we', 'O'), ('show', 'O'), ('that', 'O'), ('CD28', 'B-protein'), ('ligation', 

In [11]:
def span_fill(index, token, label, spans,start,id_):
    # Check if beginning
    if 'B-' in label:
        start = index
        ending = index + 1
        named_entity_type = label[2:]
    # check if inside
    elif 'I-' in label:
        ending = index + 1
    # check if not entity type
    elif start >= 0 and label == 'O':
        if named_entity_type not in spans:
            spans[named_entity_type] = []
        spans[named_entity_type].append((start, ending, id_))
        start = -1   
    if start >= 0:    
        if named_entity_type not in spans:
            spans[named_entity_type] = []
        spans[named_entity_type].append((start, ending, id_))

In [12]:
def score_printer(named_entity_types, true_spans, predicted_spans, F1_score_for_each_class):
    
    # Manually calculating F1, precision, recall. 
    for named_entity_type in named_entity_types:
        # We loop through all the named entity tpes
        # set TP, FN, and FP to zero.
        true_positive = 0
        false_positive = 0
        false_negative = 0
        
        for span in true_spans[named_entity_type]:
            # check if current true span not in the predicted spans
            if span not in predicted_spans[named_entity_type]:
                # If so...increment false negative value.
                false_negative = false_negative + 1
        
        for span in predicted_spans[named_entity_type]:
            # check if current predicted span in the true spans
            if span in true_spans[named_entity_type]:
                # If so, increment true positive val
                true_positive = true_positive + 1
            else:
                # otherwise increment false negative val
                false_positive = false_positive + 1       
        
            
        if true_positive + false_negative== 0:
            # set recall
            recall = 0
        else:
            # calculate recall using TP and FN
            recall = true_positive / float(true_positive + false_negative)

            
        if true_positive + false_positive == 0:
            # Set precision
            precision = 0
        else:
            # calculate precision using FP and TP
            precision = true_positive / float(false_positive + true_positive)
            

        if recall + precision == 0:
            # Set F1 score
            F1 = 0
        else:
            # Calculate F1 using precision and recall
            F1 = 2 * precision * recall / (precision + recall)
            

        F1_score_for_each_class.append(F1)
        print('F1 score for Class: {} = {}'.format(named_entity_type, F1))
        
    print('Macro averaged F1 score for all classes: {}'.format(np.mean(F1_score_for_each_class)))

In [13]:

def get_spans(tagged_sentences):
    # Create a dict to hold spans
    spans_dict = {}   
    for id_, sentence in enumerate(tagged_sentences):
        start = -1
        entity_type = None
        for index, (token, label) in enumerate(sentence):
            span_fill(index, token, label, spans_dict, start, id_)  
    return spans_dict

def get_f1_scores(test_sents, test_sents_with_pred):
    true_spans = get_spans(test_sents)
    predicted_spans = get_spans(test_sents_with_pred)
    # A list to hold F1 scores
    F1_score_for_each_class = []
    # Set named entity types
    named_entity_types = true_spans.keys()
    
    score_printer(named_entity_types, true_spans, predicted_spans, F1_score_for_each_class)
    

In [14]:
# Get F1 scores for validation set
get_f1_scores(val_set, predicted_tags)

F1 score for Class: DNA = 0.6493860845839017
F1 score for Class: protein = 0.7885906040268457
F1 score for Class: cell_type = 0.6825657894736842
F1 score for Class: cell_line = 0.6267605633802816
F1 score for Class: RNA = 0.7008547008547009
Macro averaged F1 score for all classes: 0.6896315484638829


In [16]:
# Now we add previous and next words as features.
class Current_next_previous_word_CRFTagger(CRFTagger):
    def _get_features(self, toks, i):
            tok = toks[i]
            # Get features from original method
            features = super()._get_features(toks,i)
            # Append the current word
            features.append("CURRENT_WORD" + tok)
            if i < len(toks)-1:
                # Append the next word
                features.append("NEXT_WORD_" + toks[i+1])
                # Append the previous word
            if i > 0:
                features.append("PREVIOUS_WORD_" + toks[i-1])
            return features
                

In [17]:
# Train a new model that uses the new features
model = Current_next_previous_word_CRFTagger(verbose=True)
model.train(train_set, 'model.crf.next_previous_word_CRFTagger')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 127101
Seconds required: 0.532

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 666989.591094
Feature norm: 5.000000
Error norm: 155238.090407
Active features: 127101
Line search trials: 2
Line search step: 0.000016
Seconds required for this iteration: 0.855

***** Iteration #2 *****
Loss: 445965.511750
Feature norm: 3.695703
Error norm: 131668.452309
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #3 *****
Loss: 361830.154566
Feature norm: 3.111198
Error norm: 59014.424742
Active features: 127101
Line search trials: 2
Line search step: 0.121885
Seco

***** Iteration #43 *****
Loss: 62740.297617
Feature norm: 52.414545
Error norm: 3956.276671
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.295

***** Iteration #44 *****
Loss: 61834.084526
Feature norm: 53.647909
Error norm: 2455.387590
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.290

***** Iteration #45 *****
Loss: 61319.144302
Feature norm: 54.464859
Error norm: 2925.153843
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #46 *****
Loss: 61116.589214
Feature norm: 54.841059
Error norm: 4299.802011
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.292

***** Iteration #47 *****
Loss: 60927.258210
Feature norm: 54.751344
Error norm: 2896.832550
Active features: 127101
Line search trials: 1
Line search step: 1.0

***** Iteration #88 *****
Loss: 47626.455869
Feature norm: 82.017832
Error norm: 1263.356826
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.295

***** Iteration #89 *****
Loss: 47551.953431
Feature norm: 82.199366
Error norm: 860.524610
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #90 *****
Loss: 47495.497879
Feature norm: 82.370311
Error norm: 920.591843
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.293

***** Iteration #91 *****
Loss: 47313.185879
Feature norm: 82.982057
Error norm: 1003.424495
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.289

***** Iteration #92 *****
Loss: 47237.484538
Feature norm: 84.244956
Error norm: 2353.153272
Active features: 127101
Line search trials: 1
Line search step: 1.000

***** Iteration #131 *****
Loss: 44196.574206
Feature norm: 98.500616
Error norm: 453.405426
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #132 *****
Loss: 44164.364004
Feature norm: 98.908943
Error norm: 511.304493
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.291

***** Iteration #133 *****
Loss: 44126.591535
Feature norm: 99.317874
Error norm: 461.494334
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.290

***** Iteration #134 *****
Loss: 44073.519521
Feature norm: 100.052800
Error norm: 793.387716
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.290

***** Iteration #135 *****
Loss: 44023.281043
Feature norm: 100.688989
Error norm: 443.819898
Active features: 127101
Line search trials: 1
Line search step: 1

***** Iteration #172 *****
Loss: 43329.302713
Feature norm: 103.727044
Error norm: 347.652872
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.289

***** Iteration #173 *****
Loss: 43318.948588
Feature norm: 103.712593
Error norm: 309.345247
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.297

***** Iteration #174 *****
Loss: 43312.346192
Feature norm: 103.707371
Error norm: 334.942422
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.287

***** Iteration #175 *****
Loss: 43294.288034
Feature norm: 103.694035
Error norm: 370.690260
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.289

***** Iteration #176 *****
Loss: 43290.898273
Feature norm: 103.651678
Error norm: 815.545765
Active features: 127101
Line search trials: 1
Line search step

***** Iteration #216 *****
Loss: 43045.101718
Feature norm: 102.472264
Error norm: 114.537452
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.291

***** Iteration #217 *****
Loss: 43042.366489
Feature norm: 102.447478
Error norm: 198.319735
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.289

***** Iteration #218 *****
Loss: 43038.678277
Feature norm: 102.425090
Error norm: 147.268261
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.293

***** Iteration #219 *****
Loss: 43035.813370
Feature norm: 102.419312
Error norm: 122.968060
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.290

***** Iteration #220 *****
Loss: 43034.357712
Feature norm: 102.411485
Error norm: 356.100031
Active features: 127101
Line search trials: 2
Line search step

***** Iteration #260 *****
Loss: 42985.556510
Feature norm: 102.316682
Error norm: 85.820545
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #261 *****
Loss: 42985.171822
Feature norm: 102.318086
Error norm: 65.348789
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.292

***** Iteration #262 *****
Loss: 42984.533289
Feature norm: 102.327266
Error norm: 83.956601
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.293

***** Iteration #263 *****
Loss: 42984.190111
Feature norm: 102.340276
Error norm: 114.755363
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.287

***** Iteration #264 *****
Loss: 42983.865922
Feature norm: 102.342374
Error norm: 62.599666
Active features: 127101
Line search trials: 1
Line search step: 1.

***** Iteration #303 *****
Loss: 42969.840591
Feature norm: 102.813926
Error norm: 37.817682
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.299

***** Iteration #304 *****
Loss: 42969.718130
Feature norm: 102.820853
Error norm: 47.972833
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #305 *****
Loss: 42969.398960
Feature norm: 102.840042
Error norm: 59.850674
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.287

***** Iteration #306 *****
Loss: 42969.262308
Feature norm: 102.847789
Error norm: 107.820232
Active features: 127101
Line search trials: 2
Line search step: 0.273485
Seconds required for this iteration: 0.573

***** Iteration #307 *****
Loss: 42968.954460
Feature norm: 102.864458
Error norm: 65.597482
Active features: 127101
Line search trials: 1
Line search step: 1.

***** Iteration #348 *****
Loss: 42963.550295
Feature norm: 103.388677
Error norm: 22.212983
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.289

***** Iteration #349 *****
Loss: 42963.510456
Feature norm: 103.396408
Error norm: 34.353116
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #350 *****
Loss: 42963.445457
Feature norm: 103.407819
Error norm: 40.356132
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.289

***** Iteration #351 *****
Loss: 42963.327467
Feature norm: 103.430914
Error norm: 45.989691
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.291

***** Iteration #352 *****
Loss: 42963.276152
Feature norm: 103.437724
Error norm: 52.625862
Active features: 127101
Line search trials: 2
Line search step: 0.3

In [18]:
predicted_tags = model.tag_sents(val_tokens)
get_f1_scores(val_set, predicted_tags)

F1 score for Class: DNA = 0.692885550154662
F1 score for Class: protein = 0.8195902048975513
F1 score for Class: cell_type = 0.7516233766233765
F1 score for Class: cell_line = 0.7122381477398015
F1 score for Class: RNA = 0.6942148760330579
Macro averaged F1 score for all classes: 0.7341104310896898


In [24]:
class POSBasedTagger(Current_next_previous_word_CRFTagger):
    _tokens = None
    def _get_features(self, toks, i):
        # Adding POS tags as a feature on top of the current features
        features = super()._get_features(toks,i)
        # Set Pos tagged toks
        if toks != self._tokens:
            self._pos_tagged_toks = pos_tag(toks)
            self._tokens = toks
        features.append(self._pos_tagged_toks[i][1])
        return features

In [25]:
model = POSBasedTagger(verbose=True)
model.train(train_set, 'model.crf.POS_Based_Tagger')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 127364
Seconds required: 0.621

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 671004.167120
Feature norm: 5.000000
Error norm: 163790.150264
Active features: 127364
Line search trials: 2
Line search step: 0.000015
Seconds required for this iteration: 0.981

***** Iteration #2 *****
Loss: 440628.381558
Feature norm: 3.782112
Error norm: 141846.807427
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.320

***** Iteration #3 *****
Loss: 342479.559332
Feature norm: 3.219105
Error norm: 77815.878288
Active features: 127364
Line search trials: 2
Line search step: 0.123463
Seco

***** Iteration #46 *****
Loss: 59549.687104
Feature norm: 53.801044
Error norm: 9987.253825
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.326

***** Iteration #47 *****
Loss: 59021.177432
Feature norm: 53.715355
Error norm: 4345.511317
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.318

***** Iteration #48 *****
Loss: 58603.016645
Feature norm: 53.946000
Error norm: 2018.207705
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.356

***** Iteration #49 *****
Loss: 58052.277812
Feature norm: 54.651321
Error norm: 3619.680220
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.345

***** Iteration #50 *****
Loss: 57578.448222
Feature norm: 55.578043
Error norm: 3628.556654
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #86 *****
Loss: 47772.232253
Feature norm: 85.475919
Error norm: 1220.153968
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.336

***** Iteration #87 *****
Loss: 47715.252325
Feature norm: 85.249474
Error norm: 1827.690337
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.370

***** Iteration #88 *****
Loss: 47646.586461
Feature norm: 85.290605
Error norm: 835.448268
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.356

***** Iteration #89 *****
Loss: 47565.493581
Feature norm: 85.559154
Error norm: 930.981015
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.347

***** Iteration #90 *****
Loss: 47427.075393
Feature norm: 86.150776
Error norm: 1368.853875
Active features: 127364
Line search trials: 1
Line search step: 1.000

***** Iteration #125 *****
Loss: 44016.456233
Feature norm: 95.158773
Error norm: 711.159449
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.334

***** Iteration #126 *****
Loss: 43968.333216
Feature norm: 95.305743
Error norm: 769.406115
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.346

***** Iteration #127 *****
Loss: 43895.203502
Feature norm: 95.699442
Error norm: 1470.591629
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.351

***** Iteration #128 *****
Loss: 43800.261828
Feature norm: 95.802885
Error norm: 708.627461
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.315

***** Iteration #129 *****
Loss: 43731.365458
Feature norm: 95.840541
Error norm: 634.824578
Active features: 127364
Line search trials: 1
Line search step: 1.

***** Iteration #170 *****
Loss: 42600.238541
Feature norm: 95.711071
Error norm: 307.181577
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.331

***** Iteration #171 *****
Loss: 42594.966952
Feature norm: 95.708931
Error norm: 663.489332
Active features: 127364
Line search trials: 2
Line search step: 0.242494
Seconds required for this iteration: 0.641

***** Iteration #172 *****
Loss: 42584.800672
Feature norm: 95.706003
Error norm: 444.256970
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.329

***** Iteration #173 *****
Loss: 42572.532978
Feature norm: 95.702490
Error norm: 305.304764
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.353

***** Iteration #174 *****
Loss: 42554.656977
Feature norm: 95.697361
Error norm: 326.137651
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #216 *****
Loss: 42197.175449
Feature norm: 96.767181
Error norm: 166.129446
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.328

***** Iteration #217 *****
Loss: 42195.456320
Feature norm: 96.791466
Error norm: 156.940039
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.324

***** Iteration #218 *****
Loss: 42188.562934
Feature norm: 96.911713
Error norm: 217.458090
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.343

***** Iteration #219 *****
Loss: 42184.591104
Feature norm: 96.998525
Error norm: 204.967122
Active features: 127364
Line search trials: 2
Line search step: 0.523870
Seconds required for this iteration: 0.657

***** Iteration #220 *****
Loss: 42181.220471
Feature norm: 97.030602
Error norm: 155.446927
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #264 *****
Loss: 42074.607631
Feature norm: 99.889946
Error norm: 128.615721
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.342

***** Iteration #265 *****
Loss: 42073.151333
Feature norm: 99.962964
Error norm: 73.713836
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.335

***** Iteration #266 *****
Loss: 42072.364508
Feature norm: 100.019993
Error norm: 73.736965
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.343

***** Iteration #267 *****
Loss: 42071.025604
Feature norm: 100.145390
Error norm: 101.029342
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.366

***** Iteration #268 *****
Loss: 42069.871873
Feature norm: 100.243865
Error norm: 127.373076
Active features: 127364
Line search trials: 1
Line search step: 1.

***** Iteration #306 *****
Loss: 42053.262758
Feature norm: 100.945423
Error norm: 41.093094
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.331

***** Iteration #307 *****
Loss: 42052.986076
Feature norm: 100.969472
Error norm: 49.606313
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.330

***** Iteration #308 *****
Loss: 42052.710543
Feature norm: 101.003622
Error norm: 63.941704
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.340

***** Iteration #309 *****
Loss: 42052.419765
Feature norm: 101.020500
Error norm: 48.280124
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.328

***** Iteration #310 *****
Loss: 42052.247818
Feature norm: 101.025136
Error norm: 59.637004
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #347 *****
Loss: 42047.444943
Feature norm: 101.344669
Error norm: 24.598810
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.337

***** Iteration #348 *****
Loss: 42047.324480
Feature norm: 101.346080
Error norm: 26.421913
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.337

***** Iteration #349 *****
Loss: 42047.227732
Feature norm: 101.352656
Error norm: 47.340847
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.329

***** Iteration #350 *****
Loss: 42047.118243
Feature norm: 101.351605
Error norm: 28.993784
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.346

***** Iteration #351 *****
Loss: 42047.064687
Feature norm: 101.350968
Error norm: 24.935599
Active features: 127364
Line search trials: 1
Line search step: 1.0

In [26]:
predicted_tags = model.tag_sents(val_tokens)
get_f1_scores(val_set, predicted_tags)

F1 score for Class: DNA = 0.6997802197802196
F1 score for Class: protein = 0.8212576332728172
F1 score for Class: cell_type = 0.7497975708502024
F1 score for Class: cell_line = 0.7130242825607065
F1 score for Class: RNA = 0.680672268907563
Macro averaged F1 score for all classes: 0.7329063950743018


In [None]:
# Redo POS_Based_Tagger model but using NEXT_PREV WORD FEATURES (just reinstantiate the class :))
