# Import packages and load dataset

In [201]:
from datasets import load_dataset
import nltk as nltk
from nltk.tag import pos_tag
from nltk.tag import CRFTagger
import numpy as np
import re, unicodedata
nltk.download('averaged_perceptron_tagger')
dataset = load_dataset(
    "tner/bionlp2004", 
    cache_dir='./data_cache'
)

print('Our dataset is a dictionary that has {} splits: \n\n{}'.format(len(dataset),dataset))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Reusing dataset bio_nlp2004 (./data_cache\tner___bio_nlp2004\bionlp2004\1.0.0\9f41d3f0270b773c2762dee333ae36c29331e2216114a57081f77639fdb5e904)


  0%|          | 0/3 [00:00<?, ?it/s]

Our dataset is a dictionary that has 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 16619
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1927
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 3856
    })
})


# Formatting the dataset splits

In [202]:

train_sentences_ner = [item['tokens'] for item in dataset['train']]
train_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['train']]

val_sentences_ner = [item['tokens'] for item in dataset['validation']]
val_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['validation']]

test_sentences_ner = [item['tokens'] for item in dataset['test']]
test_labels_ner = [[str(tag) for tag in item['tags']] for item in dataset['test']]

In [203]:
print('Number of training sentences = {}'.format(len(train_sentences_ner)))
print('Number of validation sentences = {}'.format(len(val_sentences_ner)))
print('Number of test sentences = {}'.format(len(test_sentences_ner)))

Number of training sentences = 16619
Number of validation sentences = 1927
Number of test sentences = 3856


In [204]:
print('An instance from the training set looks like this: \n\n{}'.format(train_sentences_ner[101]))
print('Corresponding label: \n\n{}'.format(train_labels_ner[101]))

An instance from the training set looks like this: 

['Normal', 'T', 'lymphocytes', 'whose', 'surface', 'expression', 'of', 'CD3', 'was', 'depleted', 'showed', 'impaired', 'UV-induced', 'tyrosine', 'phosphorylation', 'and', 'Ca2+', 'signals', '.']
Corresponding label: 

['0', '5', '6', '0', '0', '0', '0', '3', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']


In [205]:
print('Number of unique labels: {}'.format(np.unique(np.concatenate(train_labels_ner))))

Number of unique labels: ['0' '1' '10' '2' '3' '4' '5' '6' '7' '8' '9']


In [206]:
# mapping from labels to the tags

all_labels = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

mapping = {value:key for key, value in all_labels.items()}
print(mapping)

{0: 'O', 1: 'B-DNA', 2: 'I-DNA', 3: 'B-protein', 4: 'I-protein', 5: 'B-cell_type', 6: 'I-cell_type', 7: 'B-cell_line', 8: 'I-cell_line', 9: 'B-RNA', 10: 'I-RNA'}


In [207]:
train_set = [list(zip(train_sentences_ner[index],[mapping[int(i)]for i in train_labels_ner[index]]))for index, sentence in enumerate(train_sentences_ner)]

val_set = [list(zip(val_sentences_ner[index],[mapping[int(i)]for i in val_labels_ner[index]]))for index, sentence in enumerate(val_sentences_ner)]
val_tokens = [tok for tok in val_sentences_ner]
val_tags = [[mapping[int(i)]for i in item] for item in val_labels_ner]

test_set = [list(zip(test_sentences_ner[index],[mapping[int(i)]for i in test_labels_ner[index]]))for index, sentence in enumerate(test_sentences_ner)]
test_tokens = [tok for tok in test_sentences_ner]

test_tags = [[mapping[int(i)]for i in item] for item in test_labels_ner]
print(val_set[0])

[('IL-2', 'B-DNA'), ('gene', 'I-DNA'), ('expression', 'O'), ('and', 'O'), ('NF-kappa', 'B-protein'), ('B', 'I-protein'), ('activation', 'O'), ('through', 'O'), ('CD28', 'B-protein'), ('requires', 'O'), ('reactive', 'O'), ('oxygen', 'O'), ('production', 'O'), ('by', 'O'), ('5-lipoxygenase', 'B-protein'), ('.', 'O')]


# Create plain CRF model and train it

In [208]:
model = CRFTagger(verbose= True)
model.train(train_set,'model.crf.my_tagger')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 41166
Seconds required: 0.272

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 670701.292937
Feature norm: 5.000000
Error norm: 153439.508663
Active features: 41166
Line search trials: 2
Line search step: 0.000016
Seconds required for this iteration: 0.777

***** Iteration #2 *****
Loss: 450069.016166
Feature norm: 3.656978
Error norm: 128870.363823
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.244

***** Iteration #3 *****
Loss: 371186.909816
Feature norm: 3.071075
Error norm: 51633.075776
Active features: 41166
Line search trials: 2
Line search step: 0.120890
Seconds 

***** Iteration #46 *****
Loss: 78460.630137
Feature norm: 56.704766
Error norm: 3746.629139
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.257

***** Iteration #47 *****
Loss: 78012.679385
Feature norm: 57.474117
Error norm: 2405.390143
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.255

***** Iteration #48 *****
Loss: 77703.264361
Feature norm: 58.123588
Error norm: 2954.226033
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.252

***** Iteration #49 *****
Loss: 77253.919317
Feature norm: 59.055096
Error norm: 2654.172693
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #50 *****
Loss: 76673.825721
Feature norm: 60.702887
Error norm: 3829.610735
Active features: 41166
Line search trials: 1
Line search step: 1.000000

***** Iteration #92 *****
Loss: 65403.235706
Feature norm: 88.752375
Error norm: 1086.827478
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.239

***** Iteration #93 *****
Loss: 65192.893147
Feature norm: 89.695897
Error norm: 877.756609
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.240

***** Iteration #94 *****
Loss: 65135.569269
Feature norm: 89.988836
Error norm: 1777.675853
Active features: 41166
Line search trials: 2
Line search step: 0.203936
Seconds required for this iteration: 0.482

***** Iteration #95 *****
Loss: 65023.211767
Feature norm: 90.607369
Error norm: 1461.181196
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.237

***** Iteration #96 *****
Loss: 64802.971478
Feature norm: 91.473481
Error norm: 788.982954
Active features: 41166
Line search trials: 1
Line search step: 1.000000
S

***** Iteration #133 *****
Loss: 61887.345229
Feature norm: 95.029603
Error norm: 547.603326
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.240

***** Iteration #134 *****
Loss: 61845.239357
Feature norm: 95.135241
Error norm: 524.488432
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.239

***** Iteration #135 *****
Loss: 61769.456042
Feature norm: 95.326148
Error norm: 481.065519
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.251

***** Iteration #136 *****
Loss: 61753.629992
Feature norm: 95.361149
Error norm: 1109.236315
Active features: 41166
Line search trials: 2
Line search step: 0.207128
Seconds required for this iteration: 0.488

***** Iteration #137 *****
Loss: 61717.559832
Feature norm: 95.411800
Error norm: 641.703506
Active features: 41166
Line search trials: 1
Line search step: 1.00000

***** Iteration #176 *****
Loss: 60800.038992
Feature norm: 98.678453
Error norm: 439.117297
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.237

***** Iteration #177 *****
Loss: 60790.540770
Feature norm: 98.640710
Error norm: 284.364438
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.234

***** Iteration #178 *****
Loss: 60773.674785
Feature norm: 98.592766
Error norm: 354.428553
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.242

***** Iteration #179 *****
Loss: 60761.266564
Feature norm: 98.625826
Error norm: 262.811699
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.244

***** Iteration #180 *****
Loss: 60747.916512
Feature norm: 98.783004
Error norm: 858.351229
Active features: 41166
Line search trials: 2
Line search step: 0.412630

***** Iteration #222 *****
Loss: 60459.391325
Feature norm: 102.050309
Error norm: 233.378051
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.238

***** Iteration #223 *****
Loss: 60453.261556
Feature norm: 102.067522
Error norm: 232.646803
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.238

***** Iteration #224 *****
Loss: 60449.043434
Feature norm: 102.118008
Error norm: 406.216460
Active features: 41166
Line search trials: 2
Line search step: 0.336267
Seconds required for this iteration: 0.476

***** Iteration #225 *****
Loss: 60441.311281
Feature norm: 102.164310
Error norm: 227.261232
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.239

***** Iteration #226 *****
Loss: 60436.957960
Feature norm: 102.189405
Error norm: 162.461860
Active features: 41166
Line search trials: 1
Line search step: 1.0

***** Iteration #262 *****
Loss: 60355.970817
Feature norm: 102.879153
Error norm: 101.724966
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.240

***** Iteration #263 *****
Loss: 60355.053187
Feature norm: 102.893330
Error norm: 110.454674
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.247

***** Iteration #264 *****
Loss: 60354.129653
Feature norm: 102.907557
Error norm: 93.404881
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #265 *****
Loss: 60352.972463
Feature norm: 102.944667
Error norm: 126.955501
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.244

***** Iteration #266 *****
Loss: 60351.982730
Feature norm: 102.962940
Error norm: 175.138001
Active features: 41166
Line search trials: 1
Line search step: 1.00

***** Iteration #303 *****
Loss: 60325.123871
Feature norm: 103.456510
Error norm: 63.736478
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.239

***** Iteration #304 *****
Loss: 60324.851892
Feature norm: 103.459429
Error norm: 76.610116
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #305 *****
Loss: 60324.210594
Feature norm: 103.472275
Error norm: 73.906701
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.238

***** Iteration #306 *****
Loss: 60323.920001
Feature norm: 103.469157
Error norm: 101.356914
Active features: 41166
Line search trials: 2
Line search step: 0.402116
Seconds required for this iteration: 0.483

***** Iteration #307 *****
Loss: 60323.575961
Feature norm: 103.472831
Error norm: 63.285687
Active features: 41166
Line search trials: 1
Line search step: 1.00000

***** Iteration #348 *****
Loss: 60314.399704
Feature norm: 103.580419
Error norm: 44.889223
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.245

***** Iteration #349 *****
Loss: 60314.317787
Feature norm: 103.581257
Error norm: 27.541101
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.244

***** Iteration #350 *****
Loss: 60314.250400
Feature norm: 103.583683
Error norm: 35.646477
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.244

***** Iteration #351 *****
Loss: 60314.139505
Feature norm: 103.586771
Error norm: 30.485182
Active features: 41166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #352 *****
Loss: 60314.108435
Feature norm: 103.599733
Error norm: 73.166664
Active features: 41166
Line search trials: 1
Line search step: 1.000000

# Predict on validation set

In [209]:
predicted_tags = model.tag_sents(val_tokens)

# Create functions for fetching the F1 scores

In [210]:
# Create DF to hold F1 scores
import pandas as pd
import numpy as np
from copy import copy
df = pd.DataFrame(columns=['DNA','protein ','cell_type ','cell_line ','RNA ', 'Macro Avg.'])
def append_to_scores_table(scores, model_name):
    # Append a placeholder for MacroAvg.
    scores.append(0)
    df.loc[model_name]= scores
    return
def append_macro_avg(score, model_name):
    df.loc[model_name]['Macro Avg.']= score
    return

In [211]:
def span_fill(index, token, label, spans,start,id_):
    # Check if beginning
    if 'B-' in label:
        start = index
        ending = index + 1
        named_entity_type = label[2:]
    # check if inside
    elif 'I-' in label:
        ending = index + 1
    # check if not entity type
    elif start >= 0 and label == 'O':
        if named_entity_type not in spans:
            spans[named_entity_type] = []
        spans[named_entity_type].append((start, ending, id_))
        start = -1   
    if start >= 0:    
        if named_entity_type not in spans:
            spans[named_entity_type] = []
        spans[named_entity_type].append((start, ending, id_))

In [212]:
def score_printer(named_entity_types, true_spans, predicted_spans, F1_score_for_each_class, model_name):
    
    # Manually calculating F1, precision, recall. 
    for named_entity_type in named_entity_types:
        # We loop through all the named entity tpes
        # set TP, FN, and FP to zero.
        true_positive = 0
        false_positive = 0
        false_negative = 0
        
        for span in true_spans[named_entity_type]:
            # check if current true span not in the predicted spans
            if span not in predicted_spans[named_entity_type]:
                # If so...increment false negative value.
                false_negative = false_negative + 1
        
        for span in predicted_spans[named_entity_type]:
            # check if current predicted span in the true spans
            if span in true_spans[named_entity_type]:
                # If so, increment true positive val
                true_positive = true_positive + 1
            else:
                # otherwise increment false negative val
                false_positive = false_positive + 1       
        
            
        if true_positive + false_negative== 0:
            # set recall
            recall = 0
        else:
            # calculate recall using TP and FN
            recall = true_positive / float(true_positive + false_negative)

            
        if true_positive + false_positive == 0:
            # Set precision
            precision = 0
        else:
            # calculate precision using FP and TP
            precision = true_positive / float(false_positive + true_positive)
            

        if recall + precision == 0:
            # Set F1 score
            F1 = 0
        else:
            # Calculate F1 using precision and recall
            F1 = 2 * precision * recall / (precision + recall)
            

        F1_score_for_each_class.append(F1)
        print('F1 score for Class: {} = {}'.format(named_entity_type, F1))
        
    macro_avrg = copy(np.mean(F1_score_for_each_class))
    print('Macro averaged F1 score for all classes: {}'.format(np.mean(F1_score_for_each_class)))
    append_to_scores_table(F1_score_for_each_class, model_name)
    append_macro_avg(macro_avrg, model_name)

In [213]:

def get_spans(tagged_sentences):
    # Create a dict to hold spans
    spans_dict = {}   
    for id_, sentence in enumerate(tagged_sentences):
        start = -1
        entity_type = None
        for index, (token, label) in enumerate(sentence):
            span_fill(index, token, label, spans_dict, start, id_)  
    return spans_dict

def get_f1_scores(test_sents, test_sents_with_pred, model_name):
    true_spans = get_spans(test_sents)
    predicted_spans = get_spans(test_sents_with_pred)
    # A list to hold F1 scores
    F1_score_for_each_class = []
    # Set named entity types
    named_entity_types = true_spans.keys()
    
    score_printer(named_entity_types, true_spans, predicted_spans, F1_score_for_each_class, model_name)
    

# Get F1 scores for validation set on plain model

In [214]:
get_f1_scores(val_set, predicted_tags, 'Plain model')

F1 score for Class: DNA = 0.6493860845839017
F1 score for Class: protein = 0.7885906040268457
F1 score for Class: cell_type = 0.6825657894736842
F1 score for Class: cell_line = 0.6267605633802816
F1 score for Class: RNA = 0.7008547008547009
Macro averaged F1 score for all classes: 0.6896315484638829


In [215]:
df

Unnamed: 0,DNA,protein,cell_type,cell_line,RNA,Macro Avg.
Plain model,0.649386,0.788591,0.682566,0.626761,0.700855,0.689632


# Create new version of the model that uses previous and next words as additional features

In [216]:
class Current_next_previous_word_CRFTagger(CRFTagger):
    def _get_features(self, toks, i):
            tok = toks[i]
            # Get features from original method
            features = super()._get_features(toks,i)
            # Append the current word
            features.append("CURRENT_WORD" + tok)
            if i < len(toks)-1:
                # Append the next word
                features.append("NEXT_WORD_" + toks[i+1])
                # Append the previous word
            if i > 0:
                features.append("PREVIOUS_WORD_" + toks[i-1])
            return features

# Train the new model

In [217]:
multi_word_model = Current_next_previous_word_CRFTagger(verbose=True)
multi_word_model.train(train_set, 'model.crf.next_previous_word_CRFTagger')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 127101
Seconds required: 0.565

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 666989.591094
Feature norm: 5.000000
Error norm: 155238.090407
Active features: 127101
Line search trials: 2
Line search step: 0.000016
Seconds required for this iteration: 0.880

***** Iteration #2 *****
Loss: 445965.511750
Feature norm: 3.695703
Error norm: 131668.452309
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.307

***** Iteration #3 *****
Loss: 361830.154566
Feature norm: 3.111198
Error norm: 59014.424742
Active features: 127101
Line search trials: 2
Line search step: 0.121885
Seco

***** Iteration #42 *****
Loss: 63751.525001
Feature norm: 52.462114
Error norm: 3094.736840
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.318

***** Iteration #43 *****
Loss: 62740.297617
Feature norm: 52.414545
Error norm: 3956.276671
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.296

***** Iteration #44 *****
Loss: 61834.084526
Feature norm: 53.647909
Error norm: 2455.387590
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.291

***** Iteration #45 *****
Loss: 61319.144302
Feature norm: 54.464859
Error norm: 2925.153843
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.325

***** Iteration #46 *****
Loss: 61116.589214
Feature norm: 54.841059
Error norm: 4299.802011
Active features: 127101
Line search trials: 1
Line search step: 1.0

***** Iteration #82 *****
Loss: 48577.208498
Feature norm: 79.450437
Error norm: 1869.078942
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #83 *****
Loss: 48328.717146
Feature norm: 80.269488
Error norm: 1012.363030
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.303

***** Iteration #84 *****
Loss: 48211.720155
Feature norm: 80.448797
Error norm: 862.440792
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #85 *****
Loss: 47973.868164
Feature norm: 80.967729
Error norm: 948.554381
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.286

***** Iteration #86 *****
Loss: 47854.256319
Feature norm: 81.380709
Error norm: 1478.740602
Active features: 127101
Line search trials: 2
Line search step: 0.475

***** Iteration #123 *****
Loss: 44476.021025
Feature norm: 96.070226
Error norm: 593.807833
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.316

***** Iteration #124 *****
Loss: 44426.149999
Feature norm: 96.540905
Error norm: 572.883580
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.333

***** Iteration #125 *****
Loss: 44416.823354
Feature norm: 97.399830
Error norm: 1769.039134
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.349

***** Iteration #126 *****
Loss: 44329.561881
Feature norm: 97.530670
Error norm: 526.187259
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.301

***** Iteration #127 *****
Loss: 44303.316286
Feature norm: 97.564329
Error norm: 386.365697
Active features: 127101
Line search trials: 1
Line search step: 1.

***** Iteration #162 *****
Loss: 43450.848985
Feature norm: 103.650184
Error norm: 619.962296
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.303

***** Iteration #163 *****
Loss: 43437.272412
Feature norm: 103.639040
Error norm: 508.061434
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.308

***** Iteration #164 *****
Loss: 43417.675906
Feature norm: 103.677295
Error norm: 308.588769
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.287

***** Iteration #165 *****
Loss: 43407.253293
Feature norm: 103.701966
Error norm: 316.506016
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.288

***** Iteration #166 *****
Loss: 43397.644774
Feature norm: 103.790314
Error norm: 964.329463
Active features: 127101
Line search trials: 1
Line search step

***** Iteration #204 *****
Loss: 43088.657958
Feature norm: 102.854603
Error norm: 396.127647
Active features: 127101
Line search trials: 2
Line search step: 0.223829
Seconds required for this iteration: 0.664

***** Iteration #205 *****
Loss: 43084.483848
Feature norm: 102.820240
Error norm: 216.053040
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.310

***** Iteration #206 *****
Loss: 43081.345573
Feature norm: 102.797648
Error norm: 141.804733
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.320

***** Iteration #207 *****
Loss: 43077.068530
Feature norm: 102.763277
Error norm: 168.337060
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.333

***** Iteration #208 *****
Loss: 43073.366386
Feature norm: 102.726464
Error norm: 192.991275
Active features: 127101
Line search trials: 1
Line search step

***** Iteration #244 *****
Loss: 42997.371643
Feature norm: 102.214213
Error norm: 71.221154
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.301

***** Iteration #245 *****
Loss: 42996.423006
Feature norm: 102.221303
Error norm: 154.311368
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.306

***** Iteration #246 *****
Loss: 42996.004348
Feature norm: 102.226862
Error norm: 217.894356
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.297

***** Iteration #247 *****
Loss: 42995.349589
Feature norm: 102.225470
Error norm: 88.763764
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.313

***** Iteration #248 *****
Loss: 42994.928914
Feature norm: 102.225232
Error norm: 74.016274
Active features: 127101
Line search trials: 1
Line search step: 1

***** Iteration #285 *****
Loss: 42974.844239
Feature norm: 102.608697
Error norm: 55.305033
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.306

***** Iteration #286 *****
Loss: 42974.628476
Feature norm: 102.615083
Error norm: 62.484925
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.318

***** Iteration #287 *****
Loss: 42974.415229
Feature norm: 102.622564
Error norm: 151.329165
Active features: 127101
Line search trials: 2
Line search step: 0.467578
Seconds required for this iteration: 0.614

***** Iteration #288 *****
Loss: 42973.991939
Feature norm: 102.638787
Error norm: 91.965616
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.305

***** Iteration #289 *****
Loss: 42973.634836
Feature norm: 102.654112
Error norm: 46.627858
Active features: 127101
Line search trials: 1
Line search step: 1.

***** Iteration #326 *****
Loss: 42965.597098
Feature norm: 103.115784
Error norm: 69.056007
Active features: 127101
Line search trials: 2
Line search step: 0.469940
Seconds required for this iteration: 0.596

***** Iteration #327 *****
Loss: 42965.434741
Feature norm: 103.134514
Error norm: 34.670324
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.292

***** Iteration #328 *****
Loss: 42965.362206
Feature norm: 103.140251
Error norm: 27.534877
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.298

***** Iteration #329 *****
Loss: 42965.269294
Feature norm: 103.147487
Error norm: 65.630176
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.307

***** Iteration #330 *****
Loss: 42965.144693
Feature norm: 103.160260
Error norm: 32.908213
Active features: 127101
Line search trials: 1
Line search step: 1.0

***** Iteration #367 *****
Loss: 42962.521362
Feature norm: 103.551872
Error norm: 20.493797
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.298

***** Iteration #368 *****
Loss: 42962.503315
Feature norm: 103.548786
Error norm: 15.524593
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.293

***** Iteration #369 *****
Loss: 42962.467902
Feature norm: 103.547939
Error norm: 19.830327
Active features: 127101
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.298

***** Iteration #370 *****
Loss: 42962.449673
Feature norm: 103.552066
Error norm: 22.093607
Active features: 127101
Line search trials: 2
Line search step: 0.285318
Seconds required for this iteration: 0.586

L-BFGS terminated with the stopping criteria
Total seconds required for training: 123.211

Storing the model
Number of active features: 127101 (127101)
Number o

# Get F1 score for validation set predictions on the new model

In [218]:
predicted_tags = multi_word_model.tag_sents(val_tokens)
get_f1_scores(val_set, predicted_tags,'Prev-Next-WRD-Model')
df

F1 score for Class: DNA = 0.692885550154662
F1 score for Class: protein = 0.8195902048975513
F1 score for Class: cell_type = 0.7516233766233765
F1 score for Class: cell_line = 0.7122381477398015
F1 score for Class: RNA = 0.6942148760330579
Macro averaged F1 score for all classes: 0.7341104310896898


Unnamed: 0,DNA,protein,cell_type,cell_line,RNA,Macro Avg.
Plain model,0.649386,0.788591,0.682566,0.626761,0.700855,0.689632
Prev-Next-WRD-Model,0.692886,0.81959,0.751623,0.712238,0.694215,0.73411


# Create a third model that also uses parts of speech tags in addition to all the other features

In [219]:
class POSBasedTagger(Current_next_previous_word_CRFTagger):
    _tokens = None
    def _get_features(self, toks, i):
        # Adding POS tags as a feature on top of the current features
        features = super()._get_features(toks,i)
        # Set Pos tagged toks
        if toks != self._tokens:
            self._pos_tagged_toks = pos_tag(toks)
            self._tokens = toks
        features.append(self._pos_tagged_toks[i][1])
        return features

# Instantiate the model and train it

In [220]:
POSmodel = POSBasedTagger(verbose=True)
POSmodel.train(train_set, 'model.crf.POS_Based_Tagger')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 127364
Seconds required: 0.584

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 671004.167120
Feature norm: 5.000000
Error norm: 163790.150264
Active features: 127364
Line search trials: 2
Line search step: 0.000015
Seconds required for this iteration: 1.133

***** Iteration #2 *****
Loss: 440628.381558
Feature norm: 3.782112
Error norm: 141846.807427
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.380

***** Iteration #3 *****
Loss: 342479.559332
Feature norm: 3.219105
Error norm: 77815.878288
Active features: 127364
Line search trials: 2
Line search step: 0.123463
Seco

***** Iteration #42 *****
Loss: 61181.051018
Feature norm: 50.185046
Error norm: 3548.165591
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.379

***** Iteration #43 *****
Loss: 60623.362022
Feature norm: 51.238362
Error norm: 3460.970136
Active features: 127364
Line search trials: 2
Line search step: 0.422804
Seconds required for this iteration: 0.796

***** Iteration #44 *****
Loss: 60107.218050
Feature norm: 51.969337
Error norm: 2166.926008
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.396

***** Iteration #45 *****
Loss: 59592.053921
Feature norm: 52.970077
Error norm: 2419.057967
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.390

***** Iteration #46 *****
Loss: 59549.687104
Feature norm: 53.801044
Error norm: 9987.253825
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #85 *****
Loss: 47928.653919
Feature norm: 85.976508
Error norm: 2171.435326
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.376

***** Iteration #86 *****
Loss: 47772.232253
Feature norm: 85.475919
Error norm: 1220.153968
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.387

***** Iteration #87 *****
Loss: 47715.252325
Feature norm: 85.249474
Error norm: 1827.690337
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.392

***** Iteration #88 *****
Loss: 47646.586461
Feature norm: 85.290605
Error norm: 835.448268
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.394

***** Iteration #89 *****
Loss: 47565.493581
Feature norm: 85.559154
Error norm: 930.981015
Active features: 127364
Line search trials: 1
Line search step: 1.000

***** Iteration #128 *****
Loss: 43800.261828
Feature norm: 95.802885
Error norm: 708.627461
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.385

***** Iteration #129 *****
Loss: 43731.365458
Feature norm: 95.840541
Error norm: 634.824578
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.377

***** Iteration #130 *****
Loss: 43698.744421
Feature norm: 95.866357
Error norm: 1909.905439
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.387

***** Iteration #131 *****
Loss: 43657.447697
Feature norm: 95.883599
Error norm: 839.181876
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.377

***** Iteration #132 *****
Loss: 43640.182412
Feature norm: 95.882606
Error norm: 644.640659
Active features: 127364
Line search trials: 1
Line search step: 1.

***** Iteration #168 *****
Loss: 42646.162472
Feature norm: 95.721721
Error norm: 768.230990
Active features: 127364
Line search trials: 2
Line search step: 0.448874
Seconds required for this iteration: 0.781

***** Iteration #169 *****
Loss: 42616.795155
Feature norm: 95.715142
Error norm: 391.901616
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.385

***** Iteration #170 *****
Loss: 42600.238541
Feature norm: 95.711071
Error norm: 307.181577
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.399

***** Iteration #171 *****
Loss: 42594.966952
Feature norm: 95.708931
Error norm: 663.489332
Active features: 127364
Line search trials: 2
Line search step: 0.242494
Seconds required for this iteration: 0.764

***** Iteration #172 *****
Loss: 42584.800672
Feature norm: 95.706003
Error norm: 444.256970
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #211 *****
Loss: 42213.762731
Feature norm: 96.546959
Error norm: 150.240947
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.376

***** Iteration #212 *****
Loss: 42210.142617
Feature norm: 96.559566
Error norm: 148.488808
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.386

***** Iteration #213 *****
Loss: 42206.923343
Feature norm: 96.599794
Error norm: 262.438390
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.392

***** Iteration #214 *****
Loss: 42203.741377
Feature norm: 96.699044
Error norm: 447.661965
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.387

***** Iteration #215 *****
Loss: 42200.148865
Feature norm: 96.719900
Error norm: 200.562949
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #255 *****
Loss: 42085.573549
Feature norm: 99.377044
Error norm: 216.755391
Active features: 127364
Line search trials: 2
Line search step: 0.191377
Seconds required for this iteration: 0.771

***** Iteration #256 *****
Loss: 42084.331196
Feature norm: 99.437014
Error norm: 150.979276
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.366

***** Iteration #257 *****
Loss: 42082.408019
Feature norm: 99.527657
Error norm: 95.705340
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.382

***** Iteration #258 *****
Loss: 42080.450900
Feature norm: 99.639281
Error norm: 93.286887
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.393

***** Iteration #259 *****
Loss: 42079.169663
Feature norm: 99.725939
Error norm: 135.265756
Active features: 127364
Line search trials: 1
Line search step: 1.000

***** Iteration #303 *****
Loss: 42053.759083
Feature norm: 100.904777
Error norm: 66.303033
Active features: 127364
Line search trials: 2
Line search step: 0.387709
Seconds required for this iteration: 0.770

***** Iteration #304 *****
Loss: 42053.681536
Feature norm: 100.911179
Error norm: 51.769993
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.398

***** Iteration #305 *****
Loss: 42053.462321
Feature norm: 100.929926
Error norm: 34.767297
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.372

***** Iteration #306 *****
Loss: 42053.262758
Feature norm: 100.945423
Error norm: 41.093094
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.392

***** Iteration #307 *****
Loss: 42052.986076
Feature norm: 100.969472
Error norm: 49.606313
Active features: 127364
Line search trials: 1
Line search step: 1.0

***** Iteration #344 *****
Loss: 42047.645496
Feature norm: 101.346719
Error norm: 32.053377
Active features: 127364
Line search trials: 2
Line search step: 0.225517
Seconds required for this iteration: 0.768

***** Iteration #345 *****
Loss: 42047.614947
Feature norm: 101.345228
Error norm: 23.848478
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.389

***** Iteration #346 *****
Loss: 42047.542563
Feature norm: 101.344096
Error norm: 19.795423
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.379

***** Iteration #347 *****
Loss: 42047.444943
Feature norm: 101.344669
Error norm: 24.598810
Active features: 127364
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.382

***** Iteration #348 *****
Loss: 42047.324480
Feature norm: 101.346080
Error norm: 26.421913
Active features: 127364
Line search trials: 1
Line search step: 1.0

# Predict on validation set and get F1 scores

In [221]:
predicted_tags = POSmodel.tag_sents(val_tokens)
get_f1_scores(val_set, predicted_tags,'POS_model')
df

F1 score for Class: DNA = 0.6997802197802196
F1 score for Class: protein = 0.8212576332728172
F1 score for Class: cell_type = 0.7497975708502024
F1 score for Class: cell_line = 0.7130242825607065
F1 score for Class: RNA = 0.680672268907563
Macro averaged F1 score for all classes: 0.7329063950743018


Unnamed: 0,DNA,protein,cell_type,cell_line,RNA,Macro Avg.
Plain model,0.649386,0.788591,0.682566,0.626761,0.700855,0.689632
Prev-Next-WRD-Model,0.692886,0.81959,0.751623,0.712238,0.694215,0.73411
POS_model,0.69978,0.821258,0.749798,0.713024,0.680672,0.732906


# Use the best performing model to predict on unseen (test) data for generalisation

In [224]:
# Create new DF for best performing model on Test Set.
df = pd.DataFrame(columns=['DNA','protein ','cell_type ','cell_line ','RNA ', 'Macro Avg.'])
# This model performs the best, so let's choose it to predict on unseen (test data) split to see how well it generalises.
predicted_tags = multi_word_model.tag_sents(test_tokens)
get_f1_scores(test_set, predicted_tags,'Prev-Next-WRD-Model')
df

F1 score for Class: protein = 0.747716357472455
F1 score for Class: cell_type = 0.7242275483684666
F1 score for Class: DNA = 0.6802383316782523
F1 score for Class: cell_line = 0.5816733067729084
F1 score for Class: RNA = 0.6493506493506493
Macro averaged F1 score for all classes: 0.6766412387285463


Unnamed: 0,DNA,protein,cell_type,cell_line,RNA,Macro Avg.
Prev-Next-WRD-Model,0.747716,0.724228,0.680238,0.581673,0.649351,0.676641
