In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [26]:
import codecs
import numpy as np
import nltk
import pycrfsuite
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, auc, precision_score, recall_score, roc_auc_score, roc_curve
import pickle

In [3]:

# data_bio = []


def split_on_empty_lines(s):
    myarray = s.split("\n\n")
    for e in myarray:
        # print(e)
        l = e.split("\n")
#         print(l)
        l = [(e.split("\t")[0], e.split("\t")[1]) for e in l if e not in '']
#         print(l)
        data_bio.append(l)
    #docs = [[((l[0]).split("\n").split("	")[0],(l[0]).split("\n").split("	")[1]) for l in myarray ]]
    return data_bio

# docs = []

def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

# A function for extracting features in documents


def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document


def get_labels(doc):
    return [label for (token, postag, label) in doc]

In [4]:
with open('complex_rules_iob_input.pkl', 'rb') as f:
    data = pickle.load(f)

In [5]:
print(len(data))

1338


In [6]:
print(data)

[[('Frozen', 'JJ', 'O'), ('sections', 'NNS', 'O'), ('of', 'IN', 'O'), ('various', 'JJ', 'O'), ('green', 'JJ', 'O'), ('fluorescent', 'JJ', 'O'), ('protein', 'NN', 'O'), ('transgenic', 'JJ', 'O'), ('mouse', 'NN', 'O'), ('heads', 'NNS', 'O'), ('were', 'VBD', 'O'), ('prepared', 'VBN', 'O'), ('using', 'VBG', 'O'), ('the', 'DT', 'O'), ('film', 'NN', 'B-method'), ('method', 'NN', 'I-method'), ('developed', 'VBN', 'O'), ('by', 'IN', 'O'), ('Kawamoto', 'NNP', 'O'), ('and', 'CC', 'O'), ('Shimizu', 'NNP', 'O'), ('.', '.', 'O')], [('For', 'IN', 'O'), ('example', 'NN', 'O'), (',', ',', 'O'), ('the', 'DT', 'O'), ('film', 'NN', 'B-method'), ('method', 'NN', 'I-method'), ('is', 'VBZ', 'O'), ('useful', 'JJ', 'O'), ('for', 'IN', 'O'), ('chasing', 'VBG', 'O'), ('donor', 'NN', 'O'), ('cells', 'NNS', 'O'), ('that', 'WDT', 'O'), ('are', 'VBP', 'O'), ('labeled', 'VBN', 'O'), ('with', 'IN', 'O'), ('GFP', 'NN', 'O'), ('in', 'IN', 'O'), ('an', 'DT', 'O'), ('acceptor', 'NN', 'O'), ('body', 'NN', 'O'), ('.', '.',

In [7]:
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [8]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')


# Generate predictions
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
# for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
#     print("%s (%s)" % (y, x))

# Create a mapping of labels to indices
#labels = {"N": 1, "I": 0}

labels = {"B-method": 2, "I-method": 1, "O": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])


# Print out the classification report
print(classification_report(
    truths, predictions, digits=9,
    # target_names=["I", "N"]))
    
    target_names=["O", "I-method", "B-method"]))

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 22330
Seconds required: 0.172

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 14879.838743
Feature norm: 1.000000
Error norm: 11243.494551
Active features: 21878
Line search trials: 1
Line search step: 0.000014
Seconds required for this iteration: 0.081

***** Iteration #2 *****
Loss: 13556.246199
Feature norm: 0.883960
Error norm: 9037.204159
Active features: 20969
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.050

***** Iteration #3 *****
Loss: 12057.299636
Feature norm: 0.704829
Error norm: 5007.466824
Active features: 20697
Line search trials: 2
Line search step: 0.500000
Seconds required for t

***** Iteration #45 *****
Loss: 387.782233
Feature norm: 37.333286
Error norm: 177.023815
Active features: 5724
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #46 *****
Loss: 383.495765
Feature norm: 38.009633
Error norm: 127.055860
Active features: 5570
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.030

***** Iteration #47 *****
Loss: 379.775478
Feature norm: 38.466262
Error norm: 94.941726
Active features: 5454
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.031

***** Iteration #48 *****
Loss: 376.016149
Feature norm: 39.171682
Error norm: 140.948385
Active features: 5273
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.030

***** Iteration #49 *****
Loss: 373.617325
Feature norm: 39.722729
Error norm: 132.923623
Active features: 5122
Line search trials: 1
Line search step: 1.000000
Seconds required for

***** Iteration #99 *****
Loss: 342.371354
Feature norm: 45.908207
Error norm: 16.903928
Active features: 3213
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #100 *****
Loss: 342.111488
Feature norm: 46.019002
Error norm: 92.242400
Active features: 3170
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.031

***** Iteration #101 *****
Loss: 341.657910
Feature norm: 46.049219
Error norm: 55.871755
Active features: 3156
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #102 *****
Loss: 341.404190
Feature norm: 46.082758
Error norm: 11.681894
Active features: 3159
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #103 *****
Loss: 341.170422
Feature norm: 46.100391
Error norm: 34.244481
Active features: 3144
Line search trials: 1
Line search step: 1.000000
Seconds required for

***** Iteration #148 *****
Loss: 332.622501
Feature norm: 45.821056
Error norm: 6.729076
Active features: 2645
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.032

***** Iteration #149 *****
Loss: 332.538454
Feature norm: 45.800007
Error norm: 27.701628
Active features: 2620
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #150 *****
Loss: 332.461355
Feature norm: 45.785050
Error norm: 31.892763
Active features: 2627
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.038

***** Iteration #151 *****
Loss: 332.379804
Feature norm: 45.780734
Error norm: 14.781741
Active features: 2626
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.030

***** Iteration #152 *****
Loss: 332.305346
Feature norm: 45.765997
Error norm: 16.402920
Active features: 2631
Line search trials: 1
Line search step: 1.000000
Seconds required for

***** Iteration #190 *****
Loss: 330.497203
Feature norm: 45.859462
Error norm: 23.392466
Active features: 2512
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.037

***** Iteration #191 *****
Loss: 330.455829
Feature norm: 45.873465
Error norm: 16.624041
Active features: 2507
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.053

***** Iteration #192 *****
Loss: 330.418036
Feature norm: 45.881247
Error norm: 24.222564
Active features: 2504
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.053

***** Iteration #193 *****
Loss: 330.373016
Feature norm: 45.889772
Error norm: 14.533692
Active features: 2503
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.054

***** Iteration #194 *****
Loss: 330.336426
Feature norm: 45.899079
Error norm: 21.389566
Active features: 2500
Line search trials: 2
Line search step: 0.500000
Seconds required fo

In [9]:
trainer.logparser.iterations

[{'num': 1,
  'scores': {},
  'loss': 14879.838743,
  'feature_norm': 1.0,
  'error_norm': 11243.494551,
  'active_features': 21878,
  'linesearch_trials': 1,
  'linesearch_step': 1.4e-05,
  'time': 0.081},
 {'num': 2,
  'scores': {},
  'loss': 13556.246199,
  'feature_norm': 0.88396,
  'error_norm': 9037.204159,
  'active_features': 20969,
  'linesearch_trials': 1,
  'linesearch_step': 1.0,
  'time': 0.05},
 {'num': 3,
  'scores': {},
  'loss': 12057.299636,
  'feature_norm': 0.704829,
  'error_norm': 5007.466824,
  'active_features': 20697,
  'linesearch_trials': 2,
  'linesearch_step': 0.5,
  'time': 0.087},
 {'num': 4,
  'scores': {},
  'loss': 11334.921896,
  'feature_norm': 0.773699,
  'error_norm': 4668.352748,
  'active_features': 22077,
  'linesearch_trials': 1,
  'linesearch_step': 1.0,
  'time': 0.037},
 {'num': 5,
  'scores': {},
  'loss': 9013.571654,
  'feature_norm': 5.13446,
  'error_norm': 31298.414985,
  'active_features': 18695,
  'linesearch_trials': 1,
  'linesearc

In [10]:
import re

# testing = '''In this paper we have first presented a simple approach to extract fine-grained method sentences from large scientific corpora. We have also explored two established techniques to automatically extract method terminologies from method sentences. Our results showed that we can extract most of these terms using simple grammatical patterns. A few other terms can be extracted with machine learning techniques. A brief study of the corpus showed that the context of the method mentions can help in the extraction of important information about the method term. Our future work will then be to use the whole corpus to extract such information that is essential in the building of NLP resources such as glossaries, ontologies and specialist lexicons.'''
testing = '''ELMo is a deep contextualized word representation that models both (1) complex characteristics of word use (e.g., syntax and semantics), and (2) how these uses vary across linguistic contexts (i.e., to model polysemy). These word vectors are learned functions of the internal states of a deep bidirectional language model (biLM), which is pre-trained on a large text corpus. They can be easily added to existing models and significantly improve the state of the art across a broad range of challenging NLP problems, including question answering, textual entailment and sentiment analysis.'''
# testing = '''Scientific publications contain many references to method terminologies used during scientific experiments. New terms are constantly created within the research community, especially in the biomedical domain where thousands of papers are published each week. In this study we report our attempt to automatically extract such method terminologies from scientific research papers, using rule-based and machine learning techniques. We first used some linguistic features to extract fine-grained method sentences from a large biomedical corpus and then applied well established methodologies to extract the method terminologies. We focus the present study on the extraction of method phrases that contain an explicit mention of method keywords such as (algorithm, technique, analysis, approach and method) and other less explicit method terms such as Multiplex Ligation dependent Probe Amplification. Our initial results show an average F-score of 91.89 for the rule-based system and 78.26 for the Conditional Random Field-based machine learning system.'''
testing = re.sub(r'\[[^)]*\]|\([^)]*\)', '', testing)
testing = re.sub(r'[<>]*', '', testing)
testing = re.sub(' +', ' ', testing)
print(testing)

ELMo is a deep contextualized word representation that models both complex characteristics of word use , and how these uses vary across linguistic contexts . These word vectors are learned functions of the internal states of a deep bidirectional language model , which is pre-trained on a large text corpus. They can be easily added to existing models and significantly improve the state of the art across a broad range of challenging NLP problems, including question answering, textual entailment and sentiment analysis.


In [11]:
X_list = [x for x in [word.strip().replace(".", "").replace(",", "") for word in testing.split(" ")] if x not in ['', '.\n', ',']]

print(X_list)

test_input = extract_features(nltk.pos_tag(X_list))

for x, y in zip(X_list, tagger.tag(test_input)):
    print(x, y)


['ELMo', 'is', 'a', 'deep', 'contextualized', 'word', 'representation', 'that', 'models', 'both', 'complex', 'characteristics', 'of', 'word', 'use', 'and', 'how', 'these', 'uses', 'vary', 'across', 'linguistic', 'contexts', 'These', 'word', 'vectors', 'are', 'learned', 'functions', 'of', 'the', 'internal', 'states', 'of', 'a', 'deep', 'bidirectional', 'language', 'model', 'which', 'is', 'pre-trained', 'on', 'a', 'large', 'text', 'corpus', 'They', 'can', 'be', 'easily', 'added', 'to', 'existing', 'models', 'and', 'significantly', 'improve', 'the', 'state', 'of', 'the', 'art', 'across', 'a', 'broad', 'range', 'of', 'challenging', 'NLP', 'problems', 'including', 'question', 'answering', 'textual', 'entailment', 'and', 'sentiment', 'analysis']
ELMo O
is O
a O
deep O
contextualized O
word O
representation O
that O
models O
both O
complex O
characteristics O
of O
word O
use O
and O
how O
these O
uses O
vary O
across O
linguistic O
contexts O
These O
word O
vectors O
are O
learned O
functions

In [12]:
print(classification_report(
    truths, predictions,
    # target_names=["I", "N"]))
    
    target_names=["O", "I-method", "B-method"]))

              precision    recall  f1-score   support

           O       0.98      0.98      0.98      2988
    I-method       0.84      0.84      0.84       267
    B-method       0.79      0.83      0.81       131

    accuracy                           0.96      3386
   macro avg       0.87      0.88      0.88      3386
weighted avg       0.96      0.96      0.96      3386



In [33]:
print(confusion_matrix(truths, predictions))

[[2918   43   27]
 [  40  225    2]
 [  21    1  109]]


In [32]:
print(accuracy_score(truths, predictions))

0.9604252805670408


In [13]:
output_list = []
for i, pred, true in zip(X_test, y_pred, y_test):
    for j in range(len(i)):
#         print(list(i[j][1].split("="))[-1] + '\t\t' + pred[j] + '\t\t' + true[j])
        output_list.append([list(i[j][1].split("="))[-1], pred[j], true[j], 1 if pred[j] != true[j] and true[j] == "O" else "", 1 if pred[j] != true[j] and pred[j] == "O" else ""])
#         for k in j:
#             print(k)

print("{: >20} {: >20} {: >20} {: >20} {: >20}".format('Word', 'Prediction', 'True', 'New', "Missed"))
print("="*110)
for row in output_list:
    print("{: >20} {: >20} {: >20} {: >20} {: >20}".format(*row))

                Word           Prediction                 True                  New               Missed
             instead                    O                    O                                          
                   ,                    O                    O                                          
                   a                    O                    O                                          
            weighted             B-method             B-method                                          
               least             I-method             I-method                                          
             squares             I-method             I-method                                          
          estimation             I-method             I-method                                          
              method             I-method             I-method                                          
           suggested                    O              

                gene             B-method                    O                    1                     
          expression             I-method                    O                    1                     
            analysis             I-method                    O                    1                     
                  on                    O                    O                                          
            synovial                    O                    O                                          
            biopsies                    O                    O                                          
                   ,                    O                    O                                          
                  we                    O                    O                                          
           evaluated                    O                    O                                          
                 and                    O              

                  to                    O                    O                                          
                 the                    O                    O                                          
                 bsi                    O                    O                                          
                 and                    O                    O                                          
                  on                    O                    O                                          
                 the                    O                    O                                          
                 day                    O                    O                                          
                  of                    O                    O                                          
                 bsi                    O                    O                                          
                were                    O              

                  of                    O                    O                                          
                 our                    O                    O                                          
            recently                    O                    O                                          
           published                    O                    O                                          
           composite                    O                    O                                          
             scoring                    O                    O                                          
            function                    O                    O                                          
               qmean                    O                    O                                          
                  in                    O                    O                                          
               order                    O              

                they                    O                    O                                          
                 are                    O                    O                                          
            observed                    O                    O                                          
                   .                    O                    O                                          
                  we                    O                    O                                          
                used                    O                    O                                          
                 the                    O                    O                                          
             surname                    O                    O                                          
                 and                    O                    O                                          
                 all                    O              

                even                    O                    O                                          
           different                    O                    O                                          
               types                    O                    O                                          
                  of                    O                    O                                          
         experiments                    O                    O                                          
                   ,                    O                    O                                          
                  as                    O                    O                                          
                long                    O                    O                                          
                  as                    O                    O                                          
                 the                    O              

                 the                    O                    O                                          
             reverse             B-method             B-method                                          
             micelle             I-method             I-method                                          
              method             I-method             I-method                                          
                   .                    O                    O                                          
             linkage                    O                    O                                          
      disequilibrium                    O                    O                                          
                 and                    O                    O                                          
           haplotype                    O             B-method                                         1
            analysis                    O             I

              method             I-method             I-method                                          
                   .                    O                    O                                          
                  in                    O                    O                                          
               order                    O                    O                                          
                  to                    O                    O                                          
         investigate                    O                    O                                          
                 the                    O                    O                                          
           long-term                    O                    O                                          
             effects                    O                    O                                          
                  of                    O              

In [14]:
TP = 0
TN = 0
FN = 0
FP = 0

# print(np.array(y_pred).flatten())
# flat_list = [item for sublist in t for item in sublist]

for i, j in zip([x for y in y_pred for x in y], [x for y in y_test for x in y]):
    if (i == j):
        if (j == "O"):
            TN+=1
        elif (j == 'B-method'):
            TP+=1
    else:
        if (j == "O"):
            FN+=1
        elif (j == 'B-method'):
            FP+=1
        
print("TP:", TP)
print("TN:", TN)
print("FN:", FN)
print("FP:", FP)

precision = TP / (TP + FP)
recall = TP / (TP + FN)

f = 2 * (precision * recall)/(precision + recall)

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1-Score:", f)

TP: 109
TN: 2918
FN: 70
FP: 22

Precision: 0.8320610687022901
Recall: 0.6089385474860335
F1-Score: 0.7032258064516129


In [15]:
for word in output_list:
    if (word[3] == 1):
        if (word[1] == 'B-method'):
            print('\n\n' + word[0], end=' ')
        else:
            print(word[0], end=' ')



maximum likelihood analysis 

bit score 

lightcycler™ second derivatives method 

finite-difference time-domain method 

population model 

penalized smoothing model 

criterion method 

i-tasser method 

tennessee health science center 

gene expression analysis 

2-δδct method 

working method 

dna quantification 

standard fixpoint analysis 

modified baecke questionnaire score 

sem algorithm 

typical computational analysis 

initial low resolution model 

spatial filtering approach 

established morphometric method 

dunnett 

control method 

sem algorithm 

cd-hit algorithm 

domssea method 

modified loess method 

l-q method 