In [1]:
# Code to evaluate EPMC annotations against the manual annotations
# Code to compare results from ML methods with EPMC annotations

# (c) EMBL-EBI, September 2019
#
# Started: 23 September 2019
# Updated: 24 September 2019

_author_ = 'Santosh Tirunagari'

import os
import pandas as pd
import glob
import json
import csv
import sys

import multiprocessing

import numpy as np
import re

from nltk.tokenize import wordpunct_tokenize

import requests
# from pprint import pprint
import pandas as pd

from collections import defaultdict, Counter
import time
from requests.compat import urljoin

from tqdm import tqdm

In [2]:
# Folder holding the EBI standard NER dataset. The test split is read as a
# tab-separated file whose two columns are named explicitly here.
EBI_data_folder = '/nfs/gns/literature/Santosh_Tirunagari/EBI standard Dataset/NER/'
test_df = pd.read_csv(
    os.path.join(EBI_data_folder, 'test_text_format.csv'),
    sep='\t',
    names=['Sentences', 'true_ner'],
)

In [3]:
from flair.models import TextClassifier
from flair.data import Sentence, Token
from flair.models import SequenceTagger

# Root folder with one trained flair sequence tagger per entity type.
flair_models = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/'

# Load the trained taggers (genes/proteins, diseases, organisms).
gene_model = SequenceTagger.load(
    os.path.join(flair_models, 'multi_bio_ner_model_gene/v01/', 'best-model.pt')
)
disease_model = SequenceTagger.load(
    os.path.join(flair_models, 'multi_bio_ner_model_disease/', 'best-model.pt')
)
organisms_model = SequenceTagger.load(
    os.path.join(flair_models, 'multi_bio_ner_model_organisms/v01/', 'best-model.pt')
)


2019-10-08 17:28:14,775 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model_gene/v01/best-model.pt
2019-10-08 17:28:20,508 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model_disease/best-model.pt
2019-10-08 17:28:22,906 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model_organisms/v01/best-model.pt


In [4]:
def tag_sentence(sentence):
    """Run the gene, disease and organism taggers over one sentence.

    A fresh ``Sentence`` is built for each model so that the predictions of
    one tagger are never stored on the object handed to the next.

    Parameters
    ----------
    sentence : str
        Text passed to ``Sentence(...)``.

    Returns
    -------
    tuple
        ``(ml_target, ml_disease, ml_organism)``: the ``'ner'`` spans found
        by ``gene_model``, ``disease_model`` and ``organisms_model``.
    """
    spans_per_model = []
    for tagger in (gene_model, disease_model, organisms_model):
        tagged = Sentence(sentence)
        tagger.predict(tagged)
        spans_per_model.append(tagged.get_spans('ner'))

    ml_target, ml_disease, ml_organism = spans_per_model
    return ml_target, ml_disease, ml_organism

def get_ents(ml_ent_type, ent_type):
    """Pair the surface text of every predicted entity with a type label.

    Parameters
    ----------
    ml_ent_type : list of span objects, or any other object
        Normally the list returned by ``Sentence.get_spans``. When every
        element exposes a ``text`` attribute that attribute is used directly.
        Anything else is converted with ``str()`` and the double-quoted
        substrings are extracted, which is how the entity text appears in the
        printed form of a span.
    ent_type : str
        Label attached to every entity (e.g. ``'GP'``, ``'DS'``, ``'OG'``).

    Returns
    -------
    list of [str, str]
        One ``[entity_text, ent_type]`` pair per entity, in input order.
    """
    # Reading span.text avoids parsing the printed representation, which
    # truncates or splits any entity whose own text contains a double quote.
    if isinstance(ml_ent_type, (list, tuple)) and all(
        hasattr(span, 'text') for span in ml_ent_type
    ):
        return [[span.text, ent_type] for span in ml_ent_type]

    # Fallback for inputs that are not span objects: take the quoted pieces
    # of the string form.
    quoted = re.findall(r'"(.*?)"', str(ml_ent_type))
    return [[text, ent_type] for text in quoted]


def find_sub_list(sl,l):
    """Find every position at which the list ``sl`` occurs contiguously in ``l``.

    Parameters
    ----------
    sl : list
        The sub-list to look for.
    l : list
        The list to search. Slices of it are compared to ``sl`` with ``==``.

    Returns
    -------
    list of (int, int)
        ``(start, end)`` index pairs, end exclusive, in increasing order of
        start. Overlapping occurrences are all reported. An empty ``sl``
        yields an empty list instead of raising ``IndexError``.
    """
    sll = len(sl)
    if sll == 0:
        # Nothing to search for; indexing sl[0] below would fail.
        return []

    first = sl[0]
    return [
        (ind, ind + sll)
        for ind, e in enumerate(l)
        # Cheap first-element test before the slice comparison.
        if e == first and l[ind:ind + sll] == sl
    ]


def convert2IOB_dict(text_data,ner_tags):
    """Label every token of ``text_data`` with an IOB tag built from ``ner_tags``.

    The text and each entity string are tokenized with ``wordpunct_tokenize``.
    Every occurrence of an entity's token sequence in the text is tagged: the
    first token gets ``B-<label>`` and the following tokens ``I-<label>``.
    Entities are applied in the order given, so where two entities cover the
    same tokens the later one overwrites the earlier one.

    Parameters
    ----------
    text_data : str
        Sentence text.
    ner_tags : iterable of [str, str]
        ``[entity_text, label]`` pairs.

    Returns
    -------
    zip
        Single-use iterator of ``(token, tag)`` pairs, one per token of
        ``text_data``; tokens not covered by any entity are tagged ``'O'``.
    """
    split_text = wordpunct_tokenize(text_data)
    # Every token starts outside any entity.
    arr = ['O'] * len(split_text)

    for each_tag in ner_tags:
        token_list = wordpunct_tokenize(each_tag[0])
        ner_list = wordpunct_tokenize(each_tag[1])

        # An entity text that tokenizes to nothing cannot be located
        # (find_sub_list indexes its first token), and an empty label would
        # delete tokens' tags from arr. Skip both.
        if not token_list or not ner_list:
            continue

        # Repeat the label so that there is one entry per entity token.
        if len(token_list) > len(ner_list):
            ner_list = len(token_list) * ner_list

        # First entry opens the entity (B-), the rest continue it (I-).
        labels = [
            ('B-' if i == 0 else 'I-') + label
            for i, label in enumerate(ner_list)
        ]

        for start, end in find_sub_list(token_list, split_text):
            # Cap the labels at the span length so arr always keeps exactly
            # one tag per token of split_text.
            arr[start:end] = labels[:end - start]

    return zip(split_text, arr)

In [5]:
from nltk.tokenize import wordpunct_tokenize

result_path = '/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'

# Write the model predictions as one "token<TAB>tag" row per token, with an
# empty row after each sentence.
# The file is opened with 'w', not 'a': the loop always covers the whole test
# set, so appending on a re-run would duplicate every row and the predictions
# would no longer line up with the manually annotated file during evaluation.
with open(result_path+'Public_annotated_test.csv','w',  newline='\n') as f1:
    public_writer=csv.writer(f1, delimiter='\t',lineterminator='\n')
    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        # Normalise the sentence to space-separated wordpunct tokens so the
        # taggers and the IOB conversion operate on the same tokenization.
        text = ' '.join(wordpunct_tokenize(row['Sentences']))
        ml_target, ml_disease, ml_organism = tag_sentence(text)

        # Collect [entity_text, label] pairs from the three taggers.
        all_entities = []
        for spans, label in ((ml_target, 'GP'), (ml_disease, 'DS'), (ml_organism, 'OG')):
            if spans:
                all_entities.extend(get_ents(spans, label))

        tagged_tokens = convert2IOB_dict(text,all_entities)

        for each_word in tagged_tokens:
            public_writer.writerow(list(each_word))
        # Empty row marks the sentence boundary.
        public_writer.writerow([])
            


100%|██████████| 3197/3197 [4:24:08<00:00,  4.96s/it]   


In [10]:
from ast import literal_eval
# NOTE(review): `row` is not defined in this cell. It is whatever the most
# recently executed `for index, row in ...` loop left behind, so the value
# shown depends on execution order. Confirm which row is intended.
ner = row['true_ner']
# Display the raw annotation string for inspection.
ner

"[[0, 11, 'Keratoconus', 'DS'], [67, 76, 'cytokines', 'GP']]"

In [12]:
# Keep only elements 2 and 3 (entity text and label) of each annotation.
# literal_eval parses the list literal without executing arbitrary code the
# way eval would.
[x[2:4] for x in literal_eval(ner)]

[['Keratoconus', 'DS'], ['cytokines', 'GP']]

In [13]:
from nltk.tokenize import wordpunct_tokenize
from ast import literal_eval

result_path = '/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'

# Write the manual annotations in the same "token<TAB>tag" layout as the
# prediction file, with an empty row after each sentence.
# The file is opened with 'w', not 'a': the loop always covers the whole test
# set, so appending on a re-run would duplicate every row and the two files
# would no longer line up during evaluation.
with open(result_path+'test_manual_annotated_on_public.csv','w',  newline='\n') as f1:
    public_writer=csv.writer(f1, delimiter='\t',lineterminator='\n')
    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        # Same tokenization as used for the prediction file.
        text = ' '.join(wordpunct_tokenize(row['Sentences']))

        # 'true_ner' holds a Python list literal stored as text. Judging by
        # the sample shown earlier, each entry looks like
        # [start, end, text, label]; only text and label are kept.
        # literal_eval is used instead of eval so that file content is parsed,
        # never executed.
        true_ner = [x[2:4] for x in literal_eval(row['true_ner'])]

        tagged_tokens = convert2IOB_dict(text,true_ner)

        for each_word in tagged_tokens:
            public_writer.writerow(list(each_word))
        # Empty row marks the sentence boundary.
        public_writer.writerow([])

100%|██████████| 3197/3197 [00:01<00:00, 3036.32it/s]


In [14]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

# Gold and predicted tags, one row per token.
true_lbs = pd.read_csv(result_path+'test_manual_annotated_on_public.csv', sep='\t', names=['tokens','tags'])
pred_lbs = pd.read_csv(result_path+'Public_annotated_test.csv', sep='\t', names=['tokens','tags'])

y_true = true_lbs['tags'].values
y_pred =  pred_lbs['tags'].values

# The comparison is positional, so both files must contain the same number of
# token rows (a mismatch usually means one file was written more than once).
assert len(y_true) == len(y_pred), (
    'gold and predicted files differ in length: %d vs %d' % (len(y_true), len(y_pred))
)

# Score every tag except 'O', grouped by entity type with B- before I-.
class_labels = sorted([tag for tag in set(y_true) if tag != 'O'], key=lambda name: (name[1:], name[0]))

# Print the formatted text report. The dict form (output_dict=True) is meant
# for building a DataFrame, not for printing.
print(classification_report(y_true, y_pred, labels = class_labels))


#               precision    recall  f1-score   support

#         B-DS       0.74      0.53      0.62      1323
#         I-DS       0.84      0.70      0.76       486
#         B-GP       0.76      0.52      0.62      3056
#         I-GP       0.89      0.41      0.56      1989
#         B-OG       0.93      0.69      0.79      1960
#         I-OG       0.98      0.51      0.68       968

#    micro avg       0.83      0.54      0.66      9782
#    macro avg       0.85      0.56      0.67      9782
# weighted avg       0.84      0.54      0.65      9782

              precision    recall  f1-score   support

        B-DS       0.68      0.74      0.70      1310
        I-DS       0.61      0.71      0.66       536
        B-GP       0.81      0.82      0.81      3367
        I-GP       0.57      0.87      0.69      2245
        B-OG       0.77      0.57      0.65      2545
        I-OG       0.85      0.74      0.79      1251

   micro avg       0.71      0.75      0.73     11254
   macro avg       0.72      0.74      0.72     11254
weighted avg       0.73      0.75      0.73     11254



In [19]:
# Same report as a nested dict, turned into a table with one row per tag or
# average and one column per metric.
result_dict = classification_report(y_true, y_pred, labels = class_labels, output_dict=True)
result_df = pd.DataFrame(result_dict).T

In [20]:
# Display the per-tag metrics table.
result_df

Unnamed: 0,precision,recall,f1-score,support
B-DS,0.675316,0.735115,0.703947,1310.0
I-DS,0.614767,0.714552,0.660915,536.0
B-GP,0.809944,0.817642,0.813775,3367.0
I-GP,0.574726,0.865033,0.690612,2245.0
B-OG,0.771444,0.568959,0.654907,2545.0
I-OG,0.845943,0.741807,0.79046,1251.0
micro avg,0.713245,0.747912,0.730167,11254.0
macro avg,0.715357,0.740518,0.719103,11254.0
weighted avg,0.73335,0.747912,0.730623,11254.0


In [50]:
# Rows of result_df that make up each line of the summary. For an entity type
# the B- and I- rows are combined with a plain (unweighted) column mean; the
# 'micro avg' and 'macro avg' rows are carried over as they are.
# 'weighted avg' is left out.
rows_per_entry = {
    'DS': ['B-DS', 'I-DS'],
    'GP': ['B-GP', 'I-GP'],
    'OG': ['B-OG', 'I-OG'],
    'micro avg': ['micro avg'],
    'macro avg': ['macro avg'],
}
entity_index = list(rows_per_entry)
report_list = [
    result_df.loc[row_names].mean(axis=0)
    for row_names in rows_per_entry.values()
]

# One Series per entry -> one row per entry.
report_df = pd.concat(report_list, axis=1).T
report_df.index = entity_index
print(report_df[['precision','recall','f1-score']])

           precision    recall  f1-score
DS          0.645041  0.724833  0.682431
GP          0.692335  0.841338  0.752193
OG          0.808694  0.655383  0.722684
micro avg   0.713245  0.747912  0.730167
macro avg   0.715357  0.740518  0.719103


In [58]:
# Reload both tag files so this cell does not depend on earlier in-memory state.
true_lbs = pd.read_csv(result_path+'test_manual_annotated_on_public.csv', sep='\t', names=['tokens','tags'])
pred_lbs = pd.read_csv(result_path+'Public_annotated_test.csv', sep='\t', names=['tokens','tags'])

# Drop the B-/I- prefixes so tokens are scored per entity type.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment and leaves the frame unchanged
# when pandas Copy-on-Write is active.
true_lbs['tags'] = true_lbs['tags'].replace('B-|I-','',regex=True)
pred_lbs['tags'] = pred_lbs['tags'].replace('B-|I-','',regex=True)

y_true = true_lbs['tags'].values
y_pred =  pred_lbs['tags'].values


# Same sort key as used for the IOB report, kept so the label order is unchanged.
class_labels = sorted([tag for tag in set(y_true) if tag != 'O'], key=lambda name: (name[1:], name[0]))
print(classification_report(y_true, y_pred, labels = class_labels))

In [59]:
# Re-compute the report from the frames already in memory.
# NOTE(review): assumes true_lbs / pred_lbs still hold the prefix-stripped
# tags produced by the preceding cell. Confirm before running in isolation.
y_true = true_lbs['tags'].values
y_pred = pred_lbs['tags'].values

# Every tag except 'O', ordered by the same key as in the earlier reports.
entity_tags = set(y_true) - {'O'}
class_labels = sorted(entity_tags, key=lambda name: (name[1:], name[0]))

report_text = classification_report(y_true, y_pred, labels = class_labels)
print(report_text)

              precision    recall  f1-score   support

          OG       0.82      0.64      0.72      3796
          GP       0.73      0.89      0.80      5612
          DS       0.70      0.77      0.73      1846

   micro avg       0.75      0.79      0.77     11254
   macro avg       0.75      0.77      0.75     11254
weighted avg       0.76      0.79      0.76     11254

