In [1]:
# Code to evaluate EPMC annotation to the Manual Annotation
# Code to compare results from ML methods and EPMC annotation

# (c) EMBL-EBI, September 2019
#
# Started: 23 September 2019
# Updated: 24 September 2019

_author_ = 'Santosh Tirunagari'

import os
import pandas as pd
import glob
import json
import csv
import sys

import multiprocessing

import numpy as np
import re

from nltk.tokenize import wordpunct_tokenize

import requests
# from pprint import pprint
import pandas as pd

from collections import defaultdict, Counter
import time
from requests.compat import urljoin

from tqdm import tqdm

In [2]:
# Generate train, dev and test PMC ids from a shuffle seeded with 2222.
# The ids are read one per line; the line *positions* are shuffled and cut
# into 70% train, and the remainder split evenly into dev and test.
import math
import random
file = '/nfs/gns/literature/Santosh_Tirunagari/EBI standard Dataset/NER/list_pmc_ids.csv'
percentage=0.70

trainPMCids = []
devPMCids = []
testPMCids = []
try:
    with open(file, 'r', encoding="utf-8") as fin:
        allPMCids = fin.readlines()
except OSError:
    # Only a file-access failure triggers the fallback to the alternative
    # mount point; any other error is allowed to surface.
    with open('/mnt/droplet'+file, 'r', encoding="utf-8") as fin:
        allPMCids = fin.readlines()

nLines = len(allPMCids)
nTrain = int(nLines*percentage)
nValid = math.floor((nLines - nTrain)/2)
nTest = nLines - (nTrain+nValid)

deck = list(range(0, nLines))
random.seed(2222) # Please don't change the seed, for reproducibility
random.shuffle(deck)

train_ids = deck[0:nTrain]
devel_ids = deck[nTrain:nTrain+nValid]
test_ids = deck[nTrain+nValid:nTrain+nValid+nTest]

# Sets give constant-time membership tests; the assignment of each line to a
# split is unchanged, and ids keep their original file order within a split.
train_positions = set(train_ids)
devel_positions = set(devel_ids)

for line_position, each_pmc_id in enumerate(allPMCids):
    if line_position in train_positions:
        trainPMCids.append(each_pmc_id.strip())
    elif line_position in devel_positions:
        devPMCids.append(each_pmc_id.strip())
    else:
        # Everything that is neither train nor dev belongs to the test split.
        testPMCids.append(each_pmc_id.strip())

In [3]:
def get_Json_through_PMCID(pmcid, timeout=30):
    """Fetch the Europe PMC annotations of one article.

    Parameters
    ----------
    pmcid : str or int
        Numeric part of the PMC identifier (without the ``PMC`` prefix).
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot block the whole run.

    Returns
    -------
    requests.Response or False
        The response object when the server answers with HTTP 200, otherwise
        ``False`` (non-200 status, timeout or any other request failure).
    """
    base_url = "https://www.ebi.ac.uk/europepmc/annotations_api/"
    article_url = urljoin(
        base_url,
        "annotationsByArticleIds?articleIds=PMC%3A" + str(pmcid) + "&provider=Europe%20PMC&format=JSON",
    )

    try:
        r = requests.get(article_url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # Report the failure instead of hiding it, then use the same
        # "no result" value callers already handle for non-200 answers.
        print("Request failed for PMC" + str(pmcid) + ": " + str(err), file=sys.stderr)
        return False

    if r.status_code == 200:
        return r
    return False
    

In [4]:
# result_path = '/mnt/droplet/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'
result_path = '/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'

# Download the Europe PMC annotations of every test article and store them as
# tab-separated rows: pmc id, annotation with its surrounding context, tag
# name, annotation type.
# The encoding is explicit so the file is written as UTF-8 regardless of the
# locale (pandas reads CSV files as UTF-8 by default).
with open(result_path+'EPMC_annotations_.csv', 'w', newline='', encoding='utf-8') as f1:
    test_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

    for each_test_pmc_id in testPMCids:
        # Just the number is needed, so remove the leading "PMC"
        ss = get_Json_through_PMCID(each_test_pmc_id[3:])
        if not ss:
            continue

        json_results = ss.json()
        if not json_results:
            # An empty result list carries nothing to index into; skip the article.
            continue

        article = json_results[0]
        pmc_id = article['pmcid']
        print(pmc_id)
        for each_annotation in article['annotations']:
            # prefix + exact + postfix gives the annotated text with its context
            exact = each_annotation['prefix'] + each_annotation['exact'] + each_annotation['postfix']
            token = each_annotation['tags'][0]['name']
            ner = each_annotation['type']

            row = [pmc_id, exact, token, ner]
            test_writer.writerow(row)
            
# PMC4556948
# PMC3651197
# PMC5502978
# PMC5259676
# PMC3613406
# PMC3972685
# PMC4618948
# PMC1762380
# PMC4540425
# PMC3960246
# PMC5770482
# PMC4766309
# PMC3031208
# PMC3316545
# PMC5510223
# PMC2727484
# PMC3174205
# PMC2761781
# PMC5666160
# PMC6037156
# PMC1971115
# PMC4352028
# PMC3844564
# PMC4244103
# PMC4753424
# PMC4697806
# PMC3029330
# PMC5078810

PMC3174205
PMC4552872
PMC5972578
PMC5006041
PMC4978644
PMC5891899
PMC5120353
PMC3291930
PMC4768280
PMC4302291
PMC3949526
PMC5731848
PMC4244103
PMC3611597
PMC4556948
PMC5962829
PMC3024232
PMC3938772
PMC2474741
PMC3260253
PMC4973533
PMC6008929
PMC2478677
PMC4791522
PMC4753424
PMC4697806
PMC3029330
PMC5078810


In [5]:
# Manually curated annotations (tab-separated, no header row).
manual_annot_csv = pd.read_csv(
    '/nfs/gns/literature/Santosh_Tirunagari/EBI standard Dataset/CSV/manual_annot_exacts_180.csv',
    sep='\t',
    names=['pmc_id', 'sent_id', 'sentence', 'ner', 'relation'],
)
# Keep only the sentences that carry at least one annotation.
manual_annot_csv = manual_annot_csv.loc[manual_annot_csv['ner'].ne('No-Ner')]

# Europe PMC annotations downloaded earlier in this notebook.
EPMC_annot_csv = pd.read_csv(
    result_path + 'EPMC_annotations_.csv',
    sep='\t',
    names=['pmc_id', 'sentence', 'token', 'ner'],
)

In [6]:
def sentences_tags(pmc_id, EPMC_annot_csv, manual_annot_csv):
    """Attach the Europe PMC tags of one article to its manually annotated sentences.

    Each Europe PMC row holds a text snippet (column ``sentence``) together
    with a ``token`` and a ``ner`` type.  A snippet is assigned to the first
    manually annotated sentence of the same article that contains it as a
    substring; snippets without such a sentence are collected under the
    placeholder sentence ``'None'``.

    Parameters
    ----------
    pmc_id : str
        Article identifier matched against the ``pmc_id`` column of both frames.
    EPMC_annot_csv : pandas.DataFrame
        Frame with columns ``pmc_id``, ``sentence``, ``token`` and ``ner``.
    manual_annot_csv : pandas.DataFrame
        Frame with at least the columns ``pmc_id`` and ``sentence``.

    Returns
    -------
    pandas.DataFrame
        The manual rows of the article (index reset, ``sentence`` renamed to
        ``full_sentence``) left-joined with a ``tags`` column.  ``tags`` is a
        list of ``[token, ner]`` pairs, or missing when no Europe PMC snippet
        was matched to that sentence.
    """
    epmc_rows = EPMC_annot_csv[EPMC_annot_csv['pmc_id'] == pmc_id]
    manual_rows = manual_annot_csv[manual_annot_csv['pmc_id'] == pmc_id]

    # Non-string cells (e.g. missing values) can never contain a snippet.
    manual_sentences = [s for s in manual_rows['sentence'].tolist() if isinstance(s, str)]

    full_sentences = []
    for snippet in tqdm(epmc_rows['sentence'].tolist()):
        matched = 'None'
        if isinstance(snippet, str):
            # Plain substring test: first manual sentence containing the snippet.
            matched = next((s for s in manual_sentences if snippet in s), 'None')
        full_sentences.append(matched)

    if epmc_rows.empty:
        # Nothing to group: build the empty lookup explicitly so the merge
        # below still yields a 'tags' column filled with missing values.
        sent_tags = pd.DataFrame({
            'full_sentence': pd.Series(dtype=object),
            'tags': pd.Series(dtype=object),
        })
    else:
        combined = pd.Series(
            [[tok, ner] for tok, ner in zip(epmc_rows['token'], epmc_rows['ner'])],
            index=epmc_rows.index,
        )
        new_epmc = epmc_rows.assign(full_sentence=full_sentences, combined=combined)
        sent_tags = (
            new_epmc.groupby('full_sentence')['combined']
            .apply(list)
            .reset_index(name='tags')
        )

    new_man_annot = manual_rows.reset_index().rename(columns={'sentence': 'full_sentence'})
    epmc_sentence_tags = pd.merge(new_man_annot, sent_tags, on='full_sentence', how='left')

    return epmc_sentence_tags
        
        

In [7]:
from ast import literal_eval

def find_sub_list(sl, l):
    """Locate every occurrence of the list ``sl`` inside the list ``l``.

    Parameters
    ----------
    sl : list
        Sub-list to look for.
    l : list
        List to search in.

    Returns
    -------
    list of (int, int)
        ``(start, end)`` index pairs such that ``l[start:end] == sl``, in
        increasing order of ``start``.  Overlapping occurrences are all
        reported.  An empty ``sl`` yields no matches.
    """
    sll = len(sl)
    if sll == 0:
        # An empty pattern has no first element to anchor on; report nothing.
        return []

    return [
        (ind, ind + sll)
        for ind in range(len(l) - sll + 1)
        if l[ind:ind + sll] == sl
    ]


def _iob_tag_tokens(split_text, entities):
    """Return one IOB tag per token of ``split_text``.

    ``entities`` is an iterable of ``(entity_text, label)`` pairs.  Every
    occurrence of the tokenised entity text inside ``split_text`` is tagged
    ``B-<label>`` on its first token and ``I-<label>`` on the following ones;
    all other tokens stay ``'O'``.  Later entities overwrite earlier ones
    where their spans overlap.
    """
    arr = ['O'] * len(split_text)

    for entity_text, label in entities:
        if not isinstance(entity_text, str):
            # e.g. a missing value read back from a CSV file
            continue
        token_list = wordpunct_tokenize(entity_text)
        if not token_list:
            continue

        label = str(label).strip()
        # Exactly one tag per entity token, so the slice assignment below can
        # never change the length of ``arr`` and shift the following tags.
        ner_list = ['B-' + label] + ['I-' + label] * (len(token_list) - 1)

        for start, end in find_sub_list(token_list, split_text):
            arr[start:end] = ner_list

    return arr


def convert2IOB_dict_manual_annotations(text_data,ner_tags):
    """Tokenise a sentence and tag it in IOB format from manual annotations.

    Parameters
    ----------
    text_data : str
        The sentence to tokenise with ``wordpunct_tokenize``.
    ner_tags : str or list
        Either the marker ``'No-Ner'`` (no annotation), or the annotations as
        a Python-literal string / list whose items hold the entity text at
        index 2 and the entity label at index 3.

    Returns
    -------
    zip
        Iterator of ``(token, tag)`` pairs, one per token of the sentence.
    """
    split_text = wordpunct_tokenize(text_data)

    entities = []
    if isinstance(ner_tags, str):
        if ner_tags != 'No-Ner':
            ner_tags = literal_eval(ner_tags)
            entities = [(each_tag[2], each_tag[3]) for each_tag in ner_tags]
    else:
        entities = [(each_tag[2], each_tag[3]) for each_tag in ner_tags]

    return zip(split_text, _iob_tag_tokens(split_text, entities))


def convert2IOB_dict_EPMC_annotations(text_data,ner_tags):
    """Tokenise a sentence and tag it in IOB format from Europe PMC annotations.

    Parameters
    ----------
    text_data : str
        The sentence to tokenise with ``wordpunct_tokenize``.
    ner_tags : str or list
        Either the marker ``'No-Ner'`` (no annotation), or a list whose items
        hold the entity text at index 0 and the entity type at index 1.

    Returns
    -------
    zip
        Iterator of ``(token, tag)`` pairs, one per token of the sentence.
    """
    split_text = wordpunct_tokenize(text_data)

    entities = []
    if not (isinstance(ner_tags, str) and ner_tags == 'No-Ner'):
        entities = [(each_tag[0], each_tag[1]) for each_tag in ner_tags]

    return zip(split_text, _iob_tag_tokens(split_text, entities))

In [8]:
from nltk.tokenize import wordpunct_tokenize

result_path = '/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'

# Europe PMC annotation types rewritten to the short labels used by the manual
# annotation; every tag not listed here is written as 'O'.
EPMC_TO_MANUAL_TAG = {
    'B-Diseases': 'B-DS',
    'I-Diseases': 'I-DS',
    'B-Organisms': 'B-OG',
    'I-Organisms': 'I-OG',
    'B-Gene_Proteins': 'B-GP',
    'I-Gene_Proteins': 'I-GP',
}

# Write the test sentences twice, once tagged from the Europe PMC annotations
# and once from the manual annotations, as tab-separated "token<TAB>tag" rows
# with an empty line after every sentence.
# The encoding is explicit so both files are UTF-8 regardless of the locale
# (pandas reads CSV files as UTF-8 by default).
with open(result_path+'EPMC_annotated_test.csv', 'w', newline='', encoding='utf-8') as f1, \
        open(result_path+'manual_annotated_test.csv', 'w', newline='', encoding='utf-8') as f2:
    epmc_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
    manual_writer = csv.writer(f2, delimiter='\t', lineterminator='\n')

    for each_pmc_id in testPMCids:
        print(each_pmc_id)
        ss_ = sentences_tags(each_pmc_id, EPMC_annot_csv, manual_annot_csv)
        # Sentences without Europe PMC tags get the 'No-Ner' marker.
        ss = ss_.where((pd.notnull(ss_)), 'No-Ner')
        for index, row in tqdm(ss.iterrows(), total=ss.shape[0]):
            tagged_tokens = convert2IOB_dict_EPMC_annotations(row['full_sentence'], row['tags'])
            for each_word in tagged_tokens:
                # Bring the tag into the manual annotation format for easy comparison
                epmc_tokens = list(each_word)
                epmc_tokens[1] = EPMC_TO_MANUAL_TAG.get(epmc_tokens[1], 'O')
                epmc_writer.writerow(epmc_tokens)
            epmc_writer.writerow([])  # sentence separator (empty line)

            manual_tagged_tokens = convert2IOB_dict_manual_annotations(row['full_sentence'], row['ner'])
            for each_word in manual_tagged_tokens:
                manual_writer.writerow(list(each_word))
            manual_writer.writerow([])  # sentence separator (empty line)

 28%|██▊       | 109/390 [00:00<00:00, 1081.65it/s]

PMC3174205


100%|██████████| 390/390 [00:00<00:00, 1118.37it/s]
100%|██████████| 130/130 [00:00<00:00, 2474.89it/s]
100%|██████████| 58/58 [00:00<00:00, 1862.34it/s]
100%|██████████| 68/68 [00:00<00:00, 2785.09it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

PMC4552872
PMC5972578


100%|██████████| 215/215 [00:00<00:00, 2101.08it/s]
100%|██████████| 45/45 [00:00<00:00, 1782.45it/s]
100%|██████████| 217/217 [00:00<00:00, 1464.67it/s]


PMC5006041


100%|██████████| 95/95 [00:00<00:00, 2596.30it/s]
100%|██████████| 188/188 [00:00<00:00, 2001.41it/s]
100%|██████████| 47/47 [00:00<00:00, 1864.95it/s]
  0%|          | 0/606 [00:00<?, ?it/s]

PMC4978644
PMC5891899


100%|██████████| 606/606 [00:00<00:00, 982.91it/s]
100%|██████████| 141/141 [00:00<00:00, 2189.65it/s]
  8%|▊         | 65/843 [00:00<00:01, 644.10it/s]

PMC5120353


100%|██████████| 843/843 [00:01<00:00, 660.98it/s]
100%|██████████| 241/241 [00:00<00:00, 2331.51it/s]
100%|██████████| 52/52 [00:00<00:00, 234772.67it/s]
0it [00:00, ?it/s]
100%|██████████| 83/83 [00:00<00:00, 1096.86it/s]
100%|██████████| 81/81 [00:00<00:00, 2517.07it/s]
  0%|          | 0/435 [00:00<?, ?it/s]

PMC3291930
PMC4768280
PMC4302291


100%|██████████| 435/435 [00:00<00:00, 1963.19it/s]
100%|██████████| 56/56 [00:00<00:00, 2513.44it/s]
100%|██████████| 196/196 [00:00<00:00, 2074.33it/s]
100%|██████████| 38/38 [00:00<00:00, 1668.90it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

PMC3949526
PMC5731848


100%|██████████| 1000/1000 [00:01<00:00, 794.44it/s]
100%|██████████| 184/184 [00:00<00:00, 1691.93it/s]
100%|██████████| 520/520 [00:00<00:00, 4961.25it/s]
100%|██████████| 6/6 [00:00<00:00, 1888.48it/s]
  0%|          | 0/441 [00:00<?, ?it/s]

PMC4244103
PMC3611597


100%|██████████| 441/441 [00:00<00:00, 1011.82it/s]
100%|██████████| 136/136 [00:00<00:00, 2051.37it/s]
 13%|█▎        | 58/441 [00:00<00:00, 571.10it/s]

PMC4556948


100%|██████████| 441/441 [00:00<00:00, 632.41it/s]
100%|██████████| 266/266 [00:00<00:00, 2906.01it/s]
100%|██████████| 152/152 [00:00<00:00, 1284.04it/s]
100%|██████████| 77/77 [00:00<00:00, 2715.88it/s]
  0%|          | 0/431 [00:00<?, ?it/s]

PMC5962829
PMC3024232


100%|██████████| 431/431 [00:00<00:00, 782.85it/s]
100%|██████████| 211/211 [00:00<00:00, 2756.51it/s]
 31%|███       | 83/270 [00:00<00:00, 829.63it/s]

PMC3938772


100%|██████████| 270/270 [00:00<00:00, 867.06it/s]
100%|██████████| 168/168 [00:00<00:00, 2710.83it/s]
100%|██████████| 228/228 [00:00<00:00, 1550.19it/s]
  0%|          | 0/73 [00:00<?, ?it/s]

PMC2474741


100%|██████████| 73/73 [00:00<00:00, 1999.94it/s]
100%|██████████| 198/198 [00:00<00:00, 1156.20it/s]


PMC3260253


100%|██████████| 105/105 [00:00<00:00, 2351.44it/s]
100%|██████████| 294/294 [00:00<00:00, 1753.63it/s]


PMC4973533


100%|██████████| 54/54 [00:00<00:00, 1545.23it/s]
 22%|██▏       | 90/414 [00:00<00:00, 891.90it/s]

PMC6008929


100%|██████████| 414/414 [00:00<00:00, 876.12it/s]
100%|██████████| 143/143 [00:00<00:00, 1476.45it/s]
 29%|██▉       | 87/296 [00:00<00:00, 866.36it/s]

PMC2478677


100%|██████████| 296/296 [00:00<00:00, 1014.87it/s]
100%|██████████| 134/134 [00:00<00:00, 2625.17it/s]
 39%|███▉      | 133/342 [00:00<00:00, 1315.77it/s]

PMC4791522


100%|██████████| 342/342 [00:00<00:00, 1342.09it/s]
100%|██████████| 93/93 [00:00<00:00, 2000.37it/s]
100%|██████████| 176/176 [00:00<00:00, 1600.97it/s]
100%|██████████| 91/91 [00:00<00:00, 2229.93it/s]
  0%|          | 0/533 [00:00<?, ?it/s]

PMC4753424
PMC4697806


100%|██████████| 533/533 [00:00<00:00, 753.80it/s]
100%|██████████| 194/194 [00:00<00:00, 1721.58it/s]
 13%|█▎        | 46/362 [00:00<00:00, 450.38it/s]

PMC3029330


100%|██████████| 362/362 [00:00<00:00, 704.40it/s]
100%|██████████| 179/179 [00:00<00:00, 2630.78it/s]
100%|██████████| 160/160 [00:00<00:00, 481067.13it/s]
0it [00:00, ?it/s]

PMC5078810





In [9]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Token-level comparison of the Europe PMC tags (predictions) against the
# manual annotation (ground truth).
true_lbs = pd.read_csv(result_path+'manual_annotated_test.csv', sep='\t', names=['tokens','tags'])
pred_lbs = pd.read_csv(result_path+'EPMC_annotated_test.csv', sep='\t', names=['tokens','tags'])

# The scores are computed position by position, so both files must list
# exactly the same tokens in the same order.
if len(true_lbs) != len(pred_lbs) or not true_lbs['tokens'].equals(pred_lbs['tokens']):
    raise ValueError(
        "manual_annotated_test.csv and EPMC_annotated_test.csv are not token-aligned "
        "(%d vs %d rows)" % (len(true_lbs), len(pred_lbs))
    )

y_true = true_lbs['tags'].values
y_pred = pred_lbs['tags'].values

# Report every entity tag found in the ground truth except 'O', ordered by
# entity type and then by B/I prefix. Non-string values (missing tags) are
# left out because they cannot be sliced by the sort key.
class_labels = sorted(
    [tag for tag in set(y_true) if isinstance(tag, str) and tag != 'O'],
    key=lambda name: (name[1:], name[0]),
)

from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, labels = class_labels))


#               precision    recall  f1-score   support

#         B-DS       0.74      0.53      0.62      1323
#         I-DS       0.84      0.70      0.76       486
#         B-GP       0.76      0.52      0.62      3056
#         I-GP       0.89      0.41      0.56      1989
#         B-OG       0.93      0.69      0.79      1960
#         I-OG       0.98      0.51      0.68       968

#    micro avg       0.83      0.54      0.66      9782
#    macro avg       0.85      0.56      0.67      9782
# weighted avg       0.84      0.54      0.65      9782

              precision    recall  f1-score   support

        B-DS       0.64      0.56      0.60      1154
        I-DS       0.70      0.60      0.65       475
        B-GP       0.85      0.64      0.73      3246
        I-GP       0.91      0.37      0.52      2243
        B-OG       0.91      0.76      0.83      2374
        I-OG       0.95      0.77      0.85      1185

   micro avg       0.85      0.61      0.71     10677
   macro avg       0.82      0.62      0.69     10677
weighted avg       0.85      0.61      0.70     10677

