In [1]:
# Code to evaluate EPMC annotation to the Manual Annotation
# Code to compare results from ML methods and EPMC annotation

# (c) EMBL-EBI, September 2019
#
# Started: 23 Septmember  2019
# Updated: 24 Septmember  2019

_author_ = 'Santosh Tirunagari'

import os
import pandas as pd
import glob
import json
import csv
import sys

import multiprocessing

import numpy as np
import re

from nltk.tokenize import wordpunct_tokenize

import requests
# from pprint import pprint
import pandas as pd

from collections import defaultdict, Counter
import time
from requests.compat import urljoin

from tqdm import tqdm

In [2]:
# Generate train, test and dev pmc ids from the random seed set to 2222
import math
import random
file = '/nfs/gns/literature/Santosh_Tirunagari/EBI standard Dataset/NER/list_pmc_ids.csv'
percentage=0.70
iter = 0

trainPMCids = []
devPMCids = []
testPMCids =[]
try:
    with open(file, 'r',encoding="utf-8") as fin:
        allPMCids = fin.readlines()
except:
    with open('/mnt/droplet'+file, 'r',encoding="utf-8") as fin:
        allPMCids = fin.readlines()    
    
nLines = sum(1 for line in allPMCids)
nTrain = int(nLines*percentage) 
nValid = math.floor((nLines - nTrain)/2)
nTest = nLines - (nTrain+nValid)

deck = list(range(0, nLines))
random.seed(2222) # Please dont change the seed for the reproducibility 
random.shuffle(deck)

train_ids = deck[0:nTrain]
devel_ids = deck[nTrain:nTrain+nValid]
test_ids = deck[nTrain+nValid:nTrain+nValid+nTest]

for each_pmc_id in allPMCids:
    if iter in train_ids:
        trainPMCids.append(each_pmc_id.strip())
    elif iter in devel_ids:
        devPMCids.append(each_pmc_id.strip())
    else:
        testPMCids.append(each_pmc_id.strip())

    iter = iter+1    

In [3]:
def get_Json_through_PMCID(pmcid):
    
    base_url = "https://www.ebi.ac.uk/europepmc/annotations_api/"
    article_url = urljoin(base_url, "annotationsByArticleIds?articleIds=PMC%3A"+pmcid+"&provider=Europe%20PMC&format=JSON")
    r = requests.get(article_url)
    
    if r.status_code == 200:
        return r
    else:
        return False
    

In [4]:
# result_path = '/mnt/droplet/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'
result_path = '/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'

with open(result_path+'EPMC_annotations_.csv','w',  newline='\n') as f1:
    test_writer=csv.writer(f1, delimiter='\t',lineterminator='\n')
    
    for each_test_pmc_id in testPMCids:
        ss = get_Json_through_PMCID(each_test_pmc_id[3:]) # Just the number is needed. SO remove the PMC from the front
        if ss:
            json_results = ss.json()
            pmc_id = json_results[0]['pmcid']
            print(pmc_id)
            for each_annotation in json_results[0]['annotations']:
                exact = each_annotation['prefix']+ each_annotation['exact']+each_annotation['postfix']
                token = each_annotation['tags'][0]['name']
                ner = each_annotation['type']

                row = [pmc_id, exact, token, ner]
                test_writer.writerow(row)
        else:
            continue    

PMC4556948
PMC3651197
PMC5502978
PMC5259676
PMC3613406
PMC3972685
PMC4618948
PMC1762380
PMC4540425
PMC3960246
PMC5770482
PMC4766309
PMC3031208
PMC3316545
PMC5510223
PMC2727484
PMC3174205
PMC2761781
PMC5666160
PMC6037156
PMC1971115
PMC4352028
PMC3844564
PMC4244103
PMC4753424
PMC4697806
PMC3029330
PMC5078810


In [5]:
manual_annot_csv = pd.read_csv('/nfs/gns/literature/Santosh_Tirunagari/EBI standard Dataset/CSV/manual_annot_exacts_180.csv', names=['pmc_id', 'sent_id', 'sentence','ner','relation'], sep ='\t')
manual_annot_csv = manual_annot_csv[manual_annot_csv['ner']!= 'No-Ner']

EPMC_annot_csv = pd.read_csv(result_path+'EPMC_annotations_.csv', names=['pmc_id', 'sentence','token','ner'], sep ='\t')

In [6]:
def sentences_tags(pmc_id, EPMC_annot_csv,manual_annot_csv):
    all_europe_pm_sentences = EPMC_annot_csv[EPMC_annot_csv['pmc_id'] == pmc_id]['sentence'].tolist()
    manual_annotated_sentences = manual_annot_csv[manual_annot_csv['pmc_id'] == pmc_id]['sentence'].tolist()
    
    full_sentences = []
    for each_sentence in tqdm(all_europe_pm_sentences):
        try:
            res = [x for x in manual_annotated_sentences if re.search(re.escape(each_sentence), x)]
            full_sentences.append(res[0])
        except:
            full_sentences.append('None')
    
    new_epmc = EPMC_annot_csv[EPMC_annot_csv['pmc_id'] == pmc_id]
    new_epmc = new_epmc.assign(full_sentence = full_sentences)
    new_epmc['combined'] = new_epmc.apply(lambda x: list([x['token'],x['ner']]),axis=1) 
    sent_tags = new_epmc.groupby('full_sentence')['combined'].apply(list).reset_index(name='tags')
    
    new_man_annot = manual_annot_csv[manual_annot_csv['pmc_id'] == pmc_id]
    new_man_annot = manual_annot_csv[manual_annot_csv['pmc_id'] == pmc_id].reset_index()
    new_man_annot.rename(columns={'sentence':'full_sentence'}, inplace=True)
    epmc_sentence_tags = pd.merge(new_man_annot, sent_tags, on='full_sentence', how='left') 

    
    return epmc_sentence_tags
        
        

In [7]:
from ast import literal_eval

def find_sub_list(sl,l):
    results=[]
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            results.append((ind,ind+sll))

    return results


def convert2IOB_dict_manual_annotations(text_data,ner_tags):
    
    tokens = []
    ners = []
    
    split_text = wordpunct_tokenize(text_data)
    # for each word token append 'O'
    arr = ['O']*len(split_text) 
    
    if ner_tags != 'No-Ner':
        ner_tags = literal_eval(ner_tags)
        for each_tag in ner_tags:
            token_list = wordpunct_tokenize(each_tag[2])
            ner_list = wordpunct_tokenize(each_tag[3])

            if(len(token_list) > len(ner_list)):
                ner_list = len(token_list) * ner_list

            for i in range(0,len(ner_list)):
                # The logic here is look for the first B-tag and then append I-tag next
                if(i==0):
                    ner_list[i] = 'B-'+ner_list[i]
                else:
                    ner_list[i] = 'I-'+ner_list[i]

            tokens.append(token_list)
            ners.append(ner_list)

        for i in range(0, len(tokens)):
            spans = find_sub_list(tokens[i], split_text)
            for each_span in spans:
                arr[each_span[0]:each_span[1]] = ners[i]
    
        return zip(split_text, arr)

    else:
        return zip(split_text, arr)
            
            
    
    
def convert2IOB_dict_EPMC_annotations(text_data,ner_tags):
    
    tokens = []
    ners = []
    
    split_text = wordpunct_tokenize(text_data)
    # for each word token append 'O'
    arr = ['O']*len(split_text)
        

    if ner_tags != 'No-Ner':
        for each_tag in ner_tags:
            token_list = wordpunct_tokenize(each_tag[0])
            ner_list = wordpunct_tokenize(each_tag[1])

            if(len(token_list) > len(ner_list)):
                ner_list = len(token_list) * ner_list

            for i in range(0,len(ner_list)):
                # The logic here is look for the first B-tag and then append I-tag next
                if(i==0):
                    ner_list[i] = 'B-'+ner_list[i]
                else:
                    ner_list[i] = 'I-'+ner_list[i]

            tokens.append(token_list)
            ners.append(ner_list)

        for i in range(0, len(tokens)):
            spans = find_sub_list(tokens[i], split_text)
            for each_span in spans:
                arr[each_span[0]:each_span[1]] = ners[i]

        return zip(split_text, arr)
    else:
        return zip(split_text, arr)

In [8]:
from nltk.tokenize import wordpunct_tokenize

result_path = '/nfs/gns/literature/Santosh_Tirunagari/EPMC Annotations Dataset/'

with open(result_path+'EPMC_annotated_test.csv','w',  newline='\n') as f1, open(result_path+'manual_annotated_test.csv','w',  newline='\n') as f2: 
    epmc_writer=csv.writer(f1, delimiter='\t',lineterminator='\n')
    manual_writer=csv.writer(f2, delimiter='\t',lineterminator='\n')
    
    for each_pmc_id in testPMCids:
        print(each_pmc_id)
        ss_ = sentences_tags(each_pmc_id, EPMC_annot_csv,manual_annot_csv)
        ss = ss_.where((pd.notnull(ss_)), 'No-Ner')
        for index, row in tqdm(ss.iterrows(), total=ss.shape[0]):
            tagged_tokens = convert2IOB_dict_EPMC_annotations(row['full_sentence'],row['tags'])
            for each_word in tagged_tokens: # make it to manual annotation format for easy comparison
                epmc_tokens = list(each_word)
                if epmc_tokens[1] == 'B-Diseases':
                    epmc_tokens[1] = 'B-DS'
                elif epmc_tokens[1] == 'I-Diseases':
                    epmc_tokens[1] = 'I-DS'
                elif epmc_tokens[1] == 'B-Organisms':
                    epmc_tokens[1] = 'B-OG'
                elif epmc_tokens[1] == 'I-Organisms':
                    epmc_tokens[1] = 'I-OG'  
                elif epmc_tokens[1] == 'B-Gene_Proteins':
                    epmc_tokens[1] = 'B-GP'   
                elif epmc_tokens[1] == 'I-Gene_Proteins':
                    epmc_tokens[1] = 'I-GP'  
                else:
                    epmc_tokens[1] = 'O'
                epmc_writer.writerow(epmc_tokens)
            epmc_writer.writerow('') 
            manual_tagged_tokens = convert2IOB_dict_manual_annotations(row['full_sentence'],row['ner'])
            for each_word in manual_tagged_tokens:
                manual_writer.writerow(list(each_word))
            manual_writer.writerow('') 

 46%|████▋     | 204/441 [00:00<00:00, 922.18it/s]

PMC4556948


100%|██████████| 441/441 [00:00<00:00, 952.79it/s]
100%|██████████| 266/266 [00:00<00:00, 4479.78it/s]
 27%|██▋       | 158/578 [00:00<00:00, 1575.17it/s]

PMC3651197


100%|██████████| 578/578 [00:00<00:00, 1513.48it/s]
100%|██████████| 141/141 [00:00<00:00, 3842.58it/s]
100%|██████████| 345/345 [00:00<00:00, 3866.67it/s]
100%|██████████| 38/38 [00:00<00:00, 3322.78it/s]
100%|██████████| 113/113 [00:00<00:00, 2692.93it/s]


PMC5502978
PMC5259676


100%|██████████| 70/70 [00:00<00:00, 3804.11it/s]
 30%|███       | 122/400 [00:00<00:00, 1213.62it/s]

PMC3613406


100%|██████████| 400/400 [00:00<00:00, 1219.31it/s]
100%|██████████| 182/182 [00:00<00:00, 1392.80it/s]
100%|██████████| 343/343 [00:00<00:00, 2091.52it/s]
  0%|          | 0/96 [00:00<?, ?it/s]

PMC3972685


100%|██████████| 96/96 [00:00<00:00, 4024.04it/s]
100%|██████████| 153/153 [00:00<00:00, 1677.30it/s]
100%|██████████| 121/121 [00:00<00:00, 3474.58it/s]
  0%|          | 0/318 [00:00<?, ?it/s]

PMC4618948
PMC1762380


100%|██████████| 318/318 [00:00<00:00, 1739.34it/s]
100%|██████████| 118/118 [00:00<00:00, 3601.44it/s]
100%|██████████| 203/203 [00:00<00:00, 1933.10it/s]
100%|██████████| 110/110 [00:00<00:00, 4074.59it/s]
100%|██████████| 84/84 [00:00<00:00, 2662.63it/s]

PMC4540425
PMC3960246



100%|██████████| 63/63 [00:00<00:00, 4105.42it/s]
100%|██████████| 68/68 [00:00<00:00, 2879.69it/s]
100%|██████████| 52/52 [00:00<00:00, 3059.09it/s]
100%|██████████| 119/119 [00:00<00:00, 1366.71it/s]
  0%|          | 0/143 [00:00<?, ?it/s]

PMC5770482
PMC4766309


100%|██████████| 143/143 [00:00<00:00, 3736.96it/s]
100%|██████████| 83/83 [00:00<00:00, 3200.40it/s]
100%|██████████| 51/51 [00:00<00:00, 3631.12it/s]
 25%|██▌       | 122/483 [00:00<00:00, 1216.82it/s]

PMC3031208
PMC3316545


100%|██████████| 483/483 [00:00<00:00, 1228.98it/s]
100%|██████████| 180/180 [00:00<00:00, 3556.29it/s]
100%|██████████| 279/279 [00:00<00:00, 1493.55it/s]


PMC5510223


100%|██████████| 150/150 [00:00<00:00, 3820.99it/s]
 42%|████▏     | 161/387 [00:00<00:00, 1595.47it/s]

PMC2727484


100%|██████████| 387/387 [00:00<00:00, 1505.37it/s]
100%|██████████| 156/156 [00:00<00:00, 4596.79it/s]
 42%|████▏     | 163/390 [00:00<00:00, 1622.72it/s]

PMC3174205


100%|██████████| 390/390 [00:00<00:00, 1635.97it/s]
100%|██████████| 130/130 [00:00<00:00, 3563.67it/s]
100%|██████████| 14/14 [00:00<00:00, 99189.62it/s]
0it [00:00, ?it/s]
 35%|███▌      | 155/438 [00:00<00:00, 1549.38it/s]

PMC2761781
PMC5666160


100%|██████████| 438/438 [00:00<00:00, 1522.69it/s]
100%|██████████| 132/132 [00:00<00:00, 3831.50it/s]
100%|██████████| 180/180 [00:00<00:00, 1587.85it/s]
100%|██████████| 127/127 [00:00<00:00, 3873.93it/s]
  0%|          | 0/653 [00:00<?, ?it/s]

PMC6037156
PMC1971115


100%|██████████| 653/653 [00:00<00:00, 843.89it/s]
100%|██████████| 257/257 [00:00<00:00, 3615.94it/s]
100%|██████████| 426/426 [00:00<00:00, 2168.55it/s]


PMC4352028


100%|██████████| 87/87 [00:00<00:00, 3360.05it/s]
100%|██████████| 254/254 [00:00<00:00, 2042.96it/s]
100%|██████████| 146/146 [00:00<00:00, 5002.15it/s]
  0%|          | 0/520 [00:00<?, ?it/s]

PMC3844564
PMC4244103


100%|██████████| 520/520 [00:00<00:00, 7687.72it/s]
100%|██████████| 6/6 [00:00<00:00, 2811.82it/s]
100%|██████████| 176/176 [00:00<00:00, 2392.66it/s]
100%|██████████| 91/91 [00:00<00:00, 3716.00it/s]
  0%|          | 0/533 [00:00<?, ?it/s]

PMC4753424
PMC4697806


100%|██████████| 533/533 [00:00<00:00, 1139.37it/s]
100%|██████████| 194/194 [00:00<00:00, 3651.05it/s]
 37%|███▋      | 135/362 [00:00<00:00, 1345.34it/s]

PMC3029330


100%|██████████| 362/362 [00:00<00:00, 1277.92it/s]
100%|██████████| 179/179 [00:00<00:00, 4420.31it/s]
100%|██████████| 160/160 [00:00<00:00, 707899.41it/s]
0it [00:00, ?it/s]

PMC5078810





In [50]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

true_lbs = pd.read_csv(result_path+'manual_annotated_test.csv', sep='\t', names=['tokens','tags'])
pred_lbs = pd.read_csv(result_path+'EPMC_annotated_test.csv', sep='\t', names=['tokens','tags'])

y_true = true_lbs['tags'].values
y_pred =  pred_lbs['tags'].values


class_labels = sorted([tag for tag in set(y_true) if tag != 'O'], key=lambda name: (name[1:], name[0]))

from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, labels = class_labels))


              precision    recall  f1-score   support

        B-DS       0.74      0.53      0.62      1323
        I-DS       0.84      0.70      0.76       486
        B-GP       0.76      0.52      0.62      3056
        I-GP       0.89      0.41      0.56      1989
        B-OG       0.93      0.69      0.79      1960
        I-OG       0.98      0.51      0.68       968

   micro avg       0.83      0.54      0.66      9782
   macro avg       0.85      0.56      0.67      9782
weighted avg       0.84      0.54      0.65      9782

