In [1]:
import sys
import random
import os
from pathlib import Path
import shutil
import json

import argparse
import tqdm
import spacy
from spacy.gold import minibatch
from spacy.language import Language
from spacy import util

In [2]:
from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv
from scispacy.per_class_scorer import PerClassScorer
from scispacy.train_utils import evaluate_ner

In [19]:
# from spacy_transformers import TransformersLanguage, TransformersWordPiecer, TransformersTok2Vec

# name = "scibert-scivocab-uncased"
# path = "/nfs/gns/literature/Santosh_Tirunagari/pretrained_word_embeddings/scibert_scivocab_uncased"

# nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
# nlp.add_pipe(nlp.create_pipe("sentencizer"))
# nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, path))
# nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, path))



In [3]:
## Load the best saved model so its performance can be checked on the test set.
# `nlp2` is the pipeline object that the later cells call on article text.
best_model_path = '/nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/en-europepmc-lg/best'

print("Loading from " + best_model_path)
nlp2 = util.load_model_from_path(best_model_path)



Loading from /nfs/gns/literature/Santosh_Tirunagari/GitHub/spacy_models/en-europepmc-lg/best


In [5]:
import pandas as pd
from tqdm import tqdm
from ast import literal_eval
import csv

In [6]:
import requests

def ground_annotation(exact, entity, timeout=30):
    """Look up an identifier for an entity mention in an external search service.

    The service queried depends on the entity label:

    * ``'DS'`` -- EBI OLS search restricted to the ``efo`` ontology.
    * ``'OG'`` -- EBI OLS search restricted to the ``ncbitaxon`` ontology.
    * ``'GP'`` -- the ``Genes`` Solr core on the internal host.

    (The labels presumably stand for disease / organism / gene-protein --
    confirm against the model's label scheme.)

    Parameters
    ----------
    exact : str
        Surface text of the mention; sent as the ``q`` query parameter.
    entity : str
        Entity label used to select the service.
    timeout : float, optional
        Seconds to wait for the service before giving up on this mention.

    Returns
    -------
    str
        * ``'URL missing'`` when ``entity`` is not one of the labels above.
        * The ``iri`` (OLS) or ``ID`` (Solr) field of the first returned
          document when the service reports at least one hit.
        * ``'not-grounded'`` otherwise: no hits, a non-200 status, a transport
          error/timeout, or a response body that is not the expected JSON.
    """
    grounded_value = 'not-grounded'

    ols_url = 'https://www.ebi.ac.uk/ols/api/search'

    # Pick the endpoint, its query parameters and the result field holding the id.
    if entity == 'DS':
        url = ols_url
        params = {'q': exact, 'exact': 'false', 'ontology': 'efo'}
        id_field = 'iri'
    elif entity == 'OG':
        url = ols_url
        params = {'q': exact, 'exact': 'false', 'ontology': 'ncbitaxon'}
        id_field = 'iri'
    elif entity == 'GP':
        url = 'http://10.7.35.118:8157/solr/Genes/select'
        params = {'q': exact}
        id_field = 'ID'
    else:
        return 'URL missing'

    # Passing the query through `params` lets requests percent-encode it, so
    # mentions containing '&', '#', '+' or '?' no longer corrupt the query string.
    # The timeout keeps one stalled service call from hanging a whole batch run.
    try:
        r = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Same best-effort outcome as a non-200 response.
        return grounded_value

    if r.status_code != 200:
        return grounded_value

    try:
        json_data = r.json()
    except ValueError:
        # Body was not valid JSON.
        return grounded_value

    if not isinstance(json_data, dict):
        return grounded_value

    response = json_data.get('response') or {}
    docs = response.get('docs') or []

    # Only trust the first document when the service reports hits and actually
    # returned one; a missing id field leaves the mention not grounded.
    if response.get('numFound', 0) != 0 and docs:
        grounded_value = docs[0].get(id_field, grounded_value)

    return grounded_value

In [7]:
# Run the loaded pipeline over one sample article as a quick sanity check.
article_path = '/nfs/gns/literature/machine-learning/evaluation/time_complexity/articles/'
article_file_path = os.path.join(article_path, 'PMC3649237_sentences.txt')

# Read the whole article into memory; the pipeline is applied to the full text at once.
with open(article_file_path, 'r') as article_file:
    article_contents = article_file.read()

doc = nlp2(article_contents)


In [10]:
# Show each predicted entity: surface text, start/end character offsets, label.
for ent in doc.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

chronic illness 380 395 DS
chronic illnesses 1451 1468 DS
chronic illness 1686 1701 DS
chronic illness 1731 1746 DS
chronically ill 2956 2971 DS
chronic illness 3288 3303 DS
and/or 4267 4273 GP
chronicle illness 13589 13606 DS
diseases studied 13642 13658 DS
lung diseases 13674 13687 DS
diabetes 13705 13713 DS
cancer 13715 13721 DS
stroke 13727 13733 DS
Chronic illness 13736 13751 DS
infectious diseases 13860 13879 DS
chronicle illness 15382 15399 DS
or 18406 18408 GP
chronic illnesses 19485 19502 DS
chronic illness 23410 23425 DS
chronic illness 23513 23528 DS
chronic illness 23654 23669 DS
chronic illnesses 24953 24970 DS
chronic illnesses 27479 27496 DS
Chronically ill 29770 29785 DS
ulcer 30865 30870 DS
human 31028 31033 OG
BP 32359 32361 DS
postoperative problems 34298 34320 DS
ischemic heart failure 34533 34555 DS
heart failure 35787 35800 DS
heart failure 36042 36055 DS
heart failure 36100 36113 DS
chronic illnesses 36777 36794 DS
cancer 37222 37228 DS
diabetes 38190 38198 DS
di

In [11]:
import time
import glob
import os
import json

article_path = '/nfs/gns/literature/machine-learning/evaluation/time_complexity/articles/'
all_files = sorted(glob.glob(article_path + '*.txt*'))
result_json_dump_path = '/nfs/gns/literature/machine-learning/evaluation/time_complexity/en-pubmed-pmc-lg/'

# Size of the context window stored around each entity. The slices below skip
# the character directly adjacent to the entity, so prefix and postfix each hold
# at most `offset - 1` characters.
offset = 26


def context_window(text, start_char, end_char, width):
    """Return the ``(prefix, postfix)`` text surrounding a character span.

    ``prefix`` is ``text[start_char - width:start_char - 1]`` and ``postfix`` is
    ``text[end_char + 1:end_char + width]``; the character directly before and
    directly after the span is skipped (presumably the separating whitespace --
    confirm). The prefix bounds are clamped at 0: without the clamp, a span
    starting fewer than ``width`` characters into the text produces a negative
    slice start, which Python counts from the END of the string, yielding an
    empty or unrelated prefix.
    """
    prefix = text[max(0, start_char - width):max(0, start_char - 1)]
    postfix = text[end_char + 1:end_char + width]
    return prefix, postfix


# Result files already on disk; articles with a matching result name are
# skipped so an interrupted run can be resumed.
already_processed_files = sorted(glob.glob(result_json_dump_path + '*.json*'))
names = [os.path.basename(x) for x in already_processed_files]
already_processed_PMC_ids = [x[:-5]+'.json' for x in names]

# article_file_path = article_path+'PMC3649237_sentences.txt'

for article_file_path in all_files:

    # '<name>.txt' -> '<name>.json'
    result_file_name = os.path.basename(article_file_path)[:-4] + '.json'

    if result_file_name in already_processed_PMC_ids:
        continue

    # Read the article and release the file handle before the slow work starts.
    with open(article_file_path, 'r') as f:
        article_contents = f.read()

    # The timed span covers the model run plus grounding of every entity.
    tstart = time.time()

    doc = nlp2(article_contents)
    entity_dict = {}
    all_entities = []
    list_names = ['exact', 'prefix','postfix','entity','ground']

    for ent in doc.ents:
        exact = ent.text
        prefix, postfix = context_window(article_contents, ent.start_char, ent.end_char, offset)
        entity = ent.label_
        normalise = ground_annotation(exact,entity)
        data_names = [exact, prefix,postfix,entity,normalise]
        all_entities.append(dict(zip(list_names,data_names)))

    tend = time.time()
    t_elapsed = round(tend - tstart)
    print(t_elapsed)

    entity_dict['tagged_entities'] = all_entities
    entity_dict['time_taken'] = t_elapsed

    with open(result_json_dump_path+result_file_name, 'w') as outfile:
        json.dump(entity_dict, outfile)

5
3
16
6
10
2
2
7
5
7
9
1
5
4
8
6
17
4
21
5
10
3
4
9
17
32
12
10
8
11
5
10
7
2
20
25
5
19
10
5
5
10
10
17
18
4
2
5
15
2
3
12
9
0
9
0
27
5
7
14
14
0
0
4
2
2
7
15
6
2
7
7
7
0
25
2
1
3
4
6
5
3
8
2
5
13
3
5
8
8
9
17
5
11
2
10
10
9
4
18
3
4
5
6
10
25
3
10
10
4
6
2
1
7
2
7
22
8
30
2
6
7
17
2
16
8
6
6
10
1
10
7
7
7
7
7
4
6
1
14
11
10
4
5
2
19
11
23
11
11
7
3
1
11
5
6
13
9
13
7
2
9
13
1
4
8
10
4
24
3
7
17
7
9
28
15
6
12
14
17
9
1
5
7
12
12
11
7
2
13
13
4
2
7
16
10
24
11
3
15
3
1
15
11
18
15
5
26
26
3
6
10
8
4
19
10
11
107
2
18
13
11
8
6
20
11
22
5
2
12
3
23
13
5
11
6
6
6
21
13
19
11
25
35
26
16
4
8
5
10
2
6
6
8
14
3
10
6
7
7
8
4
21
7
19
11
12
7
17
4
11
7
3
9
12
2
4
20
11
8
4
18
9
5
19
9
6
7
18
8
11
6
5
9
17
14
21
14
1
8
4
7
14
5
4
21
8
3
1
8
3
4
7
14
1
3
10
10
16
10
7
9
17
10
9
8
6
1
1
3
7
15
11
2
11
12
7
4
5
5
7
1
10
8
6
11
2
2
2
7
4
3
1
1
10
4
10
9
10
21
1
11
9
4
17
1
9
17
16
3
4
6
16
11
2
11
15
5
15
1
13
14
2
6
16
5
6
2
5
14
4
7
6
32
10
7
13
17
5
6
5
8
5
24
24
11
8
2
12
5
16
4
1
21
2
2
8
8
