In [249]:
import glob
import gzip
from bs4 import BeautifulSoup
import lxml
from collections import defaultdict
from tqdm import tqdm
import requests
import random
import sys
import pathlib
import csv
import pandas as pd

import argparse

# import multiprocessing
from fuzzywuzzy import fuzz

import numpy as np
import itertools
import re

# set the system path
sys.path.insert(1, '/nfs/gns/literature/machine-learning/Santosh/Gitlab/biobertepmc/')

# BioBERT NER models
import logging
import torch
from torch.utils.data import DataLoader
import pickle
from biobert.model.bert_crf_model import BertCRF
from biobert.data_loader.epmc_loader import NERDatasetBatch
from biobert.utils.utils import my_collate
from statistics import mean
from collections import namedtuple


# Relations and associations model
import en_ner_europepmc_md
import en_relationv01

import unicodedata
# import datetime

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

In [79]:
# Entity: one extracted mention -- (start, end) span inside the sentence text, tag type,
# the covered text, and the text just before (pre) and just after (post) the mention.
Entity = namedtuple('Entity', ['span', 'tag', 'text', 'pre', 'post'])
# Entity_Label: one non-'O' token label -- token index, position prefix (pos), tag type
# and the token's span. Note that the namedtuple's typename is 'Label'.
Entity_Label = namedtuple('Label', ['index', 'pos', 'tag', 'span'])

In [None]:
# Directory holding the trained NER model files; params.pickle and
# bert_crf_model.states are both read from here.
MODEL_PATH = '/nfs/gns/literature/machine-learning/Santosh/Gitlab/biobertepmc/reproduce_GP_DS_OG_CD/1604049631/'

# path to the file that has model parameters
params_path = MODEL_PATH + "params.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load a params file from a
# trusted location.
with open(params_path, 'rb') as f:
    params = pickle.load(f)
# NOTE(review): -1 presumably disables the token-length limits -- confirm against
# NERDatasetBatch.
params['max_ner_token_len'] = -1
params['max_bert_token_len'] = -1

# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): MLModel is defined in a later cell of this notebook; on a fresh kernel
# that cell has to be run before this line.
ml_model = MLModel()

In [3]:
# !which conda

/nfs/gns/literature/Santosh_Tirunagari/miniconda3/condabin/conda


In [250]:
!which python

/nfs/gns/literature/Santosh_Tirunagari/miniconda3/envs/scispacy/bin/python


In [4]:
# load association and relation models
# relation_model1 (en_relationv01): its `.cats` scores are stored as the 'association'
#   of GP-DS co-occurrences.
# relation_model2 (en_ner_europepmc_md): its entities other than GP/DS are reported as
#   the 'relation' entries of GP-DS co-occurrences.
relation_model1 = en_relationv01.load()
relation_model2 = en_ner_europepmc_md.load()

In [5]:
batch_size = 8
class MLModel:
    """BertCRF named-entity tagger wrapped for sentence-level inference.

    The network is built by ``load_model()`` from the pickled ``params`` and its
    weights are read from ``MODEL_PATH + 'bert_crf_model.states'``.
    """

    def __init__(self):
        self.bertCrf_model = load_model()
        self.bertCrf_model.load_state_dict(torch.load(MODEL_PATH + 'bert_crf_model.states', map_location=device))
        # Only the inner BERT encoder is moved to `device`.
        # NOTE(review): confirm the rest of the model is meant to stay where it was loaded.
        self.bertCrf_model.bert_model.bert_model.to(device)

    def post(self, sentences, batch_size=16):
        """Tag a list of sentences.

        :param sentences: sentence strings, indexable by position
        :type sentences: List[str]
        :param batch_size: number of sentences per inference batch (default 16)
        :type batch_size: int
        :return: ``{'annotations': [...]}`` with one list per sentence, each holding
                 ``[start, end, tag, text]`` entries
        :rtype: dict
        """
        with torch.no_grad():
            processor, _tokens, spans = load_data_processor(sentences)
            data_loader = DataLoader(dataset=processor, batch_size=batch_size,
                                     collate_fn=my_collate, num_workers=2)

            idx2label = params['idx2label']
            self.bertCrf_model.eval()
            entities = []
            for i_batch, sample_batched in enumerate(data_loader):
                inputs = sample_batched['input']

                (bert_inputs, bert_attention_mask, bert_token_mask, wordpiece_alignment,
                 split_alignments, lengths, token_mask) = processor.tokens_totensor(inputs)

                _, preds = self.bertCrf_model.predict(input_ids=bert_inputs.to(device),
                                                      bert_attention_mask=bert_attention_mask.to(device),
                                                      bert_token_mask=bert_token_mask,
                                                      alignment=wordpiece_alignment,
                                                      splits=(split_alignments, lengths),
                                                      token_mask=token_mask)
                if not idx2label:
                    # Without an index -> label table nothing can be decoded.
                    continue
                for i, (path, _score) in enumerate(preds):
                    labels = [idx2label[p] for p in path]
                    # Position of this prediction in the original `sentences` list; must use
                    # the same batch size the DataLoader was built with.
                    offset_index = i_batch * batch_size + i
                    entities.append([[e.span[0], e.span[1], e.tag, e.text]
                                     for e in extract_entity(labels, spans[offset_index], sentences[offset_index])])
        return {'annotations': entities}


def load_data_processor(inputs):
    """Tokenize sentences and wrap them in an NER dataset.

    :param inputs: iterable of sentence strings
    :return: ``(processor, tokens, token_spans)`` -- the ``NERDatasetBatch`` built from the
             tokens, the token strings per sentence, and the ``(start, end)`` span of
             every token per sentence
    """
    token_spans, tokens = [], []
    for sentence in inputs:
        sentence_spans = list(tokenizer.span_tokenize(sentence))
        token_spans.append(sentence_spans)
        tokens.append([sentence[begin:stop] for begin, stop in sentence_spans])

    return NERDatasetBatch.from_params(params=params, inputs=tokens), tokens, token_spans


def load_model():
    """Instantiate a ``BertCRF`` from the pickled ``params`` (weights are loaded by the caller)."""
    return BertCRF(
        num_tags=params['num_tags'],
        model_name=params['model_name'],
        stride=params['stride'],
        include_start_end_transitions=True,
        constraints=None,  # no restriction on allowed tag transitions
    )


def extract_entity(preds, spans, text, length=20):
    """
    Extract entities from a label sequence.

    Consecutive ``B-<tag>`` / ``I-<tag>`` labels are merged into one entity; a ``B`` or
    ``O`` label closes the entity that is currently open. The entity takes the tag of its
    first label.

    :param preds: labels of one sentence, each either ``'O'`` or ``'<pos>-<tag>'``
    :type preds: List[str]
    :param spans: ``(start, end)`` offsets into ``text`` for every token, aligned with ``preds``
    :type spans: List[Tuple[int, int]]
    :param text: the sentence the spans refer to
    :type text: str
    :param length: number of characters of context kept before and after each entity
    :type length: int
    :return: A list of entity objects
    :rtype: List[Entity]
    """
    def _build_entity(labels):
        # Turn a run of token labels into one Entity covering first-start .. last-end.
        start_span = labels[0].span[0]
        end_span = labels[-1].span[1]
        return Entity(span=(start_span, end_span),
                      tag=labels[0].tag,
                      text=text[start_span:end_span],
                      pre=text[max(0, start_span - length):start_span],
                      post=text[end_span:end_span + length])

    entities = []
    current = []

    for i, token in enumerate(preds):
        if token == 'O':
            pos, label = 'O', None
        else:
            # Split on the first hyphen only, so tag names that contain '-' stay intact.
            pos, tag = token.split('-', 1)
            label = Entity_Label(index=i, pos=pos, tag=tag, span=spans[i])

        if pos in {'B', 'O'} and current:
            entities.append(_build_entity(current))
            current = []
        if pos in {'B', 'I'}:
            current.append(label)

    # Flush an entity that runs up to the end of the sentence.
    if current:
        entities.append(_build_entity(current))
    return entities


def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding ``n`` items each (the last may be shorter)."""
    yield from map(lambda start: l[start:start + n], range(0, len(l), n))


def clean_Nones(ner_tags_):
    """Reorder 'CD' tags, sort by the length of element 3, and drop a stray 'None' marker.

    Entries whose element 2 equals 'CD' get elements 2 and 3 exchanged. The result is
    sorted by ``len(entry[3])``, longest first. The string 'None' is removed once, and
    only when the list holds more than one entry.
    """
    reordered = [
        [entry[0], entry[1], entry[3], entry[2]] if 'CD' == entry[2] else entry
        for entry in ner_tags_
    ]
    reordered.sort(key=lambda entry: len(entry[3]), reverse=True)

    if len(reordered) > 1 and 'None' in reordered:
        reordered.remove('None')
    return reordered


In [None]:
# This function will compare ml tags and ztags. The agreed tags are then returned back
def compare_ml_annotations_with_dictionary_tagged(ml_tags_, z_tags_):
    """Return the dictionary (z) tags that the ML tags agree with.

    A z tag is accepted when, for some ML tag, either its lowercase form is in
    ``missing_list`` or ``fuzz.partial_ratio`` between the two exceeds 80.
    Nothing is accepted when ``ml_tags_`` is empty.
    """
    accepted = set()
    for z_tag in z_tags_:
        for ml_tag in ml_tags_:
            if z_tag.lower() in missing_list or fuzz.partial_ratio(ml_tag, z_tag) > 80:
                accepted.add(z_tag)
    return accepted

In [7]:
# Evidence score per full-text section. Keys are lowercase fragments that
# assign_scores_to_sections looks for inside a section title; 'other' is the fallback.
fulltext_scores={
    
'title':10,
'intro':1,
'result':5,
'discus':2,
'conclus':2,
'case':1,
'fig':5,
'table':5,
'appendix':1,
'other':1
}
# fulltext_scores


def assign_scores_to_sections(fulltext_scores_, section_tagged_):
    """Return the evidence score for a section title.

    :param fulltext_scores_: mapping of lowercase title fragment -> score; must contain
                             the key 'other', which is the fallback score
    :param section_tagged_: section title; ``None`` or an empty title gets the fallback
    :return: score of the first fragment (in mapping order) found in the lowercased
             title, otherwise the 'other' score
    """
    section_title = (section_tagged_ or '').lower()
    for fragment, score in fulltext_scores_.items():
        if fragment in section_title:
            return score

    return fulltext_scores_['other']

In [None]:
# get gp, ds, cd, and og sets
# Each set collects the distinct entity texts of one tag type ('GP', 'DS', 'CD', 'OG');
# they are filled by the ML tagging loop further down.
# NOTE(review): the sets are only emptied when this cell is run, so entries from an
# earlier article persist if it is skipped.
gp_set = set()
ds_set = set()
cd_set = set()
og_set = set()

In [8]:
# Read IDs (OAs and RAs). Not useful now; use this when working at the EuropePMC level.
# id_database_df= pd.read_csv('/nfs/gns/literature/Santosh_Tirunagari/OTAR_ids_dataset/id_dataset.csv')
# FT_IDS = dict(zip(id_database_df['FT_ID'].values.tolist(), id_database_df['PUB_DATE'].values))
# id_database_df.sample(n=2)

# try:
#     pmc_id = 'PMC'+soup.find(attrs={"pub-id-type" : "pmcid"}).text
#     if pmc_id in FT_IDS:
#         pub_date = FT_IDS[pmc_id]
#         print(pmc_id, pub_date)
#     else:
#         print(pmc_id+ ' not found')
# except:
#     print(pmc_id+ ' not found')

In [6]:
# data_file_path = '/nfs/misc/literature/rdf_annotation_data/daily_pipeline_api/15_08_1947/fulltext/job_14/annotation/patch-total-417.xml.gz'

# # data_file_path = '/nfs/misc/literature/rdf_annotation_data/daily_pipeline_api/15_08_1947/abstract/job_21/annotation/patch-total-415.abstract.gz'

# def getfileblocks(file_path):
#     subFileBlocks = []

#     with gzip.open(file_path, 'rt') as fh: #gzip.open
#         for line in fh:
#             if line.startswith('<!DOCTYPE article'):  # <!DOCTYPE "JATS-archivearticle1.dtd">
#                 subFileBlocks.append(line)
#             else:
#                 subFileBlocks[-1] += line

#     return subFileBlocks

In [212]:
import io
# Folder with the annotated full-text chunk files, and the single chunk file read below.
# NOTE(review): absolute cluster paths -- adjust when running elsewhere.
data_folder_path = '/nfs/production/literature/Santosh_Tirunagari/20.11_FT_Chunks/'
data_file_path = data_folder_path+ 'Annot_PMC2111990_PMC2131188_split_99.xml'#'Annot_PMC6432232_PMC6447240_split_36.xml'


def getfileblocks(file_path):
    """Split a concatenated XML file into one string per document.

    A new block starts at every line that begins with ``<!DOCTYPE``; all following lines
    up to the next such line belong to that block.

    :param file_path: path of a UTF-8 text file
    :return: list of block strings, in file order (empty if the file has no
             ``<!DOCTYPE`` line)
    """
    blocks = []  # one list of lines per document; joined once at the end

    with open(file_path, 'r', encoding='utf8') as fh:
        for line in fh:
            if line.startswith('<!DOCTYPE'):
                blocks.append([line])
            elif blocks:
                blocks[-1].append(line)
            # Lines before the first '<!DOCTYPE' belong to no document and are ignored
            # (previously they raised IndexError).

    return [''.join(lines) for lines in blocks]


In [213]:
# Split the chunk file into one XML string per '<!DOCTYPE' block.
files_list = getfileblocks(data_file_path)

In [214]:
# Parse a single block for the rest of the notebook; index 11 is a hard-coded sample.
soup = BeautifulSoup(files_list[11], 'lxml')

In [215]:
soup

<!DOCTYPE >
<html><body><p>JATS-archivearticle1.dtd"&gt;
<ebiroot xmlns:z="ebistuff"><article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink"><?properties open_access?><?DTDIdentifier.IdentifierValue -//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN?><?DTDIdentifier.IdentifierType public?><?SourceDTD.DTDName archivearticle.dtd?><?SourceDTD.Version 2.3?><?ConverterInfo.XSLTName jp2nlmx2.xsl?><?ConverterInfo.Version 1?><front><journal-meta><journal-id journal-id-type="nlm-ta">J Exp Med</journal-id><journal-id journal-id-type="iso-abbrev">J. Exp. Med</journal-id><journal-title-group><journal-title>The Journal of Experimental Medicine</journal-title></journal-title-group><issn pub-type="ppub">0022-1007</issn><issn pub-type="epub">1540-9538</issn><publisher><publisher-name>The Rockefeller University Press</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">19869123

In [216]:
  
# publicate_date = datetime.datetime.strptime(publication_date,"%Y-%m-%d")
# print(publicate_date.date())

In [217]:
# Every <sent> element of the parsed article; the loop further down reads its <plain> child.
all_sentences = soup.find_all('sent')# all_sentences = soup.find_all('SENT')

In [218]:
# Abstract text and its <plain> sentences. When the article has no <abstract>, fall back
# to empty values of the SAME types, so that later `x in abs_sentences` tests and the
# `total_abstract_length` comparison keep working instead of raising.
abstract_tag = soup.find('abstract')
if abstract_tag is not None:
    abs_full = abstract_tag.text
    abs_sentences = abstract_tag.find_all('plain')
else:
    abs_full = ''
    abs_sentences = []
total_abstract_length = len(abs_sentences)
    

In [219]:
# Walk over every <sent> element, clean its <plain> text and record, per cleaned sentence:
#   section_tags    -> the section names the sentence was seen in
#   evidence_scores -> one score per occurrence (title 10, abstract 2/3/5, otherwise the
#                      score from fulltext_scores, or 1 when no section could be found)
# plain_sentences keeps the cleaned sentences in document order.
plain_sentences = []
section_tags = defaultdict(set)
evidence_scores = defaultdict(list)
# gene_sentences / disease_setences are not filled by this cell.
gene_sentences = defaultdict(set)
disease_setences = defaultdict(set)
# Running count of abstract sentences seen so far (1-based once incremented).
line_count = 0    
for each_sentence in all_sentences:
    extracted_sentence = each_sentence.plain
    
    if extracted_sentence:
        # NFKD-normalise and strip; this cleaned text is the key used by all later cells.
        clean_text = unicodedata.normalize("NFKD",extracted_sentence.text).strip()

        # Truthy when the sentence sits inside an <article-title> element.
        try:
            title_tag = extracted_sentence.findParent('article-title')
        except:
            title_tag =''
                
        try:
#             section_tags[extracted_sentence.text].add(extracted_sentence.findParents('SecTag')[0]['type'])
            if title_tag:
                section_tags[clean_text].add('title')
                evidence_scores[clean_text].append(10)
            else:
                # Any failure here (no enclosing <sec>, no <title>, ...) leaves the
                # section empty. NOTE(review): abs_sentences has to be a list of tags;
                # if it is a str the `in` test presumably raises and is swallowed
                # here too -- confirm.
                try:
                    if extracted_sentence in abs_sentences:
                        section_tagged = 'Abstract'
                    else:
                        section_tagged = extracted_sentence.findParent('sec').title.text.strip()
                    
                except:
                    section_tagged =''
                
                if section_tagged:
                    section_tags[clean_text].add(section_tagged)
                    # evidence scores
                    if 'abstract' in section_tagged.lower():
#                         print(line_count)
                        # First two abstract sentences score 2, the last one 5, the rest 3.
                        # NOTE(review): total_abstract_length must have been set by the
                        # abstract-extraction cell; if it is missing the NameError is
                        # swallowed by the outer except and no score is recorded.
                        line_count = line_count+1
                        if line_count ==1 or line_count==2:
                            evidence_scores[clean_text].append(2)
                        elif line_count==total_abstract_length:
                            evidence_scores[clean_text].append(5)
                        else:
                            evidence_scores[clean_text].append(3)
                    else:
                        evi_scor = assign_scores_to_sections(fulltext_scores,section_tagged)
#                         print(section_tagged, evi_scor)
                        evidence_scores[clean_text].append(evi_scor)
                else:
                    evidence_scores[clean_text].append(1)               
        except:
            # Errors are ignored on purpose: the sentence is still kept below, but it may
            # end up without a section tag or evidence score.
#             section_tags[extracted_sentence.text].add('Other')
#             evidence_scores[extracted_sentence.text].append(1)
            pass
        
        plain_sentences.append(clean_text)
  

In [220]:
# Collect the texts of the dictionary-based annotations (z-tags) of this article.
# The sets are built here, so the cell no longer depends on `uniprot_set` / `efo_set`
# having been created by an earlier (now missing) cell; a missing set used to surface
# only as the "not found" message because of the bare except.
uniprot_ztags = soup.find_all('z:uniprot')
uniprot_set = {each_tag.text for each_tag in uniprot_ztags}
if not uniprot_set:
    print('no uniprot_ztags found ')

efo_ztags = soup.find_all('z:efo')
efo_set = {each_tag.text for each_tag in efo_ztags}
if not efo_set:
    print('no efo_ztags found ')

In [221]:
# Mean evidence score per sentence. The container is created here because no other
# visible cell defines it. It is a defaultdict(float) so that looking up a sentence
# without any score yields 0.0, which get_cooccurance_evidence treats as "no score"
# and replaces by 1.
average_evidence_scores = defaultdict(float)
for each_sentence, scores in evidence_scores.items():
    average_evidence_scores[each_sentence] = mean(scores)
    
# average_evidence_scores

In [222]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

def get_all_tags(all_sentences):
    """Run the ML tagger on the sentences and return its per-sentence annotation lists."""
    return ml_model.post(all_sentences)['annotations']

In [223]:
# ML annotations for every cleaned sentence, one list per sentence.
# NOTE(review): later cells read `final_annotations`, which no visible cell assigns --
# presumably it is derived from `annotations`; confirm and define it explicitly.
annotations = get_all_tags(plain_sentences)

In [224]:
annotations

[[],
 [[66, 73, 'OG', 'rabbits'],
  [114, 121, 'OG', 'animals'],
  [127, 139, 'OG', 'Tr. pallidum']],
 [],
 [],
 [],
 [[130, 138, 'DS', 'syphilis'], [184, 191, 'OG', 'animals']],
 [[94, 102, 'DS', 'syphilis'], [266, 275, 'DS', 'infection']]]

In [226]:
final_annotations

[[],
 [[66, 73, 'OG', 'rabbits'],
  [114, 121, 'OG', 'animals'],
  [127, 139, 'OG', 'Tr. pallidum']],
 [],
 [],
 [],
 [[130, 138, 'DS', 'syphilis'], [184, 191, 'OG', 'animals']],
 [[94, 102, 'DS', 'syphilis'], [266, 275, 'DS', 'infection']]]

In [227]:
def roundoff(dict_y):
    """Round every value of ``dict_y`` to 2 decimals in place and return the same dict."""
    for key in dict_y:
        dict_y[key] = round(dict_y[key], 2)
    return dict_y


def get_new_missing_tags(each_sentence, missing_list_, tag_type):
    """Find every occurrence of the given terms in a sentence, ignoring case.

    Terms are matched literally (regex metacharacters such as '.', '(' or '+' have no
    special meaning) and case-insensitively against the original sentence, so the
    returned offsets always index into ``each_sentence``.

    :param each_sentence: sentence to search
    :param missing_list_: iterable of terms; empty terms are skipped
    :param tag_type: tag code stored with every hit (e.g. 'DS')
    :return: list of ``[start, end, tag_type, matched_text]``; a span that is hit by more
             than one term (e.g. 'covid' and 'COVID') is reported once
    """
    new_entities = []
    seen_spans = set()
    for missing_string in missing_list_:
        if not missing_string:
            # An empty pattern would match at every position.
            continue
        pattern = re.compile(re.escape(missing_string), re.IGNORECASE)
        for match in pattern.finditer(each_sentence):
            span = match.span()
            if span in seen_spans:
                continue
            seen_spans.add(span)
            new_entities.append([span[0], span[1], tag_type, match.group()])
    return new_entities

In [228]:
# Combine, per sentence, the extra 'DS' hits for the terms in missing_list with the ML
# annotations, and collect the distinct entity texts per tag type.
ml_tagged_sentences = defaultdict(list)
# tag code -> set that collects the entity texts of that type
sets_by_tag = {'GP': gp_set, 'DS': ds_set, 'CD': cd_set, 'OG': og_set}
count = 0
for each_sentence in tqdm(plain_sentences):
    new_entities = get_new_missing_tags(each_sentence, missing_list, tag_type='DS')
    # final_annotations is aligned with plain_sentences by position.
    all_tags = new_entities + final_annotations[count]
    count += 1

    if not all_tags:
        continue

    ml_tagged_sentences[each_sentence] = all_tags
    for each_ml_tag in all_tags:
        target_set = sets_by_tag.get(each_ml_tag[2])
        if target_set is not None:
            target_set.add(each_ml_tag[3])
    

100%|██████████| 7/7 [00:00<00:00, 18535.43it/s]


In [229]:
ml_tagged_sentences

defaultdict(list,
            {'Experiments are described in which the thyroid or thymus gland of rabbits was removed prior to inoculation of the animals with Tr. pallidum.': [[66,
               73,
               'OG',
               'rabbits'],
              [114, 121, 'OG', 'animals'],
              [127, 139, 'OG', 'Tr. pallidum']],
             'The effect of complete thymectomy was less pronounced than that of either complete or partial thyroidectomy, but, in general, the syphilis resembled that in partially thyroidectomized animals.': [[130,
               138,
               'DS',
               'syphilis'],
              [184, 191, 'OG', 'animals']],
             "These effects are discussed in relation to the host's reaction and resistance to experimental syphilis and the conclusion was reached that the integrity and balance of the glands of internal secretion play an important rôle in the mechanism of defense against this infection.": [[94,
               102,
            

In [230]:
# ML not recognising covid terms in some cases.

In [231]:
# This is for dictionary based method

# def get_article_offset_per_tag(soup, tagset):
#     dicttag = defaultdict(list)
   
    
#     for each_tag in tagset:
#         for i in re.finditer(each_tag, each_sentence.lower()):
#             indexlocation= i.span()
#     #         print(indexlocation)
#             startindex= i.start()
#             endindex= i.end()
#             entity = each_sentence[indexlocation[0]:indexlocation[1]]
#         dicttag[each_tag].append([each_sentence, each_ml_tag[0], each_ml_tag[1]])
#     return dicttag

# def compare_ml_annotations_with_dictionary_tagged(ml_tags_, z_tags_):
#     agreed_z_tags = set()
# #     print(z_tags_, ml_tags_)
#     for each_z_tag in z_tags_:
#         for each_ml_annotation in ml_tags_:
#             if each_z_tag.lower() in missing_list:
#                 agreed_z_tags.add(each_z_tag)
#             else:
#                 score = fuzz.partial_ratio(each_ml_annotation, each_z_tag) #token_set_ratio
#                 if score > 80:
#                     agreed_z_tags.add(each_z_tag)
#     return list(z_tags_- agreed_z_tags)

# all_ml_gp_tags = set()
# all_ml_ds_tags = set()
# all_gp_ztags = set()
# all_ds_ztags = set()

# for each_sentence, gp_z_tags in gene_sentences.items():
#     ml_tags = ml_tagged_sentences[each_sentence]
#     print(ml_tags)
#     for each_ml_tag in ml_tags:
#         if each_ml_tag[2] =='GP':
#             all_ml_gp_tags.add(each_ml_tag[3])
#     for each_gp_ztag in gp_z_tags:
#         all_gp_ztags.add(each_gp_ztag)
    
# for each_sentence, ds_z_tags in disease_setences.items():
#     ml_tags = ml_tagged_sentences[each_sentence]
#     for each_ml_tag in ml_tags:
#         if each_ml_tag[2] =='DS':
#             all_ml_ds_tags.add(each_ml_tag[3])
#     for each_ds_ztag in ds_z_tags:
#         all_ds_ztags.add(each_ds_ztag)  

# GP_false_postives = compare_ml_annotations_with_dictionary_tagged(all_ml_gp_tags, all_gp_ztags)
# GP_false_postives

# DS_false_postives = compare_ml_annotations_with_dictionary_tagged(all_ml_ds_tags, all_ds_ztags)
# DS_false_postives

In [233]:
# Keep only the UniProt dictionary terms that some ML 'GP' entity agrees with (or that
# are listed in missing_list), then show them next to the EFO and 'CD' sets.
uniprot_fp_removed_set = compare_ml_annotations_with_dictionary_tagged(gp_set, uniprot_set)
uniprot_fp_removed_set, efo_set, cd_set

(set(),
 {'infection', 'syphilis'},
 {'Creatinine',
  'Spironolacton',
  'Uric acid',
  'aldosterone',
  'bilirubin',
  'creatinine',
  'ecadotril',
  'sildenafil',
  'thiazides',
  'urea',
  'uric acid'})

In [234]:
# Dictionary-based tagging: look up the agreed UniProt terms, the EFO terms and the
# 'CD' terms in every sentence and keep the sentences with at least one hit.
ztag_sentences = {}
dictionary_sources = (
    (uniprot_fp_removed_set, 'GP'),
    (efo_set, 'DS'),
    (cd_set, 'CD'),
)
for each_sentence in tqdm(plain_sentences):
    all_tags = []
    for term_set, tag_code in dictionary_sources:
        all_tags += get_new_missing_tags(each_sentence, term_set, tag_type=tag_code)
    if all_tags:
        ztag_sentences[each_sentence] = all_tags


100%|██████████| 7/7 [00:00<00:00, 11362.28it/s]


In [235]:
ztag_sentences

{'EFFECT OF THYROIDECTOMY AND OF THYMECTOMY IN EXPERIMENTAL SYPHILIS OF THE RABBIT': [[58,
   66,
   'DS',
   'SYPHILIS']],
 'The effect of complete thymectomy was less pronounced than that of either complete or partial thyroidectomy, but, in general, the syphilis resembled that in partially thyroidectomized animals.': [[130,
   138,
   'DS',
   'syphilis']],
 "These effects are discussed in relation to the host's reaction and resistance to experimental syphilis and the conclusion was reached that the integrity and balance of the glands of internal secretion play an important rôle in the mechanism of defense against this infection.": [[94,
   102,
   'DS',
   'syphilis'],
  [266, 275, 'DS', 'infection']]}

In [236]:
# this will get matches
def get_sentences_matches_tags(sentences_tags):
    """Build the 'matches' records of every sentence.

    :param sentences_tags: mapping sentence -> list of ``[start, end, type, label]`` tags
    :return: defaultdict mapping sentence -> list of match dicts; 'OG' tags are left
             out, and sentences found in ``abs_full`` also get their offsets within it
    """
    matches = defaultdict(list)
    for sentence, tags in sentences_tags.items():
        for tag in tags:
            if tag[2] == 'OG':
                continue
            match = {
                'label': tag[3],
                'type': tag[2],
                'startInSentence': tag[0],
                'endInSentence': tag[1],
            }
            if sentence in abs_full:
                section_start = abs_full.find(sentence)
                match['sectionStart'] = section_start
                match['sectionEnd'] = section_start + len(sentence)
            matches[sentence].append(match)

    return matches

# map annotations in sets of pairs
def get_mapped_list_from_annotations(annotation_list):
    """Return all pairs of annotations that have different types and involve no 'OG' tag.

    Pairs keep the order of ``annotation_list``; element 2 of an annotation is its type.
    """
    return [
        (first, second)
        for first, second in itertools.combinations(annotation_list, 2)
        if first[2] != second[2] and second[2] != 'OG' and first[2] != 'OG'
    ]

# get only those sentences with relevant pairs
def get_sentences_offset_per_cooccurance(sentences_tags):
    """Collect, per tag-type combination, the sentences that contain both types.

    :param sentences_tags: mapping sentence -> list of ``[start, end, type, label]`` tags
    :return: ``(dict_gp_ds, dict_gp_cd, dict_ds_cd)``; each maps a sentence that holds
             both tag types to the pair list from ``get_mapped_list_from_annotations``

    Only the tag *type* (element 2) is inspected. Testing membership on the whole tag
    array also matched entity texts such as 'DS' or 'CD' and then stored empty pair lists.
    """
    dict_gp_ds = defaultdict(list)
    dict_gp_cd = defaultdict(list)
    dict_ds_cd = defaultdict(list)
    targets = (
        ('GP', 'DS', dict_gp_ds),
        ('GP', 'CD', dict_gp_cd),
        ('DS', 'CD', dict_ds_cd),
    )

    for sentence, tags in sentences_tags.items():
        if len(tags) < 2:
            # A co-occurrence needs at least two tags.
            continue
        tag_types = {tag[2] for tag in tags}
        for type_1, type_2, target in targets:
            if type_1 in tag_types and type_2 in tag_types:
                # Build a separate list per dict: downstream code edits these lists in place.
                target[sentence] = get_mapped_list_from_annotations(tags)

    return dict_gp_ds, dict_gp_cd, dict_ds_cd


# Used to put a tag pair into the expected order: GP before CD or DS, and DS before CD.
def swap_positions(cooccurance_list, pos1, pos2):
    """Exchange the items at ``pos1`` and ``pos2`` in place and return the same list."""
    second_item = cooccurance_list[pos2]
    first_item = cooccurance_list[pos1]
    cooccurance_list[pos1] = second_item
    cooccurance_list[pos2] = first_item
    return cooccurance_list
    


In [237]:
# xx_ = ' '.join(plain_sentences[1:])

In [238]:
# this is for getting relationship text
def get_relations(gp_ds_text_sentence):
    """Return the entities that ``relation_model2`` finds in the sentence, except GP and DS.

    Each entity becomes a dict with its character offsets ('startr', 'endr'), its text
    ('labelr') and its label ('typer').
    """
    doc = relation_model2(gp_ds_text_sentence)
    return [
        {
            'startr': ent.start_char,
            'endr': ent.end_char,
            'labelr': ent.text,
            'typer': ent.label_,
        }
        for ent in doc.ents
        if ent.label_ not in ('GP', 'DS')
    ]

In [239]:
def get_cooccurance_evidence(dict_tags, tag_type_1, tag_type_2):
    """Build the co-occurrence records for one tag-type combination.

    :param dict_tags: mapping sentence -> list of tag pairs, each tag being
                      ``[start, end, type, label]``
    :param tag_type_1: type reported as the first member of a pair (e.g. 'GP')
    :param tag_type_2: type reported as the second member of a pair (e.g. 'DS')
    :return: defaultdict mapping sentence -> list of co-occurrence dicts

    Every pair is put into ``tag_type_1`` / ``tag_type_2`` order before it is checked.
    Previously only the first pair of a sentence was reordered, so later reversed pairs
    were dropped. ``dict_tags`` is not modified, and an empty pair list is skipped.
    """
    co_occurance_sentences = defaultdict(list)
    for each_sent_map, mappedtags in dict_tags.items():
        for first_tag, second_tag in mappedtags:
            # always see that GP-DS, GP-CD and DS-CD order is followed
            if first_tag[2] == tag_type_2 and second_tag[2] == tag_type_1:
                first_tag, second_tag = second_tag, first_tag
            if first_tag[2] != tag_type_1 or second_tag[2] != tag_type_2:
                # Pair belongs to another type combination.
                continue

            mini_dict = {}
            mini_dict['start1'] = first_tag[0]
            mini_dict['end1'] = first_tag[1]
            mini_dict['label1'] = first_tag[3]
            mini_dict['start2'] = second_tag[0]
            mini_dict['end2'] = second_tag[1]
            mini_dict['label2'] = second_tag[3]
            mini_dict['type'] = tag_type_1 + '-' + tag_type_2

            # Sentences without a (non-zero) average score get the default score 1.
            mini_dict['evidence_score'] = average_evidence_scores.get(each_sent_map) or 1

            if tag_type_1 == 'GP' and tag_type_2 == 'DS':
                # get associations scores
                mini_dict['association'] = roundoff(relation_model1(each_sent_map).cats)
                # get relations
                rels = get_relations(each_sent_map)
                if rels:
                    mini_dict['relation'] = rels
            co_occurance_sentences[each_sent_map].append(mini_dict)
    return co_occurance_sentences


In [240]:
# For each tagging source (ML model, dictionary z-tags): the sentences that contain
# GP+DS, GP+CD and DS+CD tags, each with its list of candidate tag pairs.
ml_gp_ds, ml_gp_cd, ml_ds_cd = get_sentences_offset_per_cooccurance(ml_tagged_sentences)
ztag_gp_ds, ztag_gp_cd, ztag_ds_cd = get_sentences_offset_per_cooccurance(ztag_sentences)

In [241]:
# Co-occurrence records per type combination, from the ML tags ...
ml_co_occurance_gp_ds = get_cooccurance_evidence(ml_gp_ds, tag_type_1='GP', tag_type_2='DS')
ml_co_occurance_gp_cd = get_cooccurance_evidence(ml_gp_cd, tag_type_1='GP', tag_type_2='CD')
ml_co_occurance_ds_cd = get_cooccurance_evidence(ml_ds_cd, tag_type_1='DS', tag_type_2='CD')


# ... and from the dictionary (z-tag) hits.
ztag_co_occurance_gp_ds = get_cooccurance_evidence(ztag_gp_ds, tag_type_1='GP', tag_type_2='DS')
ztag_co_occurance_gp_cd = get_cooccurance_evidence(ztag_gp_cd, tag_type_1='GP', tag_type_2='CD')
ztag_co_occurance_ds_cd = get_cooccurance_evidence(ztag_ds_cd, tag_type_1='DS', tag_type_2='CD')

In [242]:
# Per-sentence match records (every tag except 'OG') for both tagging sources.
ml_match_gp_ds_cd = get_sentences_matches_tags(ml_tagged_sentences)
ztag_match_gp_ds_cd = get_sentences_matches_tags(ztag_sentences)

In [243]:
section_tags

defaultdict(set,
            {'EFFECT OF THYROIDECTOMY AND OF THYMECTOMY IN EXPERIMENTAL SYPHILIS OF THE RABBIT': {'title'},
             'Experiments are described in which the thyroid or thymus gland of rabbits was removed prior to inoculation of the animals with Tr. pallidum.': {'Abstract'},
             'The effect of these procedures is described from the standpoint of the manifestations of the disease.': {'Abstract'},
             'After complete thyroidectomy, the disease was considerably more severe than in the controls and very markedly so in certain instances.': {'Abstract'},
             'Partial tyroidectomy, on the other hand, resulted in a milder disease than that of the controls.': {'Abstract'},
             'The effect of complete thymectomy was less pronounced than that of either complete or partial thyroidectomy, but, in general, the syphilis resembled that in partially thyroidectomized animals.': {'Abstract'},
             "These effects are discussed in relation to 

In [244]:
def generate_interested_sentences_in_json_format(final_sentences, match_gp_ds_cd, co_occurance_gp_ds,co_occurance_gp_cd,co_occurance_ds_cd):
    """Assemble the JSON-ready records of all sentences that have matches or co-occurrences.

    :param final_sentences: mapping whose keys are the candidate sentences
    :param match_gp_ds_cd: mapping sentence -> list of match dicts
    :param co_occurance_gp_ds: mapping sentence -> list of GP-DS co-occurrence dicts
    :param co_occurance_gp_cd: mapping sentence -> list of GP-CD co-occurrence dicts
    :param co_occurance_ds_cd: mapping sentence -> list of DS-CD co-occurrence dicts
    :return: list of dicts with 'text', 'section' and, when present, 'matches' and
             'co-occurrence'

    The section comes from the module-level ``section_tags``. If a sentence was seen in
    several sections, the alphabetically first name is used so the output is the same
    on every run. Lookups use ``.get`` so the passed-in tables are not modified.
    """
    interested_sentences = []
    for each_sentence, _tags in final_sentences.items():
        all_matches = match_gp_ds_cd.get(each_sentence, [])
        all_co_occurances = (co_occurance_gp_ds.get(each_sentence, [])
                             + co_occurance_gp_cd.get(each_sentence, [])
                             + co_occurance_ds_cd.get(each_sentence, []))
        if not (all_matches or all_co_occurances):
            continue

        sections = section_tags.get(each_sentence)
        minidict = {}
        minidict['text'] = each_sentence
        minidict['section'] = min(sections) if sections else 'Other'
        if all_matches:
            minidict['matches'] = all_matches
        if all_co_occurances:
            minidict['co-occurrence'] = all_co_occurances
        interested_sentences.append(minidict)

    return interested_sentences

In [245]:
# JSON document for the ML-based annotations of this article.
ml_json_generated = {}

# Article id: the PMC id when the article has one, otherwise the PubMed id
# (or '' when neither is present). Note that the key is 'pmid' in both cases.
pmcid_tag = soup.find(attrs={"pub-id-type" : "pmcid"})
if pmcid_tag is not None:
    ml_json_generated['pmid'] = 'PMC' + pmcid_tag.text
else:
    pmid_tag = soup.find(attrs={"pub-id-type" : "pmid"})
    ml_json_generated['pmid'] = pmid_tag.text if pmid_tag is not None else ''

# Publication date as 'year-month-day' exactly as written in the XML (not zero-padded);
# '' when <pub-date> or one of its parts is missing.
pub_date_tag = soup.find('pub-date')
try:
    ml_json_generated['pubDate'] = pub_date_tag.year.text + '-' + pub_date_tag.month.text + '-' + pub_date_tag.day.text
except AttributeError:
    ml_json_generated['pubDate'] = ''


ml_json_generated['organisms'] = list(og_set)

ml_interested_sentences = generate_interested_sentences_in_json_format(ml_tagged_sentences, ml_match_gp_ds_cd, ml_co_occurance_gp_ds, ml_co_occurance_gp_cd, ml_co_occurance_ds_cd)
ml_json_generated['sentences'] = ml_interested_sentences

In [246]:
# JSON document for the dictionary-based (z-tag) annotations of this article.
ztag_json_generated = {}

# Article id: the PMC id when the article has one, otherwise the PubMed id
# (or '' when neither is present). Note that the key is 'pmid' in both cases.
pmcid_tag = soup.find(attrs={"pub-id-type" : "pmcid"})
if pmcid_tag is not None:
    ztag_json_generated['pmid'] = 'PMC' + pmcid_tag.text
else:
    pmid_tag = soup.find(attrs={"pub-id-type" : "pmid"})
    ztag_json_generated['pmid'] = pmid_tag.text if pmid_tag is not None else ''

# Publication date as 'year-month-day' exactly as written in the XML (not zero-padded);
# '' when <pub-date> or one of its parts is missing.
pub_date_tag = soup.find('pub-date')
try:
    ztag_json_generated['pubDate'] = pub_date_tag.year.text + '-' + pub_date_tag.month.text + '-' + pub_date_tag.day.text
except AttributeError:
    ztag_json_generated['pubDate'] = ''


ztag_json_generated['organisms'] = list(og_set)

ztag_interested_sentences = generate_interested_sentences_in_json_format(ztag_sentences, ztag_match_gp_ds_cd, ztag_co_occurance_gp_ds, ztag_co_occurance_gp_cd, ztag_co_occurance_ds_cd)
ztag_json_generated['sentences'] = ztag_interested_sentences

In [247]:
import json
# Pretty-print the ML-based result; ensure_ascii=False keeps non-ASCII characters readable.
json_object = json.dumps(ml_json_generated, indent = 4, ensure_ascii=False)  
print(json_object)  

{
    "pmid": "19869123",
    "pubDate": "1926-2-28",
    "organisms": [
        "animals",
        "rabbits",
        "Tr. pallidum"
    ],
    "sentences": [
        {
            "text": "The effect of complete thymectomy was less pronounced than that of either complete or partial thyroidectomy, but, in general, the syphilis resembled that in partially thyroidectomized animals.",
            "section": "Abstract",
            "matches": [
                {
                    "label": "syphilis",
                    "type": "DS",
                    "startInSentence": 130,
                    "endInSentence": 138,
                    "sectionStart": 479,
                    "sectionEnd": 671
                }
            ]
        },
        {
            "text": "These effects are discussed in relation to the host's reaction and resistance to experimental syphilis and the conclusion was reached that the integrity and balance of the glands of internal secretion play an important rô

In [248]:
# Pretty-print the dictionary-based (z-tag) result for comparison with the ML output.
json_object = json.dumps(ztag_json_generated, indent = 4, ensure_ascii=False)  
print(json_object)  

{
    "pmid": "19869123",
    "pubDate": "1926-2-28",
    "organisms": [
        "animals",
        "rabbits",
        "Tr. pallidum"
    ],
    "sentences": [
        {
            "text": "EFFECT OF THYROIDECTOMY AND OF THYMECTOMY IN EXPERIMENTAL SYPHILIS OF THE RABBIT",
            "section": "title",
            "matches": [
                {
                    "label": "SYPHILIS",
                    "type": "DS",
                    "startInSentence": 58,
                    "endInSentence": 66
                }
            ]
        },
        {
            "text": "The effect of complete thymectomy was less pronounced than that of either complete or partial thyroidectomy, but, in general, the syphilis resembled that in partially thyroidectomized animals.",
            "section": "Abstract",
            "matches": [
                {
                    "label": "syphilis",
                    "type": "DS",
                    "startInSentence": 130,
                    "endInS

In [None]:
# https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID%3A2109978%20AND%20SRC%3AMED&resultType=core&cursorMark=*&pageSize=25&format=json

import requests
# Query the Europe PMC REST search endpoint for EXT_ID 2109978 restricted to SRC:MED,
# asking for the "core" result type as JSON.
query = '2109978'
url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+query+' AND SRC:MED&resultType=core&cursorMark=*&pageSize=25&format=json'
# requests has no default timeout, so bound the wait explicitly; otherwise an
# unresponsive server blocks this cell (and Run All) indefinitely.
response = requests.get(url, timeout=30)
# Stop on a 4xx/5xx reply here instead of trying to decode an error page as JSON.
response.raise_for_status()
rjson = response.json()

In [None]:
# Show the 'abstractText' field of the first entry under resultList -> result.
# NOTE(review): the [0] presumably raises IndexError when the search returned no hits — confirm.
rjson['resultList']['result'][0]['abstractText']

In [None]:
# NOTE(review): xx_ is not assigned in any cell visible in this part of the notebook;
# this cell presumably depends on leftover kernel state — confirm or remove.
xx_

In [None]:
# Equality check between two scratch values.
# NOTE(review): neither xx_ nor yy_ is assigned in any cell visible in this part of the
# notebook; presumably leftover kernel state — confirm or remove.
xx_==yy_

# False-positive (FP) analysis on the test set

In [47]:
import requests

# Europe PMC REST endpoint returning the full-text XML of article PMC520822.
url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/PMC520822/fullTextXML'

# requests has no default timeout, so bound the wait explicitly, and stop on an
# HTTP error so an error page is never handed to the parser as if it were the article.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [48]:
# Parse the downloaded article. 'lxml' selects lxml's HTML parser, which is why the
# tree displayed below is wrapped in <html><body>.
# NOTE(review): 'lxml-xml' would parse the payload as XML instead; left unchanged because
# later cells index find_all('p') by position — confirm before switching.
ss= BeautifulSoup(r.text, 'lxml')

In [49]:
# Display the whole parsed tree (the saved output is the full article markup).
ss

<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<html><body><article article-type="research-article" xmlns:xlink="http://www.w3.org/1999/xlink"><front><journal-meta><journal-id journal-id-type="nlm-ta">Respir Res</journal-id><journal-title>Respiratory Research</journal-title><issn pub-type="ppub">1465-9921</issn><issn pub-type="epub">1465-993X</issn><publisher><publisher-name>BioMed Central</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">1465-9921-5-11</article-id><article-id pub-id-type="pmid">15377396</article-id><article-id pub-id-type="doi">10.1186/1465-9921-5-11</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research</subject></subj-group></article-categories><title-group><article-title>Proinflammatory role of inducible nitric oxide synthase in acute hyperoxic lung injury</article-title></title-group><contrib-group><contrib contrib-ty

In [50]:
# ss.find_all('p')[0].findParent('abstract').title.text
# Title text of the nearest <sec> ancestor of the second <p> element in the parsed tree.
# findParent returns None when there is no such ancestor, in which case .title raises
# AttributeError.
ss.find_all('p')[1].findParent('sec').title.text

'Background'

In [123]:
# Text of the <p> element at hard-coded position 49 of the parsed tree.
# NOTE(review): the saved output below discusses HSC / microglia, whereas the `ss` built in
# the earlier cell comes from a Respiratory Research article on hyperoxic lung injury —
# `ss` was presumably rebound to another document before this cell ran; confirm.
ss.find_all('p')[49].text

'BM HSC could be lentivirally modified with long-term transgene expression without compromising cell differentiation or function as also shown in PB HSC [63]. Recently, lentivirally mediated gene therapy was shown to provide clinical benefits in an inheritable fatal demyelinating disease X-linked adrenoleukodystrophy [64]. HSC were transduced ex vivo to restore dysfunctional protein function and proper myelin maintenance and transplanted after myeloablative treatment. Even though less than 15% of leucocytes expressed the transgene, the progressive cerebral demyelination ceased. This suggests that introducing genetically modified leucocytes may work as a therapeutic approach, potentially also for other diseases. Microglia [14] and monocyte function [40,65] may be defective in AD. Introducing fresh monocytic cells into the circulation, whether migrating into the brain and acting as BM-derived microglia in the CNS or circulating in the periphery, may contribute to inflammatory activities 

In [124]:
# Run relation_model1 (loaded from en_relationv01 at the top of the notebook) over the
# text of the same hard-coded paragraph (index 49).
doc = relation_model1(ss.find_all('p')[49].text)

In [125]:
# Materialise the sentence segmentation of `doc` so the individual sentences are displayed.
list(doc.sents)

[BM HSC could be lentivirally modified with long-term transgene expression without compromising cell differentiation or function as also shown in PB HSC [63].,
 Recently, lentivirally mediated gene therapy was shown to provide clinical benefits in an inheritable fatal demyelinating disease X-linked adrenoleukodystrophy [64].,
 HSC were transduced ex vivo to restore dysfunctional protein function and proper myelin maintenance and transplanted after myeloablative treatment.,
 Even though less than 15% of leucocytes expressed the transgene, the progressive cerebral demyelination ceased.,
 This suggests that introducing genetically modified leucocytes may work as a therapeutic approach, potentially also for other diseases.,
 Microglia [14] and monocyte function [40,65] may be defective in AD.,
 Introducing fresh monocytic cells into the circulation, whether migrating into the brain and acting as BM-derived microglia in the CNS or circulating in the periphery, may contribute to inflammatory

In [None]:
from ast import literal_eval

def deleting_epmc_GPS(list_1,del_name):
    """Remove, in place, every entry of ``list_1`` that contains ``del_name``.

    Parameters
    ----------
    list_1 : list
        Entries to filter. Each entry must support the ``in`` operator
        (the callers in this notebook pass lists describing one tagged entity).
    del_name : object
        Value whose presence in an entry causes that entry to be dropped.

    Returns
    -------
    list
        The same ``list_1`` object, now without the matching entries and with
        the relative order of the remaining entries preserved.
    """
    # Slice assignment rewrites the caller's list in one step. The previous
    # implementation called list_1.remove() while iterating over list_1; each
    # removal shifts the remaining items left, so the entry right after a
    # removed one was never examined and runs of adjacent matches survived.
    list_1[:] = [sub_list for sub_list in list_1 if del_name not in sub_list]
    return list_1

In [None]:
from fuzzywuzzy import fuzz

def remove_FP(epmc_list, ml_json):
    """Keep a 'Gene_Proteins' tag only when the ML tagger confirms it.

    Parameters
    ----------
    epmc_list : list
        Tagged entities for one sentence. For each entry, element ``[1]`` is
        compared as the entity text and element ``[2]`` as its type.
    ml_json : dict
        Must contain the key ``'annotations'``: a (possibly empty) list of
        ``[start_char, end_char, label, text]`` entries, as built by
        ``get_spacy_annotations``.

    Returns
    -------
    list
        All entries that do not contain ``'Gene_Proteins'`` (original order),
        followed by the ``'Gene_Proteins'`` entries whose text reaches a
        ``fuzz.token_set_ratio`` of 100 against at least one ML 'GP' mention.

    Notes
    -----
    Differences from the previous implementation, all of them bug fixes:
    * a confirmed tag is returned once, not once per matching ML mention;
    * unconfirmed 'Gene_Proteins' tags are always dropped (the earlier
      remove-while-iterating helper skipped adjacent entries and let them
      leak back into the result);
    * ``epmc_list`` itself is no longer modified.
    """
    # Texts the ML model labelled as GP; an empty/None annotation list means none.
    ml_gp_texts = [
        annotation[3]
        for annotation in (ml_json['annotations'] or [])
        if annotation[2] == 'GP'
    ]

    non_gp_tags = []
    confirmed_gps = []
    for each_ner in epmc_list:
        if 'Gene_Proteins' not in each_ner:
            # Non-GP tags pass through untouched.
            non_gp_tags.append(each_ner)
        elif each_ner[2] == 'Gene_Proteins' and any(
                fuzz.token_set_ratio(ml_text, each_ner[1]) == 100
                for ml_text in ml_gp_texts):
            # any() stops at the first confirming mention, so no duplicates.
            confirmed_gps.append(each_ner)

    return non_gp_tags + confirmed_gps


In [None]:
def get_spacy_annotations(text_sentence):
    """Tag one sentence with the ``nlp2`` pipeline and collect its entities.

    ``nlp2`` is looked up as a global; it is not defined in this cell.

    Parameters
    ----------
    text_sentence : str
        Sentence text handed to ``nlp2``.

    Returns
    -------
    dict
        ``{'annotations': [...]}`` with one
        ``[start_char, end_char, label_, text]`` list per entity, in the order
        of ``doc.ents``.
    """
    parsed = nlp2(text_sentence)
    return {
        'annotations': [
            [entity.start_char, entity.end_char, entity.label_, entity.text]
            for entity in parsed.ents
        ]
    }

In [None]:
# from tqdm import tqdm
# Write one tab-separated row per test sentence: pmc_id, section, sentence and the
# tag list left after remove_FP has filtered the row's 'ner' tags against the ML output.
with open(result_path + 'spacy_fp_removal_80.tsv', 'w', newline='\n') as f1:
    public_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
    
    for index,row in tqdm(test_df.iterrows(),total = len(test_df)):
        try:
            ml_annotations = get_spacy_annotations(row['sentence'])
            # row['ner'] holds the tag list as a Python-literal string; literal_eval parses it.
            fp_removed = remove_FP(literal_eval(row['ner']), ml_annotations)
        except ValueError:
            # Any ValueError raised inside the try body (tagging, parsing or filtering)
            # results in an empty tag field for this row.
            # NOTE(review): literal_eval can also raise SyntaxError, which is not caught
            # here — confirm whether such rows can occur.
            fp_removed =''
            
        public_writer.writerow([row['pmc_id'], row['section'],row['sentence'], fp_removed])   
            
    

In [None]:
# Convert to IOB format

import sys
import os
import glob

sys.path.append('/mnt/droplet/nfs/gns/literature/Santosh_Tirunagari/test Gitlab/epmc-ml-misc-library/')

import capo_tools_lib
import evaluation_epmc_lib




In [None]:
# Output directory for the IOB files: an 'iob/' sub-folder of result_path, created
# (including missing parents) if it does not exist yet.
iob_result_path = result_path+'iob/'
pathlib.Path(iob_result_path).mkdir(parents=True, exist_ok=True)

# Pass the TSV written by the earlier cell to capo_tools_lib, together with the output
# directory and the output file name 'spacy_fp_removal_iob.tsv'.
file_path = result_path + 'spacy_fp_removal_80.tsv'
capo_tools_lib.annotations_api_tagged_sentences_to_IOB(file_path,
                                                       iob_result_path,'spacy_fp_removal_iob.tsv')

In [None]:
import metrics.ner as ner_metrics


# precision	0.7	0.7	0.72	0.73
# recall	0.53	0.53	0.54	0.55
# f1 score	0.6	0.6	0.62	0.6

#
# print(ner_metrics.semeval_scores_report(gold=epmc_labels, response=ml_labels, digits=2))

# Hard-coded location of the gold file (epmc_path is passed as gold_path below).
root_path = '/mnt/droplet/nfs/gns/literature/machine-learning/'
epmc_path = root_path+'Datasets/NER_Datasets/EBI_standard-IOB/test.csv'
# Entity tags that are reported, one report per tag.
all_tags = ['GP', 'DS', 'OG']

print('################ Annotation Pipeline Results ########################')
# IOB file produced from the FP-filtered annotations; used as the response to score.
CAPO_path = iob_result_path+'spacy_fp_removal_iob.tsv'
for each_tag in all_tags:
    print('############ '+each_tag+' ####################')
    print('\n')
    # targets=[each_tag] restricts the report to the current tag only.
    print(ner_metrics.semeval_report(gold_path=epmc_path, response_path=CAPO_path, targets=[each_tag]))