In [1]:
# This script will process patch files to remove GP FPs in job folders on OTAR FullTextLoadings
# (c) EMBL-EBI, Jan 2020
#
# Started: 15 Sept 2020
# Updated: 13 Oct  2020

_author_ = 'Santosh Tirunagari'

import glob
import gzip
from bs4 import BeautifulSoup
import lxml
from collections import defaultdict
from tqdm import tqdm
import requests
import random
import sys
import pathlib
import json


In [2]:
import argparse

# import multiprocessing
from fuzzywuzzy import fuzz

# Output directory for the JSONL files written by process_each_file_in_job.
result_path = '/nfs/production/literature/Santosh_Tirunagari/GP_DS_jsonl_Extracted/'

# Create the output directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell harmless.
pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)

def getfileblocks(file_path):
    """Split a file of concatenated documents into one string per document.

    The file is read line by line. Every line that starts with ``<!DOCTYPE``
    opens a new block; all lines up to (but excluding) the next such line
    belong to that block.

    Args:
        file_path: Path of the text file to split.

    Returns:
        list[str]: The blocks in file order, each including its own
        ``<!DOCTYPE`` line and the original line endings. Non-blank content
        that precedes the first ``<!DOCTYPE`` line is returned as a leading
        block of its own; whitespace-only leading content is dropped. An
        empty file yields ``[]``.
    """
    blocks = []
    pending = []  # lines of the block currently being collected

    def _flush():
        # Only whitespace-only chunks are skipped (e.g. blank lines before the
        # first DOCTYPE); a chunk opened by a DOCTYPE line always has content.
        if any(part.strip() for part in pending):
            # Join once per block instead of growing a string line by line.
            blocks.append(''.join(pending))
        pending.clear()

    with open(file_path, 'r') as fh:
        for line in fh:
            if line.startswith('<!DOCTYPE'):
                _flush()
            pending.append(line)

    _flush()
    return blocks

In [3]:
def get_GP_tags(file_soup):
    """Return the distinct text of every ``z:uniprot`` element (the "GP" tags).

    Args:
        file_soup: Parsed document; any object with a ``find_all(name)``
            method whose results expose ``.text`` (e.g. a BeautifulSoup).

    Returns:
        list[str]: Unique tag texts in order of first appearance. Empty when
        the document has no ``z:uniprot`` elements.
    """
    tag_texts = (each_ztag.text for each_ztag in file_soup.find_all('z:uniprot'))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible across runs (list(set(...)) order depends on
    # string-hash randomisation).
    return list(dict.fromkeys(tag_texts))

In [4]:
def get_DS_tags(file_soup):
    """Return the distinct text of every ``z:efo`` element (the "DS" tags).

    Args:
        file_soup: Parsed document; any object with a ``find_all(name)``
            method whose results expose ``.text`` (e.g. a BeautifulSoup).

    Returns:
        list[str]: Unique tag texts in order of first appearance. Empty when
        the document has no ``z:efo`` elements.
    """
    tag_texts = (each_ztag.text for each_ztag in file_soup.find_all('z:efo'))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible across runs (list(set(...)) order depends on
    # string-hash randomisation).
    return list(dict.fromkeys(tag_texts))

In [5]:
# Input file for this notebook run; passed to process_each_file_in_job below.
file_path = '/nfs/production/literature/shyama/FullText20.09/ML_FP/Annot_PMC13900_PMC548680.xml'

In [6]:
def process_each_file_in_job(each_file_path, output_dir=None):
    """Write the GP and DS tags of every article in one input file as JSONL.

    The input is split with ``getfileblocks``; each block is parsed with
    BeautifulSoup (``lxml`` parser) and one JSON object per block is appended
    to ``tags_<input file stem>.jsonl`` in the output directory.

    Each record has the keys:
        ``pmid``: ``'PMC'`` followed by the text of the first element whose
            ``pub-id-type`` attribute is ``pmcid``, or just ``'PMC'`` when
            no such element exists.
        ``GP``: result of ``get_GP_tags`` for the block.
        ``DS``: result of ``get_DS_tags`` for the block.

    Args:
        each_file_path: Path of the input file to process.
        output_dir: Directory that receives the output file. Defaults to the
            module-level ``result_path``.

    Returns:
        None. The result is the file written to disk.
    """
    article_blocks = getfileblocks(each_file_path)
    if not article_blocks:
        # Nothing to record; do not leave an empty output file behind.
        return

    target_dir = pathlib.Path(result_path if output_dir is None else output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Path.stem strips the real extension, whatever its length, rather than
    # assuming it is exactly three characters long.
    out_path = target_dir / ('tags_' + pathlib.Path(each_file_path).stem + '.jsonl')

    # Open once for the whole input instead of once per article.
    # NOTE: append mode is kept on purpose, so re-running on the same input
    # adds duplicate records to an existing output file.
    with open(out_path, 'at') as f:
        for each_article in tqdm(article_blocks):
            soup = BeautifulSoup(each_article, 'lxml')

            # find() returns None when the article carries no pmcid element;
            # test for that explicitly instead of catching every exception.
            id_tag = soup.find(attrs={"pub-id-type": "pmcid"})
            if id_tag is not None:
                pm_id = 'PMC' + id_tag.text
            else:
                pm_id = 'PMC'

            # Fresh dict per article so no value can leak between records.
            record = {
                'pmid': pm_id,
                'GP': get_GP_tags(soup),
                'DS': get_DS_tags(soup),
            }
            json.dump(record, f)
            f.write('\n')

In [7]:
# Run the extraction on `file_path`; the tqdm bar below advances once per article block.
process_each_file_in_job(file_path)

100%|██████████| 8526/8526 [05:35<00:00, 25.40it/s]
