# In [1]:
# IMPORTS
from termite_toolkit import termite 
from os import listdir
from pprint import pprint
import xml.etree.ElementTree as ET
import json

# VARIABLES
# Base URL of the TERMite server the requests are sent to.
termite_home = "https://termite.scibite-mvp.nonprod.entellect.com/termite"

# Names of the entries in the current working directory that end in ".xml".
sdxml = [entry for entry in listdir() if entry.endswith(".xml")]

# Comma-separated list of the entity types requested from TERMite.
core_entities = ",".join((
    "ANAT", "BIOCHEM", "BIOPROC", "CELLLINE", "CELLTYP", "COMPANY",
    "COUNTRY", "DBSNP", "DRUG", "GENE", "GOONTOL", "INDICATION",
    "MEASURE", "MIRNA", "MOA", "PROTYP", "SPECIES",
))

# Request options bundle (not referenced by the code visible in this file).
options = dict(format="node.xml", output="json", entities=core_entities)

# In [2]:
# Prefix -> namespace URI map handed to ElementTree find()/findall() so the
# path expressions in this file can use the short 'xocs:', 'ja:' and 'ce:' prefixes.
ns = dict(
    xocs='http://www.elsevier.com/xml/xocs/dtd',
    ja='http://www.elsevier.com/xml/ja/dtd',
    ce='http://www.elsevier.com/xml/common/dtd',
)


def xpath2string(value, method='normal'):
        if method == 'normal':
            try:
                valuelist = []
                for v in list(value):
                    valuelist.append(v.text)
                return " : ".join(valuelist)
            except TypeError:
                return ''
        elif method == 'greedy':
            try:
                valuelist = []
                for v in list(value):
                    valuelist.append(ET.tostring(v, method='text').decode("utf-8"))
                    #valuelist.append("".join(map(chr, ET.tostring(v, method='text'))))
                return " ".join(valuelist)
            except AttributeError:
                return ''
        elif method == 'names':
            valuelist = []
            for v in list(value):
                surname_path = v.find('./ce:surname', ns)
                forename_path = v.find('./ce:given-name', ns)
                valuelist.append(dict(surname=surname_path.text, forename=forename_path.text))
            return(valuelist)
        
def tag_using_etree():
    """Tag every file listed in ``sdxml`` with TERMite and write a JSON summary.

    For each XML file the function:
      1. parses it and pulls bibliographic fields out via ``xpath2string``;
      2. sends the serialised document to the TERMite server at
         ``termite_home``, restricted to ``core_entities``;
      3. keeps a fixed subset of fields from every returned hit;
      4. writes the combined record to ``<file name without .xml>.json``
         in the current working directory.

    Returns nothing. Parsing, request and file errors are not caught here,
    so the first failing file stops the run.
    """
    termite_instance = termite.TermiteRequestBuilder()
    termite_instance.set_url(termite_home)
    termite_instance.set_input_format("xml")
    termite_instance.set_entities(core_entities)

    # Hit fields copied to the output; any other field in a hit is dropped.
    wanted_keys = ('entityType', 'hitCount', 'name', 'hitID', 'taxon', 'frag_vector_array')

    # (output key, element path, xpath2string method). The order here is the
    # key order of the written JSON, so keep it stable.
    fields = (
        ('authors', './/ce:author-group/ce:author', 'names'),
        ('published', './/xocs:meta/xocs:available-online-date', 'normal'),
        ('affiliations', './/ce:affiliation/ce:textfn', 'normal'),
        ('abstract', './/ce:abstract-sec/ce:simple-para', 'greedy'),
        ('title', './/ce:title', 'greedy'),
        ('journal', './/xocs:srctitle', 'normal'),
        ('doi', './/xocs:meta/xocs:doi', 'normal'),
        ('issn', './/xocs:issn-primary-unformatted', 'normal'),
    )

    for fn in sdxml:
        # sdxml only holds names ending in ".xml", so dropping the last four
        # characters leaves the bare stem.
        stem = fn[:-4]
        tree = ET.parse(fn)

        metadata = {'pii': stem}
        for key, path, method in fields:
            metadata[key] = xpath2string(tree.findall(path, ns), method=method)

        #TODO explore whether we should come in at serial-article and not root
        xmlstr = ET.tostring(tree.getroot(), encoding='utf8', method='xml')
        termite_instance.set_text(xmlstr)
        termite_response = termite_instance.execute()

        # NOTE(review): the payload is iterated as a mapping whose values are
        # lists of dict-like hits - confirm against the TERMite response schema.
        payload = termite_response['RESP_MULTIDOC_PAYLOAD']['text']
        metadata['entities'] = [
            {k: hit[k] for k in wanted_keys if k in hit}
            for hits in payload.values()
            for hit in hits
        ]

        with open(stem + '.json', 'w') as outfile:
            json.dump(metadata, outfile, indent=4)

# Run the tagging pipeline over the XML files collected in `sdxml`; one JSON
# file is written per input file.
tag_using_etree()