# FAIR Text Metadata generator
## Processes XML files in current directory to generate JSON metadata
### VERSION 1.0
### DATE 5 August 2021
### AUTHOR Tim Miller

In [1]:
# IMPORTS
from termite_toolkit import termite 
from os import listdir
from pprint import pprint
import xml.etree.ElementTree as ET
import json

# CONFIGURATION
# Comma-separated entity type codes requested from TERMite.
core_entities = "ANAT,BIOCHEM,BIOPROC,CELLLINE,CELLTYP,COMPANY,DBSNP,DRUG,GENE,GOONTOL,INDICATION,MEASURE,MIRNA,MOA,PROTYP,SPECIES"
# Options dictionary handed to termite.annotate_files for every document.
options = dict(format="any.xml", output="json", entities=core_entities)
# URL passed to termite.annotate_files as the service endpoint.
termite_home = "https://termite.scibite-mvp.nonprod.entellect.com/termite"
# Names of the XML files found in the current working directory.
sdxml = list(filter(lambda name: name.endswith(".xml"), listdir()))

In [2]:
def subset_dict(wanted_keys, input_dict):
    """Project every record of a sequence onto a chosen set of keys.

    Despite its name, ``input_dict`` is an iterable of dictionaries and the
    result is a list with one dictionary per input record.

    Args:
        wanted_keys: Keys to copy, in the order they should appear.
        input_dict: Iterable of dictionaries to project.

    Returns:
        A list of dictionaries. Each contains only the wanted keys that
        were present in the corresponding record, with the key name passed
        through ``str.title()`` (e.g. ``'start'`` becomes ``'Start'``).
    """
    return [
        {key.title(): record[key] for key in wanted_keys if key in record}
        for record in input_dict
    ]

In [3]:
# Namespace prefix -> URI map used by the XPath queries in this file.
ns = {
    'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
    'ja': 'http://www.elsevier.com/xml/ja/dtd',
    'ce': 'http://www.elsevier.com/xml/common/dtd',
}


def _child_text(element, path):
    """Return the text of the first child matching *path*, or None if absent."""
    child = element.find(path, ns)
    return None if child is None else child.text


def xpath2string(value, method='normal'):
    """Convert a list of matched XML elements into a JSON-friendly value.

    Args:
        value: Iterable of ``xml.etree.ElementTree.Element`` objects, such
            as the result of ``findall``. ``None`` or any other
            non-iterable is treated as "no matches".
        method: How to render the elements:
            ``'normal'`` - join each element's own ``.text`` with ``" : "``;
            ``'greedy'`` - join the full text content of each element
            (descendants included, as serialised by
            ``ET.tostring(method='text')``) with a single space;
            ``'names'`` - treat each element as an author and read its
            ``ce:surname`` and ``ce:given-name`` children.

    Returns:
        A string for ``'normal'`` and ``'greedy'`` (``''`` when nothing
        matched), a list of ``{'surname': ..., 'forename': ...}``
        dictionaries for ``'names'``, and ``None`` for any other method.
    """
    try:
        elements = list(value)
    except TypeError:
        # None (or another non-iterable) simply means nothing was matched.
        elements = []

    if method == 'normal':
        # An element whose .text is None (empty, or starting with a child
        # element) is skipped so that it cannot blank out the other values.
        return " : ".join(e.text for e in elements if e.text is not None)

    if method == 'greedy':
        try:
            return " ".join(
                ET.tostring(e, method='text').decode("utf-8") for e in elements
            )
        except AttributeError:
            # Items that are not Elements cannot be serialised.
            return ''

    if method == 'names':
        # Authors may lack a given name or surname; a missing part is
        # reported as None rather than raising AttributeError.
        return [
            dict(
                surname=_child_text(e, './ce:surname'),
                forename=_child_text(e, './ce:given-name'),
            )
            for e in elements
        ]

    # Unknown method: keep the historical behaviour of returning None.
    return None
        
def process_taxon(taxon):
    """
    Flatten a TERMite hierarchy mapping into a list of tree-path records.

    ``taxon`` is a dictionary whose values are themselves dictionaries; the
    outer keys are ignored. Every (key, value) pair of each inner dictionary
    becomes one ``{'TreePathCode': key, 'TreePath': value}`` record, in
    iteration order.
    """
    flattened = []
    for tree in taxon.values():
        for code, path in tree.items():
            flattened.append({'TreePathCode': code, 'TreePath': path})
    return flattened
        
# Metadata fields read straight from the article XML, in output order:
# (output key, XPath evaluated against the document, xpath2string method).
_METADATA_FIELDS = (
    ('Authors', './/ce:author-group/ce:author', 'names'),
    ('PublicationDate', './/xocs:meta/xocs:available-online-date', 'normal'),
    ('Affiliations', './/ce:affiliation/ce:textfn', 'normal'),
    ('Abstract', './/ce:abstract-sec/ce:simple-para', 'greedy'),
    ('Title', './/ce:title', 'greedy'),
    ('JournalTitle', './/xocs:srctitle', 'normal'),
    ('DOI', './/xocs:meta/xocs:doi', 'normal'),
    ('ISSN', './/xocs:issn-primary-unformatted', 'normal'),
    ('CID', './/xocs:meta/xocs:cid', 'normal'),
)

# Key in a TERMite hit -> key used in the generated JSON. The dictionary
# order is also the order of the keys in each output entity.
_ENTITY_KEY_NAMES = {
    'entityType': 'EntityType',
    'hitCount': 'Frequency',
    'name': 'PreferredName',
    'hitID': 'ID',
    'taxon': 'HigherTerms',
    'exact_array': 'Locations',
    'sectionMeta': 'XPaths',
}

# (substring looked for in an XPath, section label). Checked in this order;
# the first match wins, so 'reference' takes precedence over 'title', etc.
_SECTION_MARKERS = (
    ('reference', 'REFERENCES'),
    ('title', 'TITLE'),
    ('abstract', 'ABSTRACT'),
    ('article', 'ARTICLE'),
)


def _section_labels(xpath_records):
    """Return the sorted, de-duplicated section labels for a hit's XPaths."""
    labels = set()
    for record in xpath_records:
        for marker, label in _SECTION_MARKERS:
            if marker in record['Path']:
                labels.add(label)
                break
    # Sorted so the JSON output is identical from one run to the next
    # (plain set iteration order varies with string hash randomisation).
    return sorted(labels)


def _build_entity(hit):
    """Reshape one TERMite hit dictionary into the output entity record."""
    entity = {
        new_key: hit[old_key]
        for old_key, new_key in _ENTITY_KEY_NAMES.items()
        if old_key in hit
    }
    # Every source key is optional, so only post-process what is present.
    if 'Locations' in entity:
        entity['Locations'] = subset_dict(
            ['sentence', 'start', 'end'], entity['Locations']
        )
    if 'XPaths' in entity:
        # NOTE(review): the last item/character of each path is dropped, as
        # in the original code - presumably a trailing marker; confirm
        # against the TERMite sectionMeta format.
        entity['XPaths'] = [
            dict(Offset=offset, Path=path[:-1])
            for offset, path in entity['XPaths'].items()
        ]
    entity['Sections'] = _section_labels(entity.get('XPaths', []))
    if 'HigherTerms' in entity:
        entity['HigherTerms'] = process_taxon(entity['HigherTerms'])
    return entity


def tag_using_etree():
    """Write a JSON metadata file for every XML document listed in ``sdxml``.

    For each file name in the module-level ``sdxml`` list the function:

    1. parses the XML and extracts the bibliographic fields listed in
       ``_METADATA_FIELDS`` using ``xpath2string``;
    2. sends the file to TERMite through ``termite.annotate_files`` (using
       the module-level ``termite_home`` and ``options``) and reshapes every
       returned hit with ``_build_entity``;
    3. writes the combined dictionary to ``<name>.json``, where ``<name>``
       is the XML file name without its last four characters (``.xml``).

    Returns:
        None. The result is the set of JSON files written to disk.

    Raises:
        Whatever ``ET.parse``, the TERMite client or ``open`` raise; a
        ``KeyError`` if the TERMite response lacks
        ``RESP_MULTIDOC_PAYLOAD`` or an entry for the file.
    """
    for fn in sdxml:
        tree = ET.parse(fn)
        stem = fn[:-4]  # sdxml only holds names ending in ".xml"

        metadata = {'PII': stem}
        for key, xpath, method in _METADATA_FIELDS:
            metadata[key] = xpath2string(tree.findall(xpath, ns), method=method)

        termite_response = termite.annotate_files(termite_home, fn, options)
        # The payload is iterated as {entity type: [hit, ...]}.
        payload = termite_response['RESP_MULTIDOC_PAYLOAD'][fn]

        entities = []
        for hits in payload.values():
            for hit in hits:
                entities.append(_build_entity(hit))
        metadata['Entities'] = entities

        with open(stem + '.json', 'w', encoding='utf-8') as outfile:
            json.dump(metadata, outfile, indent=4)

# Run the pipeline: one <name>.json is written for every *.xml file that was
# found in the current working directory.
tag_using_etree()