In [1]:
import pandas as pd
import itertools
import numpy as np
import json
import regex as re

In [2]:
def convertJsonLFull(inputPath):
    df = pd.DataFrame(columns=['word','pred_open','pred_close','xml','pred_rel'])
    with open(inputPath,'r') as f:
        result = [json.loads(jline) for jline in f.read().splitlines()]
        for line in result:
            document = [line["doc_key"]]
            wordsList = line["sentences"]
            dfLine = createDf(document,wordsList)
            predictionLists = line["predicted_ner"]
            dfLine = addPredNERToDf(dfLine,predictionLists)
            predRelLists = line["predicted_relations"]
            dfLine = addPredRelToDf(dfLine,predRelLists)
            df = df.append(dfLine, ignore_index=True)
    df = df.drop(df.index[df['word'] == "\n"])
    df = df.drop(df.index[df['word'] == " "])
    df = df.reset_index(drop=True)
    
    # put the closing labels in the right order
    df = df.replace('geogName>','zgeogName>', regex=True)
    # df['pred_open'] = df['pred_open'].map(lambda x: ''.join(sorted(x.split(','),reverse=True)))
    df['pred_close'] = df['pred_close'].map(lambda x: ''.join(sorted(x.split(','))))
    df = df.replace('zgeogName>','geogName>', regex=True)
    
    # put the opening labels in the right order
    df = df.replace('<name','<zname', regex=True)
    df = df.replace('<geogName','<aageogName', regex=True)
    df = df.replace('<geogFeat','<bgeogFeat', regex=True)
    df['pred_open'] = df['pred_open'].map(lambda x: ''.join(sorted(x.split(','))))
    # df['pred_close'] = df['pred_close'].map(lambda x: ''.join(sorted(x.split(','),reverse=True)))
    df = df.replace('<zname','<name', regex=True)
    df = df.replace('<aageogName','<geogName', regex=True)
    df = df.replace('<bgeogFeat','<geogFeat', regex=True)
    
    # fill in the xml column by wrapping the words with their corresponding tags
    df['xml'] = df['pred_open'] + df['word'] + df['pred_close']
    
    # get rid of the surplus first </doc> and add <root> for ENTS
    v = df.at[0,'xml']
    v2 = v.replace("</doc>","\n<root>\n")
    df.at[0,'xml'] = v2
    
    # add a last </doc> to end the document and add </root> for ENTS
    df.at[df.index[-1],'xml'] = df.at[df.index[-1],'xml'] + "</doc>\n</root>"
    
    # write the xml column to an xml file for ENTS
    xml = ' '.join(df['xml'].tolist())
    with open(outputPathEnts,'w',encoding='utf-8') as output:
        output.write("""<?xml version="1.0" encoding="UTF-8"?>""" + xml)
        
    # get rid of the surplus first </doc> and add <root> for RELS
    v = df.at[0,'pred_rel']
    v2 = v.replace("</doc>","\n<root>\n")
    df.at[0,'pred_rel'] = v2
    
    # add a last </doc> to end the document and add </root> for RELS
    df.at[df.index[-1],'pred_rel'] = df.at[df.index[-1],'pred_rel'] + "</doc>\n</root>"
    
    # write the pred_rel column to an xml file for RELS
    xml = ' '.join(df['pred_rel'].tolist())
    with open(outputPathRels,'w',encoding='utf-8') as output:
        output.write("""<?xml version="1.0" encoding="UTF-8"?>""" + xml)
   
    return df


def createDf(document,wordsList):
    df0 = pd.DataFrame(document,columns =['word'])
    df0['pred_open'] = '</doc><doc id="'
    df0['pred_close'] = '">'
    df0['xml'] = ''
    df0['pred_rel'] = '</doc><doc id="' + df0.at[df0.index[0],'word'] + '">'
    words = list(itertools.chain.from_iterable(wordsList))
    df1 = pd.DataFrame(words,index =np.arange(len(words)),columns =['word'])
    df1['pred_open'] = ''
    df1['pred_close'] = ''
    df1['xml'] = ''
    df1['pred_rel'] = ''
    dfTmp = df0.append(df1)
    
    return dfTmp


def addPredNERToDf(dfTmp,predictionLists):
    for predictionList in predictionLists:
        for potentialList in predictionList:
            index0 = potentialList[0]
            index1 = potentialList[1]+1
            label = potentialList[2]
            for idx in range(index0,index1):
                if idx == index0:
                    if dfTmp.at[idx,'pred_open'] == '':
                        dfTmp.at[idx,'pred_open'] = "<" + label + ' id="' + str(index0) + "-" + str(index1-1) + '">'
                    else:
                        dfTmp.at[idx,'pred_open'] = dfTmp.at[idx,'pred_open'] + ',' + "<" + label + ' id="' + str(index0) + "-" + str(index1-1) + '">'
                idx_2_modif = index1 - 1
                if idx == idx_2_modif:
                    if dfTmp.at[idx_2_modif,'pred_close'] == '':
                        dfTmp.at[idx_2_modif,'pred_close'] = "</" + label + ">"
                    else:
                        dfTmp.at[idx_2_modif,'pred_close'] = dfTmp.at[idx_2_modif,'pred_close'] + ',' + "</" + label + ">"
    
    return dfTmp


def addPredRelToDf(dfTmp,predRelLists):
    for predRelList in predRelLists:
        for potentialList in predRelList:
            index0 = potentialList[0]
            index1 = potentialList[1]
            stringA = str(index0) + '-' + str(index1)
            label = potentialList[4]
            index2 = potentialList[2]
            index3 = potentialList[3]
            stringB = str(index2) + '-' + str(index3)
            for idx in range(index0,index1+1):
                if dfTmp.at[idx,'pred_rel'] == '':
                    dfTmp.at[idx,'pred_rel'] = dfTmp.at[idx,'pred_rel'] + '<relation L="' + stringA + '" ' + 'R="' + stringB + '">' + label + '</relation>'
                else:
                    dfTmp.at[idx,'pred_rel'] = dfTmp.at[idx,'pred_rel'] + '; ' + '<relation L="' + stringA + '" ' + 'R="' + stringB + '">' + label + '</relation>'

    return dfTmp


inputPath = './annotated_dataset.json'
outputPathEnts = './annotations_ents.xml'
outputPathRels = './annotations_rels.xml'
dfFull = convertJsonLFull(inputPath)


  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append(df1)
  df = df.append(dfLine, ignore_index=True)
  dfTmp = df0.append