In [1]:
# lxml has cdata type
from lxml import etree

In [2]:
root = etree.Element('wfu')
text = etree.SubElement(root, 'TEXT')
text.text = 'i am here'
tags = etree.SubElement(root, 'TAGS')

In [3]:
def prettyprint(element, **kwargs):
    xml = etree.tostring(element, pretty_print=True, **kwargs)
    print(xml.decode(), end='')

In [4]:
prettyprint(root)

<wfu>
  <TEXT>i am here</TEXT>
  <TAGS/>
</wfu>


In [5]:
text.text = etree.CDATA('i am here')

In [6]:
prettyprint(root)

<wfu>
  <TEXT><![CDATA[i am here]]></TEXT>
  <TAGS/>
</wfu>


In [7]:
import pandas as pd
import json

In [8]:
dat = pd.read_csv('wfudata/wfu_annotated.csv')

In [9]:
types_set = set()

for record in dat['JSON_DATA']:
    record = json.loads(record)
    types = [x['type'] for x in record['asets']]
    types_set = types_set.union(set(types))

In [10]:
types_set

{'AGE',
 'DATE',
 'EMAIL',
 'HOSPITAL',
 'IDNUM',
 'INITIALS',
 'IPADDRESS',
 'LOCATION',
 'NAME',
 'OTHER',
 'PHONE',
 'SEGMENT',
 'URL',
 'lex',
 'zone'}

In [11]:
types_to_include = set([
 'AGE',
 'DATE',
 'EMAIL',
 'HOSPITAL',
 'IDNUM',
 'INITIALS',
 'IPADDRESS',
 'LOCATION',
 'NAME',
 'OTHER',
 'PHONE',
 'URL',
])

In [12]:
def prepare_tag(id, start, end, text, etype):
    element = etree.Element(etype, id='P'+str(id), start=str(start), end=str(end), text=text, TYPE=etype, comment='')
    return element

In [13]:
ele = prepare_tag(0, 0, 15, 'abc', 'DATE')
etree.tostring(ele, pretty_print=True)

b'<DATE id="P0" start="0" end="15" text="abc" TYPE="DATE" comment=""/>\n'

In [14]:
ele.attrib['start']

'0'

In [15]:
record = json.loads(dat['JSON_DATA'][0])

In [16]:
record['signal'][18:35], record['signal'][913:940]

(':  S12-24679\nRECE', 'N , MD10/24/2012\n14:19:16**')

In [17]:
elements = []

for x in record['asets']:
    if x['type'] in types_to_include:
        for annot in x['annots']:
            start = annot[0]
            end = annot[1]
            elements.append(prepare_tag(len(elements), start, end, record['signal'][start:end], x['type']))

for x in sorted(elements, key=lambda x: x.attrib['start']):
    tags.append(x)

In [18]:
prettyprint(root)

<wfu>
  <TEXT><![CDATA[i am here]]></TEXT>
  <TAGS>
    <NAME id="P6" start="112" end="126" text="MILLER, WARD M" TYPE="NAME" comment=""/>
    <NAME id="P8" start="1181" end="1196" text="Dr. Sirintrapun" TYPE="NAME" comment=""/>
    <DATE id="P4" start="1200" end="1228" text="October 19, 2012 at 12:25 PM" TYPE="DATE" comment=""/>
    <NAME id="P9" start="1302" end="1325" text="SAHUSSAPONT SIRINTRAPUN" TYPE="NAME" comment=""/>
    <NAME id="P10" start="1776" end="1789" text="Sharon N Sims" TYPE="NAME" comment=""/>
    <IDNUM id="P0" start="21" end="30" text="S12-24679" TYPE="IDNUM" comment=""/>
    <DATE id="P2" start="41" end="51" text="10/19/2012" TYPE="DATE" comment=""/>
    <IDNUM id="P1" start="681" end="690" text="P12-16687" TYPE="IDNUM" comment=""/>
    <NAME id="P5" start="73" end="91" text="ADRIAN LUCIAN LATA" TYPE="NAME" comment=""/>
    <NAME id="P7" start="901" end="914" text="SIMON BERGMAN" TYPE="NAME" comment=""/>
    <DATE id="P3" start="917" end="938" text="MD10/24/2012&