# Visualizing RST structures in GeM corpora

## 1. Import the necessary packages.

In [1]:
# For parsing XML
import xml.etree.ElementTree as et

# For drawing the graphs
import pygraphviz as pgv

## 2. Parse the GeM XML files.

In [2]:
# Read the two annotation layers of the same test document into ElementTree
# objects; the paths are relative to the notebook's working directory.
basefile, rstfile = (
    et.parse('test_xml/2002-she-base-1.xml'),  # Base layer
    et.parse('test_xml/2002-she-rst-1.xml'),   # RST layer
)

Get the root elements.

In [3]:
# Keep a handle on the top-level element of each parsed layer; the cells
# below iterate over (children of) these roots.
baseroot, rstroot = basefile.getroot(), rstfile.getroot()

Parse the base units.

In [4]:
# Map every direct child of the base-layer root to its content, keyed by the
# child's 'id' attribute. The 'alt' attribute is used as content when the
# element has one; otherwise the element's own text is used.
base_units = {
    unit.attrib['id']: (unit.attrib['alt'] if 'alt' in unit.attrib else unit.text)
    for unit in baseroot
}

Parse the RST units.

In [5]:
# Resolve each RST segment to the content of the base unit it points at.
# rstroot[0] is the first child of the RST root; each of its children carries
# an 'id' and an 'xref' attribute, and 'xref' must be a key of base_units
# (a KeyError is raised otherwise).
rst_units = {
    segment.attrib['id']: base_units[segment.attrib['xref']]
    for segment in rstroot[0]
}

Parse the RST spans.

In [6]:
# rst_spans maps an id to a 4-tuple: (type, relation, nucleus-or-nuclei, satellites).
# 'None' (a string) is stored where a field does not apply, matching what the
# graph-drawing cell expects.
rst_spans = {}

for span in rstroot[1]: # [1] to access the second child of the RST root (the RST structure)
    span_id = span.attrib['id']

    # Record the span itself. The checks are substring tests on the tag, so
    # 'multi-span' has to be tested before the more general 'span'.
    if 'multi-span' in span.tag: # Multinuclear span: several nuclei, no satellites
        rst_spans[span_id] = ('multinuclear', span.attrib['relation'],
                              span.attrib['nuclei'], 'None')
    elif 'span' in span.tag: # Mononuclear span: one nucleus plus satellites
        rst_spans[span_id] = ('mononuclear', span.attrib['relation'],
                              span.attrib['nucleus'], span.attrib['satellites'])
    # Any other element gets no span record. Previously such an element either
    # raised a NameError or silently reused the values of the preceding span.

    # Record titles found on or below this element. Each title gets a key of
    # its own: storing it under the span's id (as was done before) overwrote
    # the span record written above, which is what broke the title segment.
    # NOTE(review): the title node is not linked to its parent span in the
    # graph; confirm whether such an edge is wanted.
    for position, t in enumerate(span.iter('title'), 1): # Find title spans
        title_id = t.get('id') or '{0}-title-{1}'.format(span_id, position)
        title_xref = t.attrib['xref']
        rst_spans[title_id] = 'title', 'title', title_xref, 'None'

<img src="title_broken.png">

## 3. Draw a graph of RST relations

Set up a directed graph.

In [7]:
# Create the (still empty) graph that will hold the RST structure.
# directed=True makes edges point from a relation node to its nuclei and
# satellites; AGraph is undirected unless this is requested explicitly.
# strict=False permits parallel edges and self-loops.
# No layout is computed here: the graph has no nodes yet, and the draw() call
# at the end of the notebook runs the layout program it is given.
rst_graph = pgv.AGraph(strict=False, directed=True)

In [8]:
# Add one node per recorded span/title and connect it to the units it covers.
# value is the 4-tuple built when parsing the spans:
# (type, relation, nucleus-or-nuclei, satellites); the id fields are
# whitespace-separated lists of ids.
# .items() is used instead of .iteritems() so the cell runs on Python 3 as
# well as Python 2.
for key, value in rst_spans.items():
    rst_graph.add_node(key) # Add relation nodes

    span_type = value[0]
    if span_type == 'mononuclear':
        # Nucleus first, then satellites
        targets = value[2].split() + value[3].split()
    elif span_type in ('multinuclear', 'title'):
        # Only the third field holds ids here; the fourth is the 'None' placeholder
        targets = value[2].split()
    else:
        targets = []

    for target in targets:
        rst_graph.add_edge(key, target)

In [9]:
# Lay the graph out with the "dot" program and write it to test.png, which the
# image tag below displays.
rst_graph.draw(
    "test.png",
    format="png",
    prog="dot",
)

<img src="test.png">