# Visualizing RST structures stored in GeM corpora

## 1. Import the necessary packages.

In [167]:
# For parsing XML
import xml.etree.ElementTree as et

# For drawing the graphs
import pygraphviz as pgv

# For formatting the content
import textwrap

## 2. Parse the GeM XML files.

In [168]:
basefile = et.parse('test_xml/2002-she-base-1.xml') # Base layer
rstfile = et.parse('test_xml/2002-she-rst-1.xml') # RST layer

Get the root elements.

In [169]:
baseroot = basefile.getroot()
rstroot = rstfile.getroot()

Parse the base units.

In [170]:
base_units = {} # Set up an empty dictionary

for unit in baseroot:
    unit_id = unit.attrib['id']
    if 'alt' in unit.attrib:
        unit_content = unit.attrib['alt']
    else:
        unit_content = textwrap.fill(''.join(unit.itertext()), 35)
        
    base_units[unit_id] = unit_content # Populate dictionary with key (id) and value (content) pairs

Parse the RST units.

In [171]:
rst_units = {}

for rstunit in rstroot[0]: # [0] to access the nested <segment> element
    rstunit_id = rstunit.attrib['id']
    rstunit_xref = rstunit.attrib['xref']
    rstunit_content = base_units[rstunit_xref]
    rst_units[rstunit_id] = rstunit_content

Set up graph.

In [172]:
rst_graph = pgv.AGraph(directed = True, strict = False, ranksep='1.0', rankdir="TB")

Parse the RST relations.

In [173]:
rst_relations = {}

for span in rstroot[1]: # [1] to access the nested <rst-structure> element
    rst_relations[span.attrib['id']] = span.attrib['relation']

Parse the RST spans.

In [174]:
rst_graph.add_nodes_from(rst_relations) # add relations
rst_graph.add_nodes_from(rst_units) # add segments

Set node properties for relations and segments.

In [175]:
for node in rst_graph.nodes():
    if node in rst_units:
        rst_graph.get_node(node).attr['label'] = rst_units[node]
        rst_graph.get_node(node).attr['shape'] = 'box'
        rst_graph.get_node(node).attr['fontsize'] = '9.0'
    if node in rst_relations:
        rst_graph.get_node(node).attr['label'] = rst_relations[node]
        rst_graph.get_node(node).attr['shape'] = 'none'
        rst_graph.get_node(node).attr['style'] = 'filled'
        rst_graph.get_node(node).attr['fillcolor'] = 'gray82'
        rst_graph.get_node(node).attr['fontcolor'] = 'crimson'

Add edges.

In [176]:
for span in rstroot[1]:
    if span.tag == 'multi-span':
        multispan, nuclei, relation = span.attrib['id'], span.attrib['nuclei'].split(), span.attrib['relation']
        if span.attrib['id'] == multispan:
            for n in nuclei:
                rst_graph.add_edge(multispan, n, label='n')
    if span.tag == 'span':
        spanid, nucleus, satellites, relation = span.attrib['id'], span.attrib['nucleus'].split(), span.attrib['satellites'].split(), span.attrib['relation']
        if span.attrib['id'] == spanid:
            for n in nucleus:
                rst_graph.add_edge(spanid, n, label='n', dirType='forward', rankType='same')
            for s in satellites:
                rst_graph.add_edge(s, spanid, label='s', dirType='none', rankType='same')
    for title in span.iter('title'):
        target_id = span.attrib['id']
        title_xref = title.attrib['xref']
        if span.attrib['id'] == target_id:
            rst_graph.add_edge(title_xref, target_id, label='title')

In [177]:
rst_graph.draw("test.png", format = "png", prog = 'dot')

In [178]:
print rst_graph.string()

digraph {
	graph [rankdir=TB,
		ranksep=1.0
	];
	node [label="\N"];
	"span-10.02"	 [fillcolor=gray82,
		fontcolor=crimson,
		label=elaboration,
		shape=none,
		style=filled];
	"s-10.04"	 [fontsize=9.0,
		label="Parks and gardens",
		shape=box];
	"span-10.02" -> "s-10.04"	 [dirType=forward,
		label=n,
		rankType=same];
	"span-10.03"	 [fillcolor=gray82,
		fontcolor=crimson,
		label=elaboration,
		shape=none,
		style=filled];
	"span-10.03" -> "span-10.02"	 [dirType=none,
		label=s,
		rankType=same];
	"s-10.05"	 [fontsize=9.0,
		label="Helsinki has dozens of parks and
green areas which cover about 5,000
hectares in all.",
		shape=box];
	"span-10.03" -> "s-10.05"	 [dirType=forward,
		label=n,
		rankType=same];
	"span-10.01"	 [fillcolor=gray82,
		fontcolor=crimson,
		label=joint,
		shape=none,
		style=filled];
	"span-10.01" -> "span-10.02"	 [label=title];
	"s-10.02"	 [fontsize=9.0,
		label="Summer oases",
		shape=box];
	"span-10.01" -> "s-10.02"	 [label=n];
	"s-10.03"	 [fontsize=9.0,
		label