# Visualizing RST structures in GeM corpora

## 1. Import the necessary packages.

In [1]:
# For parsing XML
import xml.etree.ElementTree as et

# For drawing the graphs
import pygraphviz as pgv

## 2. Parse the GeM XML files.

In [2]:
# Parse the two GeM XML layers into ElementTree objects; the paths are
# relative to the current working directory.
basefile = et.parse('test_xml/2002-she-base-1.xml') # Base layer
rstfile = et.parse('test_xml/2002-she-rst-1.xml') # RST layer

Get the root elements.

In [3]:
# Root element of the base layer and of the RST layer, respectively.
baseroot, rstroot = basefile.getroot(), rstfile.getroot()

Parse the base units.

In [4]:
# Map each base unit's id to its content: the value of the 'alt' attribute
# when the unit has one, otherwise the element's text.
base_units = {
    element.attrib['id']: element.get('alt', element.text)
    for element in baseroot
}

Parse the RST units.

In [5]:
# Map each RST unit's id to the content of the base unit named by its 'xref'
# attribute. rstroot[0] is the nested <segment> element holding the units.
rst_units = {
    segment.attrib['id']: base_units[segment.attrib['xref']]
    for segment in rstroot[0]
}

Parse the RST spans.

In [12]:
relations = []  # (span id, relation name) pairs, one per span / multi-span
edges = []      # (tail id, head id) pairs used later to build the graph

# rstroot[1] is the second child of the RST root; its children are the spans.
for span in rstroot[1]:
    if span.tag == 'multi-span':
        # A multi-span has several nuclei and no satellites.
        span_id = span.attrib['id']
        nuclei = span.attrib['nuclei'].split()
        relation = span.attrib['relation']

        relations.append((span_id, relation))
        # Edges run from the span to each of its nuclei.
        edges.extend((span_id, nucleus) for nucleus in nuclei)
    elif span.tag == 'span':
        # 'nucleus' and 'satellites' are whitespace-separated id lists.
        span_id = span.attrib['id']
        nuclei = span.attrib['nucleus'].split()
        satellites = span.attrib['satellites'].split()
        relation = span.attrib['relation']

        relations.append((span_id, relation))
        # Edges run from the span to its nucleus ...
        edges.extend((span_id, nucleus) for nucleus in nuclei)
        # ... and from each satellite to the span.
        edges.extend((satellite, span_id) for satellite in satellites)

    # Any <title> found under this element (whatever the element's tag) is
    # linked to it: the edge runs from the title's xref to the span.
    for title in span.iter('title'):
        target_id = span.attrib['id']
        edges.append((title.attrib['xref'], target_id))

## 3. Draw a graph of RST relations

Set up a directed graph.

In [13]:
# AGraph is undirected unless directed=True is passed. The edge tuples are
# built with a deliberate orientation (satellite -> span -> nucleus), so the
# graph has to be directed for that orientation to be kept and drawn.
# strict=False allows parallel edges and self-loops.
rst_graph = pgv.AGraph(strict=False, directed=True)
rst_graph.add_edges_from(edges)
# Compute node positions with Graphviz's hierarchical 'dot' layout.
rst_graph.layout(prog='dot')

In [15]:
# For every graph node that is a span, print its id and relation name.
for node in rst_graph.nodes():
    for span_id, relation in relations:
        if span_id == node:
            # A single pre-formatted argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print('%s %s' % (node, relation))

span-10.01 joint
span-10.02 elaboration
span-10.03 elaboration
span-10.04 preparation
span-10.05 restatement
span-10.06 elaboration
span-10.07 joint
span-10.08 elaboration


In [9]:
# List the nodes currently in the graph (shown as the cell's output).
rst_graph.nodes()

[u'span-10.01',
 u's-10.02',
 u's-10.03',
 u'span-10.02',
 u's-10.04',
 u'span-10.03',
 u's-10.05',
 u's-10.06',
 u'span-10.04',
 u's-10.08',
 u's-10.07',
 u'span-10.05',
 u's-10.01',
 u's-10.11-m',
 u'span-10.06',
 u'span-10.07',
 u's-10.09',
 u's-10.10',
 u'span-10.08',
 u's-10.11',
 u's-10.14',
 u's-10.12',
 u's-10.13']

In [10]:
# Render the graph to the file test.png in PNG format.
rst_graph.draw("test.png", format = "png")