# Describe GeM corpora

This notebook extracts basic statistics from GeM-annotated corpora. 

## 1. Import the necessary packages.

In [None]:
# Standard library
import csv
import glob
import os
import sys
from collections import Counter

# For parsing XML
from lxml import etree as et

## 2. Locate the GeM XML files.

Enter the path to the directory that contains the GeM corpus in the cell below.

In [None]:
# Directory that is searched for the GeM XML files (*.xml)
gem_dir = "test_xml"

Collect the GeM base layer files, which are recognised by the string "base" in their filenames.

In [None]:
# Collect the base layer files found in the GeM corpus directory.
#
# Only the filename is tested for the string "base": testing the whole path
# would select every XML file whenever the directory name itself contains
# "base" (e.g. "database/").
gembase = [
    path for path in glob.glob(gem_dir + "/*.xml")
    if "base" in os.path.basename(path)
]

## 3. Parse GeM annotation

In [None]:
# Initialize a dictionary for storing the statistics, keyed by base filename
corpus = {}

# Map the tag of each RST span type to the key used in the per-structure stats
span_keys = {'span': 'monospans', 'multi-span': 'multispans', 'mini-span': 'minispans'}

# Loop over the base files found in the directory
for bf in gembase:

    # Separate the directory from the filename, so that the identifier and the
    # names of the RST and layout files are derived from the filename alone.
    # Splitting on '/' or replacing "base" in the whole path gives wrong
    # results for nested directories, other path separators, or directory
    # names that contain "base".
    directory, identifier = os.path.split(bf)

    # Assign base, rst and layout files to variables
    basefile = bf
    rstfile = os.path.join(directory, identifier.replace("base", "rst"))
    layfile = os.path.join(directory, identifier.replace("base", "layout"))

    # Print status (single-argument call works on both Python 2 and 3)
    print("Parsing {} ...".format(basefile))

    # BASE LAYER
    # ----------
    # Parse the base layer file and get XML root
    baseroot = et.parse(basefile).getroot()

    # Initialize empty lists for different base unit types
    baseunits, visual_units, embedded_units = [], [], []

    # Find all base units directly below the root node
    for unit in baseroot.findall('./unit'):

        # Append all base units below the root node to the list of base units
        baseunits.append(unit.attrib['id'])

        # Base units with an 'alt' attribute are counted as visual base units
        if 'alt' in unit.attrib:
            visual_units.append(unit.attrib['id'])

        # Iterating over an element yields its direct children (this replaces
        # the deprecated getchildren() method)
        for em_unit in unit:
            # Add visual base units to their own list
            if 'alt' in em_unit.attrib:
                visual_units.append(em_unit.attrib['id'])
            # Add embedded base units to their own list
            embedded_units.append(em_unit.attrib['id'])

    # Store the base layer statistics into the corpus dictionary:
    # [base units, visual base units, embedded base units]
    corpus[identifier] = {'basestats': [len(baseunits), len(visual_units), len(embedded_units)]}

    # RST LAYER
    # ---------
    # Parse the RST layer file and get XML root
    rstroot = et.parse(rstfile).getroot()

    # Initialize empty lists for different types of RST segments
    rst_segments, visual_segments = [], []

    # Use a set for the membership test below; the list is kept for counting
    visual_ids = set(visual_units)

    # Find all RST segments
    for segment in rstroot.findall('./segmentation/*'):

        # Append all RST segments to the list of RST segments
        rst_segments.append(segment.attrib['id'])

        # Check if the RST segment cross-references a visual base unit
        if segment.attrib['xref'] in visual_ids:
            visual_segments.append(segment.attrib['id'])

    # Store the RST layer statistics into the corpus dictionary:
    # [RST segments, RST segments that refer to visual base units]
    corpus[identifier].update({'rst_stats': [len(rst_segments), len(visual_segments)]})

    # Relations are collected over all RST structures in the file, whereas the
    # span counts are kept separately for each structure
    relations, rst_structures = [], {}

    # Loop over each RST structure, numbering them from 1
    for number, structure in enumerate(rstroot.findall('.//rst-structure'), 1):

        # Start the counts from zero for every structure; otherwise each
        # structure would also include the spans of all preceding structures
        span_counts = dict.fromkeys(span_keys.values(), 0)

        # Loop over the spans in each structure
        for span in structure:
            span_key = span_keys.get(span.tag)

            # Skip children that are not spans
            if span_key is None:
                continue

            span_counts[span_key] += 1
            relations.append(span.attrib['relation'])

        # Store the stats for this RST structure
        rst_structures['rst_structure_' + str(number)] = span_counts

    # Store the per-structure stats and the relation frequencies
    # (a list of (relation, count) pairs, most common first)
    corpus[identifier].update({'rst_structures': rst_structures, 'rst_relations': Counter(relations).most_common()})

    # LAYOUT LAYER
    # ------------
    # Parse the layout layer file and get XML root
    layroot = et.parse(layfile).getroot()

    # Initialize empty lists for layout description
    layout_units, embedded_layout_units = [], []

    # Find all layout units
    for layunit in layroot.findall('./segmentation/*'):

        # Append all layout units to the list of layout units
        layout_units.append(layunit.attrib['xref'])

        # Add the children of the layout unit to the embedded layout units
        for em_lunit in layunit:
            embedded_layout_units.append(em_lunit.attrib['id'])

    # Store the layout layer statistics into the corpus dictionary:
    # [layout units, embedded layout units]
    corpus[identifier].update({'layout_segmentation': [len(layout_units), len(embedded_layout_units)]})

## 4. Write data to a CSV file

In [None]:
# Open csv file. The csv module expects a text-mode file opened with
# newline='' on Python 3, but a binary-mode file on Python 2.
if sys.version_info[0] >= 3:
    csvfile = open('stats.csv', 'w', newline='')
else:
    csvfile = open('stats.csv', 'wb')

# The with statement closes the file, so no explicit close() is needed
with csvfile:
    writer = csv.writer(csvfile, delimiter='\t')

    # Write header row
    writer.writerow([
        ' ',
        'Base units',
        'Visual base units',
        'Embedded base units',
        'Layout units',
        'Embedded layout units',
        'RST segments',
        'Visual RST segments',
        'Unique RST relations',
        'RST structures',
    ])

    # Write one row per parsed file, sorted by identifier so that the
    # output is the same on every run
    for key, value in sorted(corpus.items()):
        base_count, visual_count, embedded_count = value["basestats"]
        layout_count, embedded_layout_count = value["layout_segmentation"]
        segment_count, visual_segment_count = value["rst_stats"]

        writer.writerow([
            key,
            base_count,
            visual_count,
            embedded_count,
            layout_count,
            embedded_layout_count,
            segment_count,
            visual_segment_count,
            # Each entry in rst_relations is one distinct relation
            len(value["rst_relations"]),
            len(value["rst_structures"]),
        ])

print("Wrote stats into stats.csv ...")

## 5. Print RST stats

In [None]:
# Print the frequency of each RST relation for every parsed file.
# Each print call takes a single pre-formatted string, which gives the same
# output on Python 2 and Python 3.
for key, value in corpus.items():
    print('Filename: %s \n*** START ***' % (key,))
    # rst_relations holds (relation, count) pairs
    for relation, count in value["rst_relations"]:
        print('%s %s' % (relation, count))
    print("*** END ***")