In [1]:
import xml.etree.ElementTree as ET
import re
import pickle
import json
import os

In [2]:
# Maps each text-bearing element tag to the name of the content bucket
# its text is accumulated under.
tag_to_type_mapping = {
    'bodyText': 'body',
    'listItem': 'body',
    'figureCaption': 'figure_captions',
    'tableCaption': 'table_captions',
    'table': 'tables',
    'figure': 'figures',
    'note': 'note',
    'footnote': 'note',
}

In [3]:
# Ordered (heading_type, pattern) pairs. The first pattern found anywhere in
# the header wins, so e.g. "summary of results" is classified as 'experiment'
# (not 'abstract') and any header mentioning "discussion" is 'experiment'.
_SECTION_HEADING_PATTERNS = (
    ('experiment', re.compile(r'experiment|evaluat|result|empiric|analys|compar|perform|discussion')),
    ('abstract', re.compile(r'abstract|summary')),
    ('introduction', re.compile(r'introduction')),
    ('related_work', re.compile(r'related work|background|previous work')),
    ('conclusion', re.compile(r'conclusion|future work')),
    ('reference', re.compile(r'referenc')),
)


def assign_section_heading_type(sectionHeader):
    """Classify a section header string into a coarse section type.

    Matching is a case-insensitive substring search: the header is lower-cased
    and tested against each keyword pattern in priority order (experiment,
    abstract, introduction, related_work, conclusion, reference).

    Args:
        sectionHeader: header text; may be any case. ``None`` or an empty
            string is accepted and treated as an unrecognised header.

    Returns:
        str: one of 'experiment', 'abstract', 'introduction', 'related_work',
        'conclusion', 'reference', or 'other_sections' when nothing matches.
    """
    if not sectionHeader:
        return 'other_sections'

    # Lower-casing here makes the function usable on raw header text; input
    # that is already lower-case is classified exactly as before.
    header = sectionHeader.lower()

    for heading_type, pattern in _SECTION_HEADING_PATTERNS:
        if pattern.search(header):
            return heading_type

    return 'other_sections'

In [4]:
def sectlabel(tree_root):
    """Collect the text of a labelled document, grouped by section type.

    Walks every child of each ``variant`` element under ``tree_root`` in
    document order. Header elements (any tag containing ``'Header'``) update
    which section types are currently open and at which nesting depth;
    text-bearing elements (the tags listed in ``tag_to_type_mapping``) are
    normalised and appended to the document-wide ``'overall'`` string and to
    every open section.

    Rules applied while walking:
      * A header closes every section open at the same or a deeper level.
      * A header classified as ``'reference'`` closes sections like any other
        header but opens none.
      * Text is credited to ``'other_sections'`` only when no other section
        type is open at that point.
      * Text-bearing elements with no text at all are skipped; a header with
        no text is classified from the empty string.

    Args:
        tree_root: an ``xml.etree.ElementTree.Element`` whose ``variant``
            children contain the labelled elements.

    Returns:
        dict: ``{'overall': str, <section>: {<kind>: str, ...}, ...}`` with
        sections 'experiment', 'conclusion', 'discussion', 'other_sections',
        'related_work', 'abstract', 'introduction' and, per section, the kinds
        'body', 'overall', 'figures', 'figure_captions', 'tables',
        'table_captions', 'note'. Every string is lower-cased, de-hyphenated
        at line breaks, and has each element's text followed by one space.
    """
    content_kinds = ('body', 'overall', 'figures', 'figure_captions',
                     'tables', 'table_captions', 'note')
    labelled_sections = ('experiment', 'conclusion', 'discussion', 'other_sections',
                         'related_work', 'abstract', 'introduction')

    section_labeling = {'overall': ''}
    for name in labelled_sections:
        section_labeling[name] = {kind: '' for kind in content_kinds}

    # Depth at which each section type is currently open (0 = section,
    # 1 = subsection, ...); -1 means "not open".
    curr_section = {name: -1 for name in labelled_sections + ('reference',)}

    # Index in this list == nesting depth of the header tag.
    header_tags = ['sub' * depth + 'sectionHeader' for depth in range(4)]

    # ElementTree treats a trailing '/' in a path as '/*', i.e. this visits
    # every direct child of each <variant> element.
    for element in tree_root.iterfind('variant/'):
        tag = element.tag

        if 'Header' in tag:
            # Unrecognised '...Header...' tags are handled as depth 4, i.e.
            # below every known header level.
            depth = header_tags.index(tag) if tag in header_tags else 4

            # Close every section open at this depth or deeper.
            for sec in curr_section:
                if curr_section[sec] >= depth:
                    curr_section[sec] = -1

            # element.text is None for an element without text content.
            heading_type = assign_section_heading_type((element.text or '').lower())
            if heading_type == 'reference':
                continue

            # If a shallower section of the same type is still open, keep its
            # depth; otherwise open the type at the current depth.
            if curr_section[heading_type] < 0:
                curr_section[heading_type] = depth

        elif tag in tag_to_type_mapping:
            if element.text is None:
                # Nothing to record for an element without text.
                continue

            # Lower-case, re-join words hyphenated across line breaks, and
            # flatten remaining newlines into spaces.
            element_text = element.text.lower().replace('-\n', '').replace('\n', ' ').rstrip() + ' '
            section_labeling['overall'] += element_text

            named_section_open = any(
                level >= 0
                for name, level in curr_section.items()
                if name != 'other_sections'
            )
            content_kind = tag_to_type_mapping[tag]

            for sec, level in curr_section.items():
                if level < 0:
                    continue
                if sec == 'other_sections' and named_section_open:
                    continue
                section_labeling[sec]['overall'] += element_text
                section_labeling[sec][content_kind] += element_text

        # Every other tag (title, author, affiliation, page, equation,
        # reference, email, address, construct, ...) is ignored.

    return section_labeling

In [5]:
# Build {paper id: section labelling} from every XML file in xmls/.
section_labels = {}
# Sorted so the run is reproducible: os.listdir order is arbitrary, and when
# two files share the same 8-character id the later one overwrites the earlier.
for xml_name in sorted(os.listdir('xmls/')):
    xml_path = os.path.join('xmls/', xml_name)

    # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and anything
    # that is not a regular file; these cannot be parsed as XML.
    if xml_name.startswith('.') or not os.path.isfile(xml_path):
        continue

    root = ET.parse(xml_path).getroot()
    # The first 8 characters of the file name are used as the dictionary key.
    paper_id = xml_name[:8]

    for algorithm in root.iterfind("algorithm"):
        # .get() tolerates <algorithm> elements that carry no name attribute.
        if algorithm.get('name') == "SectLabel":
            section_labels[paper_id] = sectlabel(algorithm)

In [6]:
# Persist the per-paper section labelling. The with-block guarantees the file
# is flushed and closed even if pickling fails part-way.
os.makedirs("pickles_data", exist_ok=True)  # create the output directory on a fresh checkout
with open("pickles_data/section_labels.pkl", "wb") as pickle_file:
    pickle.dump(section_labels, pickle_file)