# In [10]:
import xml.etree.ElementTree as ET

import os

# NOAD sense id -> WordNet sense string; populated by load_sense_map().
noad2wn = {}

# Lemmas whose occurrences are never recorded as ambiguous positions.
excluding_list = {'do', 'be'}

def load_sense_map(map_filename='../data/manual_map.txt'):
    """Load NOAD-to-WordNet sense mappings from a file into ``noad2wn``.

    Every non-blank line must hold exactly two whitespace-separated fields:
    a NOAD sense followed by the WordNet sense string mapped to it.  The
    WordNet field is stored unsplit, exactly as it appears in the file.
    An entry already present in ``noad2wn`` for the same NOAD sense is
    overwritten, so the order in which map files are loaded matters.

    Args:
        map_filename: path of the mapping file to read.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    # ``with`` guarantees the file is closed even if a malformed line raises.
    with open(map_filename, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            noad, wn = fields
            noad2wn[noad] = wn
    
def output_break(break_str):
    """Translate a break-level name into the separator text to emit.

    "SPACE_BREAK" and "NO_BREAK" both map to a single space; every other
    value (such as "SENTENCE_BREAK" or "LINE_BREAK") maps to a newline.
    """
    if break_str in ("SPACE_BREAK", "NO_BREAK"):
        return " "
    return "\n"
    
def _write_sentence(out_f, sentence, hot_positions):
    """Write one sentence followed by the list of its ambiguous positions.

    Output layout::

        <sentence text>
        <number of hot positions>
        #<word index> <lemma> <pos> <wordnet sense>
        ...

    Args:
        out_f: text-mode file object opened for writing.
        sentence: alternating separators and token texts; newline
            separators are dropped so the sentence stays on one line.
        hot_positions: tuples of (word index, lemma, pos, wordnet sense).
    """
    out_f.write(u"".join(w for w in sentence if w != '\n'))
    out_f.write(u"\n%d" % len(hot_positions))
    for index, lemma, pos, wn_sense in hot_positions:
        out_f.write(u"\n#%d %s %s %s" % (index, lemma, pos, wn_sense))
    out_f.write(u"\n")


def transform_corpus(in_filename, out_filename):
    """Convert a sense-annotated XML corpus into the plain-text format.

    Each child element of the XML root is treated as one token and must
    carry ``break_level`` and ``text`` attributes.  A token whose break
    level maps to a newline (see ``output_break``) starts a new sentence.
    A token is recorded as a "hot position" when it has a ``sense``
    attribute that is a key of ``noad2wn`` and its ``lemma`` is not in
    ``excluding_list``.  Only sentences with at least one hot position are
    written; the recorded word index is the 0-based token count since the
    start of the sentence.

    Args:
        in_filename: path of the XML file to read.
        out_filename: path of the text file to create (UTF-8 encoded).
    """
    import io

    # Parse first so that a broken input does not leave an empty output file.
    root = ET.parse(in_filename).getroot()

    hot_positions = []
    sentence = []
    word_index = 0

    # Text is written through a UTF-8 text-mode file rather than as
    # pre-encoded bytes, which a text-mode file rejects on Python 3.
    with io.open(out_filename, 'w', encoding='utf-8') as out_f:
        for child in root:
            a = child.attrib
            separator = output_break(a['break_level'])

            # before going to the next sentence, flush the current one
            if separator == '\n':
                if hot_positions:
                    _write_sentence(out_f, sentence, hot_positions)

                # clean up
                word_index = 0
                hot_positions = []
                sentence = []

            sentence.append(separator)
            sentence.append(a['text'])

            if ('sense' in a
                    and a['lemma'] not in excluding_list
                    and a['sense'] in noad2wn):
                hot_positions.append(
                    (word_index, a['lemma'], a['pos'], noad2wn[a['sense']]))

            word_index += 1

        # handle the last sentence, which no later token flushes
        if hot_positions:
            _write_sentence(out_f, sentence, hot_positions)
    
if __name__ == '__main__':
    # Fill the NOAD -> WordNet table from both mapping files. The manual map
    # is read first, so the algorithmic map overwrites it on shared keys.
    for map_path in ('../data/manual_map.txt', '../data/algorithmic_map.txt'):
        load_sense_map(map_path)

    # Convert every XML document found in the semcor folder.
    semcor_dir = '../data/semcor/'
    for filename in os.listdir(semcor_dir):
        if not filename.endswith(".xml"):
            continue
        print(filename)
        # the output name is the part of the file name before its first dot
        stem = filename.split('.')[0]
        transform_corpus(semcor_dir + filename, '../data/semcor_txt/' + stem + '.txt')