## Import library

In [1]:
import xml.etree.ElementTree as ET
import re
from os import listdir

below are the regexs needed for the parsing process:

In [2]:
topic_files = listdir('./topics/')
# The regex used to capture the words segements
p_pattern = r'#id\((\w+\.\w\.\w+)\)\.{2}id\((\w+\.\w\.\w+)\)'
s_pattern = r'#id\((\w+\.\w\.\w+)\)'
two_pattern = r'#id\((\w+\.\w\.\w+?)(\d+)\)\.{2}id\((\w+\.\w\.\w+?)(\d+)\)'
one_pattern = r'#id\((\w+\.\w\.\w+?)(\d+)\)'
no_pattern = '\w+\.\w\.\w+?(\d+)'
name_space = {'nite':'http://nite.sourceforge.net/'}
words_pattern = '(.+?)words'
topic_pattern = '\w+\.?'

## Scenario:
- A linebreak after each < nite: child > tag of the topic files.
- A linebreak after each segment.
- 10 stars after each root topic .


## The parsing process:
The parsing process contains mainly three phases:
1. Get the bounds of topics and < nite:child >, using "./topics/*.xml"
2. Slicing each nite:child into segments, using "./segments/*.xml"
3. Find exact word in word files and generate the txt files, using "./words/*.xml"

The parsing method for one single transcript is implemented in the below:

In [3]:
def parse_transcript(topic):
    topic_tree = ET.parse(topic)
    topic_root = topic_tree.getroot()
    topic_name = re.match(topic_pattern,topic_root.attrib['{http://nite.sourceforge.net/}id']).group()
    word_segments = []
    #1. Get the bounds of topics and < nite:child 
    #get word segments without the paragraph bounds
    for child in topic_root.findall('.//nite:child',name_space):
        try:
            word_segments.append(list(re.search(p_pattern,child.attrib['href']).groups()))
        except:
            word_segments.append(list(re.search(s_pattern,child.attrib['href']).groups()))
    #get topic bounds
    bounds = set()
    for topic in topic_root.findall('topic'):
        try:
            matched = re.search(p_pattern,topic.findall('.//nite:child[last()]',name_space)[-1].attrib['href'])
            bounds.add(matched.groups())
        except:
            matched = re.search(s_pattern,topic.findall('.//nite:child[last()]',name_space)[-1].attrib['href'])
            bounds.add(matched.groups())        
    #add topical bounds
    for seg in word_segments:
        if tuple(seg) in bounds:
            seg.append("**********")
    #2. get paragraph bounds,using the segments.xml files as input
    seg_file_pattern1 = r'\w{7}\.\w\.'
    seg_file_pattern2 = r'\w{6}\.\w\.'
    wseg_with_parapgraph = []
    for seg in word_segments:
        if len(seg) == 1:
            wseg_with_parapgraph.append(seg)
        else:# the length of segment is 2 or 3, both 2 and 3 can indicate its a topical boundary
            try:
                file_name = re.match(seg_file_pattern1, seg[0]).group(0) + "segments.xml"
            except:
                file_name = re.match(seg_file_pattern2, seg[0]).group(0) + "segments.xml"
            tree =  ET.parse('./segments/'+file_name)
            root = tree.getroot()
            paras = []
            op_no = int(re.search(no_pattern,seg[0]).group(1))
            try:
                ed_no = int(re.search(no_pattern,seg[1]).group(1))#get start and end number of word in the seg
                for segment in root.findall('.//nite:child', name_space):#get paragraph bounds
                    if op_no > int(re.search(one_pattern,segment.attrib['href']).group(2)):
                        try:
                            #if the seg is totally in the segment
                            if ed_no < int(re.search(two_pattern,segment.attrib['href']).group(4)):
                                paras.append(seg[0] + ',' + seg[1])
                                break
                            elif ed_no > int(re.search(two_pattern,segment.attrib['href']).group(4)):#ed_no > end word_no of the segment
                                if op_no > int(re.search(two_pattern,segment.attrib['href']).group(4)):
                                    pass
                                elif op_no < int(re.search(two_pattern,segment.attrib['href']).group(4)):#op seg < end segment
                                    paras.append(seg[0] + ',' + re.search(p_pattern,segment.attrib['href']).group(2))
                                else:#end segment = op_no
                                    paras.append(seg[0])
                            else:#end_no = end segment
                                paras.append(seg[0] + ',' + seg[1])
                                break
                        except:# the segment contains only one word,don't include
                            pass
                    elif op_no == int(re.search(one_pattern,segment.attrib['href']).group(2)):
                        try:
                            if ed_no == int(re.search(two_pattern,segment.attrib['href']).group(4)):
                                paras.append(seg[0] + ',' + seg[1])
                                break                       
                            if ed_no > int(re.search(two_pattern,segment.attrib['href']).group(4)):#segment is totally in seg, add the whole segment
                                paras.append(','.join(re.search(p_pattern, segment.attrib['href']).groups()))
                            else:#end_no < end_segemnt,so we can call an end for this iteration  
                                paras.append(seg[0] + ',' + seg[1])
                                break               
                        except:#only one word in the segment,and it equals op_no
                            paras.append(seg[0])
                    else:#op_no is < the op_segment,then 2 situations
                        try:
                            if ed_no <= int(re.search(two_pattern,segment.attrib['href']).group(4)):
                                paras.append(re.search(s_pattern,segment.attrib['href']).group(1) + ',' + seg[1])
                                break
                            else:#ed_no > end segment
                                paras.append(','.join(re.search(p_pattern, segment.attrib['href']).groups()))
                        except:#only one word and is between interval of seg, add
                            if int(re.search(one_pattern,segment.attrib['href']).group(2)) < ed_no:
                                paras.append(re.search(s_pattern,segment.attrib['href']).group(1))
                            elif int(re.search(one_pattern,segment.attrib['href']).group(2)) == ed_no:
                                paras.append(seg[1])
                                break
                #if len(paras) == 0:
                    #paras.append(seg[0] + ',' + seg[1])
                if seg[-1] == '**********':
                    paras.append('**********')
                wseg_with_parapgraph.append(paras)                
            except:#indicates that though of length 2, this seg is single plus "***"
                wseg_with_parapgraph.append(seg)
                
    #3. start of writing txt process,using the words.xml files as input        
    with open('./txt_files/'+ topic_name + 'txt','w') as f:
        for e in wseg_with_parapgraph:
            for s in e:
                if s == '**********':
                    f.write(s + '\n')
                else:   
                    word_file = './words/'+ re.match(words_pattern,s).group() + '.xml'
                    tree = ET.parse(word_file)
                    root = tree.getroot()
                    sentence = s.split(',')
                    if len(sentence) == 1:# even single word sentence will be length of 2 here, 1 may mean vocal which can be dropped
                        try:
                            f.write(' ' + root.find('./w[@nite:id="{}"]'.format(sentence[0]),name_space).text + '\n')
                        except:
                            pass
                    else:# length is 2, have start and end word
                        start = sentence[0]
                        end = sentence[1]
                        no_word = 0
                        begin = False
                        for word in root.findall('*'):
                            if begin:
                                if word.text:
                                    no_word += f.write(' ' + word.text) 
                                else:
                                    pass
                            if word.attrib['{http://nite.sourceforge.net/}id'] == start:
                                begin = True                               
                                if word.text:
                                    no_word = f.write(' ' + word.text) 
                                else:
                                    pass
                            if word.attrib['{http://nite.sourceforge.net/}id'] == end:      
                                if no_word:
                                    f.write('\n')
                                break

## Run the whole process

In [4]:
%%time
for file in sorted(topic_files):
    file = './topics/' + file
    parse_transcript(file)

CPU times: user 7min 1s, sys: 2.42 s, total: 7min 4s
Wall time: 7min 4s


## Conclusion
1. When using element tree for xml parsing, it needs close attention when facing sub-element
2. Attension should be paid to xml with namesapce
3. Re module is handy tool for extracting info from strings