## Parse the .xml documents from OpenSubtitles
This notebook is used for parsing OpenSubtitles 2018.

1. We have 446612 subtitle .xml files in total, and approximately 90% of them have a duration between half an hour and three hours. Files that are too short may be trailers or song lyrics (e.g. 666); files that are too long may be lectures.
2. The subtitles are stored in the following format:


    `<s id = 'NUM'>`: Each `<s>` tag should contain one sentence spoken by a single character.

    `<time id="T1S" value="00:00:00,668" /> <time id="T1E" value="00:00:02,918" />`: Each pair of `<time>` tags should delimit the subtitles shown on one screen.

    Generally: 
    Each `<s>` tag contains one pair of `<time>` tags;
    If one `<s>` tag contains more than one pair of `<time>` tags, the sentence by one character is too long and has been separated across several screens;
    If one pair of `<time>` tags spans more than one `<s>` tag, different sentences by more than one character are shown on one screen. These sentences often begin with the symbol '-'.


Pre-processing includes:
1. Download tokenized English subtitles from http://opus.nlpl.eu/OpenSubtitles2018.php;
2. Clean the data: replace \", \` with \'; replace n \'t with n\'t; replace in \' with ing; replace ing \' with ing;
3. Select movies with duration between 0.5 - 3 hours;
4. Find sentences in each `<s>` tag (concatenate sentences if in several `<s>` tags; split sentence with '- '); sentences ending with ',' will be considered part of the next sentence;
5. Expansion, delete brackets and special symbols, replace numbers with NUM, delete non-dialog lines (e.g. 'Season 1 Episode 1'), add <EOS\>;
6. Save the timestamp in seconds.


In [1]:
import xml.etree.ElementTree as ET
import operator
import os,re
from datetime import timedelta
import linecache

In [2]:
def time2sec(string):
    """Convert a subtitle time stamp string into seconds.

    All spaces are removed and the stamp is split on '.', ',', ';' and ':'.
    The first three fields are read as hours, minutes and seconds; when a
    fourth field is present it is read as milliseconds.

    Args:
        string: time stamp text, e.g. '00:00:02,918' or '01:30:00'.

    Returns:
        float: the time stamp expressed in seconds.

    Raises:
        ValueError: if a field is not numeric or fewer than three fields
            are present.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    fields = list(map(float, re.split(r'\.|,|;|:', string.replace(' ', ''))))
    if len(fields) < 3:
        raise ValueError('cannot parse time stamp: %r' % (string,))

    seconds = fields[2]
    if len(fields) > 3:
        # the format of time stamps is not unified, some of them carry milliseconds
        seconds += fields[3] / 1000
    return timedelta(hours=fields[0], minutes=fields[1], seconds=seconds).total_seconds()

In [3]:
def multiple_replace(dict_, text):
    """Replace every occurrence of a key of ``dict_`` found in ``text`` by its value.

    All keys are matched literally in a single left-to-right pass, so text
    produced by one replacement is not scanned again.

    Args:
        dict_: mapping from the literal string to search for to its replacement.
        text: the string to process.

    Returns:
        str: ``text`` with the replacements applied; ``text`` unchanged when
        ``dict_`` is empty.
    """
    if not dict_:
        # An empty alternation would match the empty string at every position
        # and the look-up below would then raise KeyError.
        return text
    # Create a regular expression from the (escaped) dictionary keys
    regex = re.compile("|".join(map(re.escape, dict_)))
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: dict_[mo.group(0)], text)

# Replacement table for multiple_replace: quote characters are mapped to an
# apostrophe and split contractions / clipped "-ing" endings are re-joined.
dict_sub = {
    '"': "'",
    '´': "'",
    "n 't": " n't",
    "in '": 'ing',
    "ing '": 'ing',
}

In [4]:
def OpenSubtitleSentence(file_path,dict_=dict_sub,min_duration=3600*0.5,max_duration=3600*3):
    """Extract cleaned sentences and their time stamps from one subtitle .xml file.

    Args:
        file_path: path of the .xml file to parse.
        dict_: replacement table handed to multiple_replace for every sentence.
        min_duration: shortest accepted document duration in seconds
            (default: half an hour).
        max_duration: longest accepted document duration in seconds
            (default: three hours).

    Returns:
        tuple: ``(s_list, time)`` where ``s_list`` holds the kept sentences and
        ``time`` holds every second collected time stamp (indices 0, 2, 4, ...)
        converted to seconds. Both are empty lists when the document has no
        usable ``<duration>`` entry or its duration is outside the accepted range.
    """
    # import data
    tree = ET.parse(file_path)
    root = tree.getroot()
    time_list = []
    s_list = []
    s_tmp = ''   # text accumulated for a sentence that spans several <s> tags
    flag_s = 0   # current <s> only opens a sentence (first and last time id end with 'S')
    flag_e = 0   # current <s> only closes a sentence (first and last time id end with 'E')
    flag_m = 0   # current <s> is a middle part (no time tag, or 'E' ... 'S')

    # get duration of input document; the last readable <duration> wins
    duration_time = None
    for sub in root.iter('subtitle'):
        for duration in sub.iter('duration'):
            if not duration.text:
                continue
            try:
                duration_time = time2sec(duration.text.split(',')[0])
            except (ValueError, IndexError):
                # unreadable duration text: treat it as missing
                continue
    # if the duration is unknown, too short or too long we ignore this document
    if duration_time is None or duration_time < min_duration or duration_time > max_duration:
        return [],[]

    for child in root:
        if child.tag != 's':
            continue

        # get the list of start and end time of each sentence,
        # stored flat as [id, value, id, value, ...]
        time_tmp = []
        for time_tag in child.iter(tag='time'):
            time_tmp.extend([time_tag.attrib['id'],time_tag.attrib['value']])

        if not time_tmp:
            flag_m = 1
        else:
            first_id, last_id = time_tmp[0], time_tmp[-2]
            if first_id.endswith('S') and last_id.endswith('E'):
                time_list.extend([time_tmp[1],time_tmp[-1]])
            elif first_id.endswith('E') and last_id.endswith('S'):
                flag_m = 1
            elif first_id.endswith('S') and last_id.endswith('S'):
                time_list.append(time_tmp[1])
                flag_s = 1
            elif first_id.endswith('E') and last_id.endswith('E'):
                time_list.append(time_tmp[-1])
                flag_e = 1

        # text of this <s> tag with all whitespace runs collapsed to one space
        a = ' '.join(''.join(child.itertext()).split())

        if flag_s:
            # sentence only starts here: remember it and wait for the rest
            s_tmp = a
            flag_s = 0
            continue

        if flag_m:
            # middle part: append to the pending sentence
            s_tmp = s_tmp + ' ' + a
            flag_m = 0
            continue

        if flag_e:
            # sentence ends here: prepend what was accumulated so far
            a = s_tmp + ' ' + a
            flag_e = 0
            s_tmp = ''

        a = multiple_replace(dict_, a)
        a = multiple_replace(dict_, a)
        a = a.replace('\\','')
        a = re.sub('ca n\'t','can n\'t',a)
        a = re.sub('Ca n\'t','Can n\'t',a)

        # remove brackets and contents in the brackets/special symbols
        a = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\♪.*?♪|\\#.*?#|\\=.*?=|\\¶.*?¶", "", a)
        a = re.sub('[0-9]+', '<NUM>', a) # replace specific number with NUM
        a = a.strip() # remove spaces before or after sentence

        is_non_dialog = re.search(r' Season.*?Episode |Subtitles|Subtitle | Episode ', a, re.IGNORECASE)
        has_punctuation = re.search(r'\.|,|\?|!|\'|\"', a)
        if not a or is_non_dialog or not has_punctuation:
            # skip null sentence; delete '- Season x Episode x -' or 'Subtitles by' lines;
            # delete non-dialog sentences, eg. titles of episode
            del time_list[-2:]
            continue

        s_list.append(a)

    # convert to seconds and keep every second stamp
    time_second = [time2sec(stamp) for stamp in time_list]
    time = time_second[::2]

    return s_list,time

In [5]:
def SaveSentence(save_path,s_,time_):
    """Write sentences with their time stamps to a text file, one utterance per line.

    If ``save_path`` already exists, the output goes to the first unused
    ``<stem>_<n>.txt`` (n = 1, 2, ...) where ``<stem>`` is ``save_path``
    without its extension. Every entry of ``s_`` is split on '- ' so that
    short dialogues placed in one sentence become separate lines, each
    written as ``<time>|<GO><text><EOS>``.

    Args:
        save_path: preferred output file path.
        s_: list of sentences.
        time_: list of time stamps; ``time_[i]`` belongs to ``s_[i]``.

    Returns:
        None
    """
    # splitext only strips the final extension; splitting on the first '.'
    # would cut the path at any dot in a directory name or a leading './'.
    stem = os.path.splitext(save_path)[0]
    new_path = save_path
    n = 1
    while os.path.exists(new_path):
        new_path = stem + '_' + str(n) + '.txt'
        n += 1

    # the with-block guarantees the file is flushed and closed
    with open(new_path, 'a', encoding='utf-8') as f:
        for i, sentence in enumerate(s_):
            tmp_time = str(time_[i])
            # to find if short dialogue in one sentence, beginning with '- '
            for s in filter(None, sentence.split('- ')):
                f.write(tmp_time + '|' + '<GO>' + s + '<EOS>\n')
    return

In [6]:
# Total number of subtitle files.
# The value was obtained by walking '/Volumes/Files/en/OpenSubtitles/xml/en/'
# with os.walk, appending every file path to folder_path + 'AllFilePath.txt'
# and counting the files; it is hard-coded here instead of being recomputed.
folder_path = "/Volumes/Files/en/OpenSubtitles/"
file_count = 446612
print(file_count)

446612


In [7]:
# the root data folder and the folder receiving the processed .txt files
source_dir = os.walk('/Users/yan/Documents/document/EPFL/MA2/semesterprj/datasets/OpenSubtitle/')
save_dir = '/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/processed_data/OpenSubtitle/'
i = 0  # number of files saved so far
# looping over all .xml files in that folder
try:
    for path,dir_list,file_list in source_dir:
        for file_name in file_list:
            if not file_name.endswith(".xml"):
                continue
            tmp_path = os.path.join(path,file_name)
            s_list,time_ = OpenSubtitleSentence(tmp_path)
            # Save the result only if s_list and time_ are both non-empty.
            # Truthiness is used instead of `len(a) & len(b)`: the bitwise AND
            # of two non-zero lengths can be 0 (e.g. 2 & 4), which silently
            # skipped valid files.
            if s_list and time_:
                save_path = save_dir + file_name.split('.')[0] + '.txt'
                SaveSentence(save_path,s_list,time_)
                i += 1
                if i%1000==0:print(i)
except KeyboardInterrupt:
    # allow the long-running loop to be interrupted by hand
    print("Stopped.")

Stopped.


---

In [25]:
# Count every file found under folderpath, sub-folders included.
folderpath = "/Volumes/Files/en/OpenSubtitles/txt/"
file_count = sum(len(filenames) for _, _, filenames in os.walk(folderpath))
print(file_count)


349313


---