In [104]:
from bs4 import BeautifulSoup
import re
import os

In [110]:
# Location of the TMX data to convert. May be either a directory (every
# file in it whose name ends with '.tmx' is processed) or a single .tmx file.
# The commented-out assignments are alternative inputs kept for quick switching.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
#input_path = "/Users/alp/Documents/TWB/data/Hausa/TWB-TMs/Kato_TM.zip Folder/tmx"
input_path = "/Users/alp/Documents/TWB/data/Hausa/TWB-TMs/TMX Memsource"
#input_path = '/Users/alp/Documents/TWB/data/Hausa/TWB-TMs/Kato_TM.zip Folder/tmx/13326.tmx'

In [121]:
# `xml:lang` values accepted as the source / target side of a translation unit.
# Matching is an exact, case-sensitive membership test, which is why both
# 'en-GB' and 'en-gb' are listed.
src_lang_tags = ['en-US', 'en-GB', 'en-gb']
tgt_lang_tags = ['ha-HAU', 'ha-NG', 'ha']

In [112]:
# Destination files for the extracted parallel text: one sentence per line,
# with line i of the source file aligned to line i of the target file.
# Both files are overwritten when the final cell runs.
src_output_path = "/Users/alp/Documents/TWB/data/Hausa/TWB-TMs/Memsource_TMX.en"
tgt_output_path = "/Users/alp/Documents/TWB/data/Hausa/TWB-TMs/Memsource_TMX.ha"

In [115]:
# Resolve `input_path` into the list of TMX files to process.
if os.path.isdir(input_path):
    # os.listdir() returns entries in arbitrary order; sort so that the
    # extracted corpus has the same line order on every run and machine.
    tmx_paths = sorted(
        os.path.join(input_path, name)
        for name in os.listdir(input_path)
        if name.endswith('.tmx')
    )
elif os.path.splitext(input_path)[1] == '.tmx':
    tmx_paths = [input_path]
else:
    # Fail here with a clear message instead of printing and then hitting a
    # NameError on the undefined `tmx_paths` below.
    raise ValueError("Input path needs to be a directory or a TMX file: %r" % input_path)

print("%i files"%len(tmx_paths))

7 files


In [117]:
def clean_sent(text):
    """Flatten a TMX segment to a single line of plain text.

    Line feeds, carriage returns and the literal ``&#10;`` character
    reference are deleted, each tab becomes one space, and finally anything
    that looks like an inline tag (``<...>``, shortest match) is stripped.

    Returns the cleaned string.
    """
    # Applied in this order, before tag stripping, so that a tag which was
    # split over several lines is joined up and still matched by the regex.
    substitutions = (
        ("\n", ''),
        ("&#10;", ''),
        ("\r", ''),
        ("\t", ' '),
    )
    for needle, substitute in substitutions:
        text = text.replace(needle, substitute)

    return re.sub('<.*?>', '', text)

In [118]:
def get_doc_lang_tags(parsed_xml_content, src_lang_tags, tgt_lang_tags):
    """Determine which language tags a TMX document uses for source and target.

    Only the first two ``<tuv>`` elements of the document are inspected; one
    of their ``xml:lang`` values must be in `src_lang_tags` and the other in
    `tgt_lang_tags` (in either order).

    Parameters:
        parsed_xml_content: parsed document exposing ``findAll("tuv")``, whose
            results can be indexed by attribute name (``tuv['xml:lang']``).
        src_lang_tags: collection of accepted source-language tags.
        tgt_lang_tags: collection of accepted target-language tags.

    Returns:
        ``(doc_src_tag, doc_tgt_tag)`` on success, or ``("", "")`` when the
        tags cannot be determined (fewer than two ``<tuv>`` elements, missing
        ``xml:lang``, identical tags, or tags not in the accepted lists).
        A diagnostic is printed for the identical/unrecognized cases.
    """
    segments = parsed_xml_content.findAll("tuv")
    try:
        seg0_tag = segments[0]['xml:lang']
        seg1_tag = segments[1]['xml:lang']
    except (IndexError, KeyError):
        # IndexError: fewer than two <tuv> elements.
        # KeyError: a <tuv> without an xml:lang attribute.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return "", ""

    if seg0_tag == seg1_tag:
        print("First two segments have the same language tag")
        return "", ""

    if seg0_tag in src_lang_tags and seg1_tag in tgt_lang_tags:
        return seg0_tag, seg1_tag
    if seg1_tag in src_lang_tags and seg0_tag in tgt_lang_tags:
        return seg1_tag, seg0_tag

    # Something's wrong: report every tag that is in neither accepted list.
    for tag in (seg0_tag, seg1_tag):
        if tag not in src_lang_tags and tag not in tgt_lang_tags:
            print("Unrecognized language tag", tag)

    return "", ""


In [122]:
# Extract aligned source/target sentences from every TMX file.
# `src_sents[i]` and `tgt_sents[i]` always belong to the same translation unit.
src_sents = []
tgt_sents = []
no_processed_docs = 0

for tmx_path in tmx_paths:
    print("Processing", os.path.basename(tmx_path))

    doc_src_sents = []
    doc_tgt_sents = []
    doc_problems = 0

    # Read raw bytes and let BeautifulSoup pick the encoding from the BOM /
    # XML declaration, rather than decoding with the platform default
    # (TMX exports are commonly UTF-8 or UTF-16). The file is closed before
    # parsing starts.
    with open(tmx_path, "rb") as f:
        content = f.read()

    soup = BeautifulSoup(content, features="lxml")

    doc_src_tag, doc_tgt_tag = get_doc_lang_tags(soup, src_lang_tags, tgt_lang_tags)
    if not doc_src_tag or not doc_tgt_tag:
        print("\tCouldn't get language tags from document", os.path.basename(tmx_path))
        continue

    for segment in soup.findAll("tu"):
        try:
            # Both sides are looked up before anything is appended, so a
            # failure on either side skips the whole unit and keeps the two
            # lists aligned.
            src_text = segment.find("tuv", {"xml:lang": doc_src_tag}).seg.text
            tgt_text = segment.find("tuv", {"xml:lang": doc_tgt_tag}).seg.text
        except AttributeError:
            # find() returned None (no <tuv> for that language) or the <tuv>
            # has no <seg>. Use .get() because <tu> elements are not
            # guaranteed to carry a tuid attribute.
            print("\tProblem at segment", segment.get('tuid', '<no tuid>'))
            doc_problems += 1
            continue

        doc_src_sents.append(clean_sent(src_text))
        doc_tgt_sents.append(clean_sent(tgt_text))

    # Defensive sanity check; should never trigger given the paired appends.
    if not len(doc_src_sents) == len(doc_tgt_sents):
        print("\t# sentences don't match in file ", tmx_path)
        continue

    print("\tExtracted %i segments"%len(doc_src_sents), end="")
    if doc_problems:
        print(" (Found %i problems)"%doc_problems)
    else:
        print("")

    src_sents.extend(doc_src_sents)
    tgt_sents.extend(doc_tgt_sents)

    no_processed_docs += 1

print("\nProcessed %i documents. %i segments in total"%(no_processed_docs, len(src_sents)))

Processing UNICEF Nigeria IOM ECHO_Working-1478647.tmx
	Extracted 451 segments
Processing Save the Children Nigeria IOM_ECHO_Working-1478585.tmx
	Extracted 202 segments
Processing Green Concern for Development (GREENCODE) DFID Nigeria_Working-1496258.tmx
	Extracted 4 segments
Processing WASH sector Nigeria ECHO_Working-1478664.tmx
	Extracted 291 segments
Processing International Rescue Committee (IRC) - Nigeria - DFID_Working-1478518.tmx
	Extracted 335 segments
Processing Intersos DFID Nigeria_Working-1488860.tmx
	Extracted 10 segments
Processing IOM Nigeria ECHO_IOM_Working-1478498.tmx
	Extracted 1071 segments


Processed 7 documents. 2364 segments in total


In [120]:
# Write the parallel corpus: one sentence per line, line i of the source file
# aligned with line i of the target file.
# zip() would silently truncate to the shorter list, so refuse to write a
# misaligned corpus.
if len(src_sents) != len(tgt_sents):
    raise ValueError(
        "Source/target sentence counts differ: %i vs %i" % (len(src_sents), len(tgt_sents))
    )

# Explicit UTF-8: the platform default encoding may be unable to represent
# Hausa letters such as ɓ, ɗ and ƙ.
with open(src_output_path, 'w', encoding='utf-8') as f_src, \
        open(tgt_output_path, 'w', encoding='utf-8') as f_tgt:
    for s, t in zip(src_sents, tgt_sents):
        f_src.write(s + "\n")
        f_tgt.write(t + "\n")