In [14]:
from itertools import islice

def write_to_file(filename, data):
    """Write homonym groups to ``filename``, one group per line.

    Every group in ``data`` that holds at least two words is written as a
    single line with its words separated by "/". Groups with fewer than
    two words are skipped. The file is created or truncated.
    """
    with open(filename, "w") as out:
        for group in data:
            # a homonym group needs at least two members to be meaningful
            if len(group) >= 2:
                out.write("/".join(group) + '\n')
            
            
            
            
def _clean_token(token):
    """Strip surrounding whitespace, periods, commas and double quotes from a token."""
    # The exact order of the strips is kept on purpose: it is NOT equivalent
    # to a single strip(' .,"') for tokens such as '"."'.
    return token.strip().strip(".").strip().strip(",").strip().strip("\"").strip().strip(",").strip()


def _aligned_pairs(target_sentence, source_sentence):
    """Yield (source_word, target_phrase) for every usable alignment of one record."""
    target_list = target_sentence.strip().split()
    # "NULL ({ }) This ({ 1 }) is ({ 2 })" -> ["NULL ({", " This ({ 1", " is ({ 2"]
    source_list = source_sentence.strip().strip("})").strip().split(" })")

    for align in source_list:
        source_word, idx = align.split("({")
        source_word = _clean_token(source_word)

        # skip the NULL alignment, UNKNOWN words and words that are nothing
        # but punctuation (an empty entry would glue unrelated groups together)
        if source_word == "NULL" or source_word == "<UNKNOWN>" or not source_word:
            continue

        # also skip unaligned words ... ({ })
        positions = idx.split()
        if not positions:
            continue

        if len(positions) > 1:
            # 1 -> N alignment    ({ 2 5 6 }): build the target N-gram
            target_word = ""
            rejected = False
            for raw in positions:
                assert raw.isdigit(), "align idx must be integer !"
                token = target_list[int(raw) - 1]  # GIZA++ indices are 1-based

                # do not use strange alignments, e.g. "hello" / "The. Hello and"
                if source_word == _clean_token(token):
                    rejected = True
                    break

                # no space before clitics / hyphen parts, e.g. "I'll", "zero-based"
                if token[0] != "'" and token[0] != "-":
                    target_word += " "
                target_word += token
            if rejected:
                continue
            target_word = _clean_token(target_word)
        else:
            # 1 - 1 alignment  ({ 4 })
            assert positions[0].isdigit(), "align idx must be integer !"
            target_word = _clean_token(target_list[int(positions[0]) - 1])

            # only different words are interesting
            if source_word == target_word:
                continue

        if not target_word:
            continue

        yield source_word, target_word


def _merge_pair(groups, pair):
    """Add ``pair`` to ``groups`` in place, fusing every group that shares a word with it."""
    hits = [group for group in groups if not group.isdisjoint(pair)]
    if not hits:
        groups.append(set(pair))
        return

    keeper, extras = hits[0], hits[1:]
    keeper.update(pair)
    if extras:
        # the pair bridges several groups: collapse them into the first one
        for extra in extras:
            keeper.update(extra)
        extra_ids = {id(extra) for extra in extras}
        groups[:] = [group for group in groups if id(group) not in extra_ids]


def parse_giza_output(source_file, target_file):
    """Extract homonym groups from a GIZA++ alignment file and save them.

    The input is read in records of three lines, for example::

        # Sentence pair (1) source length 4 target length 4 alignment score : 0.0999093
        This is fully overloaded.
        NULL ({ }) This ({ 1 }) is ({ 2 }) fully ({ 3 }) overloaded. ({ 4 })

    Line 2 is the target sentence and line 3 lists each source word with the
    1-based positions of the target tokens it is aligned to. Every source
    word whose aligned target word (or N-gram) differs from it forms a pair.
    Pairs that share a word are merged into one group, so the resulting
    groups are disjoint. Groups are passed to ``write_to_file``.

    Args:
        source_file: path of the GIZA++ alignment output to parse.
        target_file: path of the file the homonym groups are written to.

    Raises:
        AssertionError: if an alignment index is not a non-negative integer.
        IndexError: if an alignment index points past the target sentence.
        ValueError: if an alignment entry does not contain exactly one "({".
    """
    list_of_sets = []

    # parse giza++ output
    with open(source_file, 'r') as giza:
        print("parsing source file: {} ... ".format(source_file), end="")

        while True:
            # header, target sentence, alignment line
            record = list(islice(giza, 3))

            # an incomplete record means we reached the end of the document
            if len(record) < 3:
                break

            for source_word, target_word in _aligned_pairs(record[1], record[2]):
                _merge_pair(list_of_sets, {source_word, target_word})

    print("OK\nsaving result to: {} ... ".format(target_file), end="")
    # writing part
    write_to_file(target_file, list_of_sets)
    print("OK")
    

In [15]:

# Usage: parse the GIZA++ alignment file "word_alignments_human_asr" and
# write the extracted homonym groups to "giza-homonyms.txt".
parse_giza_output("word_alignments_human_asr", "giza-homonyms.txt")

parsing source file: word_alignments_human_asr ... OK
saving result to: giza-homonyms.txt ... OK
