In [8]:
# Using NLTK package as a toolkit for working with NLP in Python. 
# It provides various text processing libraries with a lot of test datasets.
import nltk
#nltk.download()
import re
import codecs

# AlignedSent, Alignment and IBMModel1 from nltk.translate: NLTK's reference tools for testing word alignment between two language corpora with IBM Model 1.
from nltk.translate import AlignedSent, Alignment, IBMModel1

# defaultdict is a dict subclass that creates a default value the first time a missing key is accessed.
# It is used here to build nested (two-level) dictionaries.
from collections import defaultdict

import json # Extracting data from the .json files

# Keys under which each corpus entry stores the two sides of a sentence pair.
# NOTE(review): earlier comments named the languages inconsistently; these tags only
# have to match the keys present in FILE -- confirm the actual languages against the data.
source_language = 'ar' # Key of the source-side sentence in every corpus entry.
translation_language = 'tr' # Key of the translation-side sentence in every corpus entry.
FILE = 'am_tr_pair_small.json' # Data file with the sentence pairs (relative path).
iterations_num = 2 # Number of iterations to run the EM algorithm.

# Translation-side words that are excluded from the probability table.
# Despite the name, the list holds common function words, not only prepositions.
# Matching is case-sensitive; each word is listed once.
englishPreposition = ["a", "the", "is", "it", "be", "of", "off", "they", "has", "then", "been"]

def fileOpener(FILEZ):
    """Read the UTF-8 encoded JSON file at ``FILEZ`` and return the parsed content."""
    with open(FILEZ, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def word_tag(corpus):
    """Collect the vocabulary of every tag that occurs in ``corpus``.

    ``corpus`` is an iterable of dicts mapping a tag to a sentence string.
    Each sentence is split on whitespace and its words are gathered into one
    set per tag.

    Returns a ``defaultdict(set)`` mapping tag -> set of words. A tag whose
    sentences contain no words at all gets no entry.
    """
    vocabulary = defaultdict(set)
    for entry in corpus:
        for tag, sentence in entry.items():
            tokens = sentence.split()
            # Only touch the mapping when there is something to add, so that
            # empty sentences do not create empty vocabulary entries.
            if tokens:
                vocabulary[tag].update(tokens)
    return vocabulary

def ibm1_EM(word_tags):
    """Estimate word translation probabilities with the IBM Model 1 EM loop.

    Parameters
    ----------
    word_tags : list of dict
        Parallel corpus. Every entry must provide a sentence string under the
        ``source_language`` and ``translation_language`` keys; sentences are
        tokenised with ``str.split()``.

    Returns
    -------
    defaultdict(dict)
        ``table[source_word][translation_word]`` -> probability for every
        source word and every translation word that is not listed in
        ``englishPreposition``. The table is the result of ``iterations_num``
        EM iterations; when ``iterations_num`` is below 1 the uniform initial
        table is returned.

    Raises
    ------
    KeyError
        If a corpus entry lacks one of the two language keys.
    """
    vocabulary = word_tag(word_tags)
    source_vocab = vocabulary[source_language]
    target_vocab = vocabulary[translation_language]

    # Translation words in this collection never enter the model.
    ignored = set(englishPreposition)
    # Keep the vocabulary's iteration order so every row is filled in the same key order.
    content_targets = [word for word in target_vocab if word not in ignored]

    # Tokenise once; the translation side is already filtered here.
    sentences = [
        (pair[source_language].split(),
         [word for word in pair[translation_language].split() if word not in ignored])
        for pair in word_tags
    ]

    # Uniform initialisation: every entry starts at 1 / (source vocabulary size).
    table = defaultdict(dict)
    if source_vocab:
        uniform = 1 / len(source_vocab)
        for e in source_vocab:
            for f in content_targets:
                table[e][f] = uniform

    iteration = 1
    while iteration <= iterations_num:
        counts = defaultdict(float)                    # expected count per (e, f) pair
        totals = dict.fromkeys(content_targets, 0.0)   # expected count per translation word

        # E-step: distribute each source word over the translation words of its sentence.
        for es, fs in sentences:
            if not fs:
                continue  # nothing to align against; also avoids creating empty rows
            for e in es:
                row = table[e]
                norm = 0
                for f in fs:
                    norm += row[f]
                if norm == 0:
                    continue  # no probability mass to share out for this word
                for f in fs:
                    share = row[f] / norm
                    counts[(e, f)] += share
                    totals[f] += share

        # M-step: re-estimate every entry of the (dense) table.
        for f in content_targets:
            denominator = totals[f]
            for e in source_vocab:
                if denominator:
                    table[e][f] = counts.get((e, f), 0.0) / denominator
                else:
                    # f only occurs in pairs without source words: no evidence at all.
                    table[e][f] = 0.0

        iteration += 1

    return table

def maximum_prob(current_prob):
    """Pick the most probable translation word for every source word.

    Parameters
    ----------
    current_prob : dict of dict
        ``current_prob[source_word][translation_word]`` -> probability.

    Returns
    -------
    dict
        source word -> translation word with the highest probability in that
        word's own row. When several candidates share the highest value, the
        one that comes last in the row wins. A row that is empty (or whose
        values are all negative) maps to ``""``. An empty table yields ``{}``.
    """
    prob_result = {}

    for word_source, row in current_prob.items():
        max_prob = 0.0
        max_translation_word = ""
        # Scan the word's own row, so rows may differ in the keys they hold.
        for word_translation, probability in row.items():
            if probability >= max_prob:  # ">=" keeps the last of equally good candidates
                max_prob = probability
                max_translation_word = word_translation
        prob_result[word_source] = max_translation_word

    return prob_result

def alignment(prob_result, word_tags):
    """Print the word alignment of every sentence pair.

    For each pair in ``word_tags`` one line is printed containing an ``"i-j "``
    item for every source word (position ``i``) whose chosen translation
    ``prob_result[word]`` occurs in the translation sentence; ``j`` is the
    position of its first occurrence. Source words without a match, or without
    an entry in ``prob_result``, are left out. Each line is followed by an
    empty line.

    Parameters
    ----------
    prob_result : dict
        source word -> chosen translation word.
    word_tags : list of dict
        Sentence pairs keyed by ``source_language`` and ``translation_language``.
    """
    for pair in word_tags:
        source_words = pair[source_language].split()
        translation_words = pair[translation_language].split()

        links = []
        for i, word_source in enumerate(source_words):
            # .get(): words missing from prob_result are skipped instead of raising KeyError.
            chosen = prob_result.get(word_source)
            if chosen in translation_words:
                # index() returns the first occurrence in the translation sentence.
                links.append("{}-{} ".format(i, translation_words.index(chosen)))

        print("".join(links), end="")
        print("\n")

def JsonSaver(jsonName,Content):
    """Write ``Content`` to the file ``jsonName`` as pretty-printed UTF-8 JSON.

    Keys are sorted, the indent is four spaces and non-ASCII characters are
    written as-is rather than escaped. An existing file is overwritten.
    """
    # Serialise first, so a serialisation error cannot leave a truncated file behind.
    text = json.dumps(Content, indent=4, sort_keys=True, ensure_ascii=False)
    # Built-in open() replaces the legacy codecs.open(); newline='\n' keeps the
    # line endings untranslated, so the bytes written stay the same on every platform.
    with open(jsonName, 'w', encoding='utf8', newline='\n') as f:
        f.write(text)

def amharic_english():
    """Run the complete pipeline on the corpus stored in ``FILE``.

    Steps: load the sentence pairs, estimate the translation table with
    ``ibm1_EM``, pick the best translation per source word with
    ``maximum_prob``, save both results as JSON and print the alignments.
    """
    parallel_corpus = fileOpener(FILE)

    translation_table = ibm1_EM(parallel_corpus)
    best_translation = maximum_prob(translation_table)

    # Persist the per-word winners first, then the full probability table.
    outputs = (
        ("AbebeBesoBela max probability.json", best_translation),
        ("AbebeBesoBela statistics overall.json", translation_table),
    )
    for file_name, payload in outputs:
        JsonSaver(file_name, payload)

    alignment(best_translation, parallel_corpus)

# Main function

In [9]:
def main():
    """Notebook entry point: run the complete pipeline via amharic_english()."""
    amharic_english() # Load the corpus, train, save the results and print the alignments.

main()

0-2 1-2 2-2 3-2 4-2 5-2 

1-11 2-11 3-11 4-11 5-11 6-5 7-11 8-11 

0-2 1-2 2-2 3-2 4-2 5-2 

0-2 1-1 2-1 3-0 4-1 5-1 6-1 7-1 

0-4 1-2 2-2 3-2 4-2 5-2 

0-0 1-2 3-2 4-2 5-2 

0-2 1-2 2-2 3-2 

0-0 1-0 2-1 

0-0 1-0 2-1 

0-0 1-2 2-2 

0-0 1-1 2-1 

0-0 1-2 2-1 

0-0 1-0 2-1 

0-0 1-2 2-1 

0-0 1-2 2-1 

