# IBM Model 1 with Expectation Maximization

## Author: Anmol Sharma

**Open the files**

In [1]:
from __future__ import print_function
import itertools, sys
import numpy as np

In [3]:
# Announce progress on stderr so it does not mix with the alignment output on stdout.
sys.stderr.write('Opening files..\n')
# Both sides of the parallel corpus are opened in binary mode, so every line
# read later is a raw byte string. The handles are consumed by the
# preprocessing cell that follows.
src_set = open('./data/europarl.de', mode='rb')
des_set = open('./data/europarl.en', mode='rb')

Opening files..


**Perform Preprocessing**

1. **Split the data into two different sets, and split each sentence into words**
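2. **Add a NONE (NULL) token to every English sentence** (note: the preprocessing cell below only tokenises the sentences; it does not yet insert this token)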

In [4]:
# Tokenised corpus: one list of whitespace-separated tokens per sentence.
src_sent, dest_sent = [], []

# Walk both files in lock-step; zip() stops at the end of the shorter file,
# so the two lists always have the same length.
for target_line, source_line in zip(des_set, src_set):
    dest_sent.append(target_line.split())
    src_sent.append(source_line.split())

We can see that some words contain sequences such as "\xc3\xa4"; these are simply the UTF-8 byte encodings of accented characters (e.g. German umlauts). Nothing to worry about.

Also, the punctuation marks are left as they are, as "words" which map directly to the punctuation in the destination language.

In [5]:
# Show a small slice of the tokenised source sentences (header and sample in one call).
print('Source sentences..', src_sent[5:10], sep='\n', file=sys.stderr)

Source sentences..
[['Ich', 'bitte', 'Sie', ',', 'sich', 'zu', 'einer', 'Schweigeminute', 'zu', 'erheben', '.'], ['(', 'Das', 'Parlament', 'erhebt', 'sich', 'zu', 'einer', 'Schweigeminute', '.', ')'], ['Frau', 'Pr\xc3\xa4sidentin', ',', 'zur', 'Gesch\xc3\xa4ftsordnung', '.'], ['Wie', 'Sie', 'sicher', 'aus', 'der', 'Presse', 'und', 'dem', 'Fernsehen', 'wissen', ',', 'gab', 'es', 'in', 'Sri', 'Lanka', 'mehrere', 'Bombenexplosionen', 'mit', 'zahlreichen', 'Toten', '.'], ['Zu', 'den', 'Attentatsopfern', ',', 'die', 'es', 'in', 'j\xc3\xbcngster', 'Zeit', 'in', 'Sri', 'Lanka', 'zu', 'beklagen', 'gab', ',', 'z\xc3\xa4hlt', 'auch', 'Herr', 'Kumar', 'Ponnambalam', ',', 'der', 'dem', 'Europ\xc3\xa4ischen', 'Parlament', 'erst', 'vor', 'wenigen', 'Monaten', 'einen', 'Besuch', 'abgestattet', 'hatte', '.']]


In [6]:
# Show the matching slice of the tokenised destination sentences.
print('Destination sentences..', dest_sent[5:10], sep='\n', file=sys.stderr)

Destination sentences..
[['Please', 'rise', ',', 'then', ',', 'for', 'this', 'minute', "'", 's', 'silence', '.'], ['(', 'The', 'House', 'rose', 'and', 'observed', 'a', 'minute', "'", 's', 'silence', ')'], ['Madam', 'President', ',', 'on', 'a', 'point', 'of', 'order', '.'], ['You', 'will', 'be', 'aware', 'from', 'the', 'press', 'and', 'television', 'that', 'there', 'have', 'been', 'a', 'number', 'of', 'bomb', 'explosions', 'and', 'killings', 'in', 'Sri', 'Lanka', '.'], ['One', 'of', 'the', 'people', 'assassinated', 'very', 'recently', 'in', 'Sri', 'Lanka', 'was', 'Mr', 'Kumar', 'Ponnambalam', ',', 'who', 'had', 'visited', 'the', 'European', 'Parliament', 'just', 'a', 'few', 'months', 'ago', '.']]


## **We need to find the probability $t_k(f_i|e_j)$ where $f_i$ = source word and $e_j$ = destination word**

Find all the unique words in the source (German) and destination (English) data

In [7]:
def _unique_tokens(sentences):
    """Return the distinct tokens found in ``sentences`` as a list.

    ``sentences`` is an iterable of token lists. The sentences are flattened
    into a single stream of tokens, de-duplicated through a set, and handed
    back as a list so they can be indexed and sliced. The order of the
    resulting list is whatever order the set yields.
    """
    flattened = itertools.chain.from_iterable(sentences)
    return list(set(flattened))


src_vocab = _unique_tokens(src_sent)
des_vocab = _unique_tokens(dest_sent)

In [8]:
# Peek at the first few entries of the source vocabulary.
print('Some unique source words..', src_vocab[0:5], sep='\n', file=sys.stderr)

Some unique source words..
['L\xc3\xa4use', 'Sonderweg', 'verschiedentlich', 'Handschlag', 'Portwein']


In [9]:
# Peek at the first few entries of the destination vocabulary.
print('Some unique destination words..', des_vocab[0:5], sep='\n', file=sys.stderr)

Some unique destination words..
['gai', 'deferment', 'Pronk', 'woods', 'hanging']


# Start the training process..

**We cannot eagerly initialize every $t_k$ value to the uniform probability because of memory constraints. A better way is to check whether a key exists and, if it doesn't, initialize it to the uniform probability on first use. This avoids the huge memory and computational overhead of enumerating all $(f_i, e_j)$ pairs and setting them to uniform, many of which never co-occur in the training text.**

In [None]:
# IBM Model 1 training with Expectation-Maximization.
#
# t_k        : (source word, destination word) -> translation probability t(f|e),
#              filled lazily for word pairs that co-occur in a sentence pair.
# count_comb : expected count of each (f, e) pair for the current epoch.
# count_e    : expected count of each destination word e for the current epoch.
k = 0  # not used by this cell; kept in case code outside this cell reads it
t_k = {}
count_comb = {}
count_e = {}
# Uniform starting probability over the source vocabulary.
uni_prob = 1.0 / len(src_vocab)
epochs = 5

for _i in range(epochs):
    print('Currently on training epoch {}..'.format(_i+1), file=sys.stderr)

    # Expected counts belong to a single EM iteration: start every epoch from
    # zero, otherwise counts collected under older (worse) parameters leak
    # into this epoch's M-step.
    count_comb = {}
    count_e = {}

    # E-step: iterate over all training examples
    for src_sent_eg, dest_sent_eg in zip(src_sent, dest_sent):
        for f_i in src_sent_eg:
            # Normaliser: total probability of f_i over every destination
            # word in this sentence. Unseen pairs are initialised on the fly
            # to the uniform probability.
            Z = 0.0
            for e_j in dest_sent_eg:
                Z += t_k.setdefault((f_i, e_j), uni_prob)

            # Nothing to distribute (empty destination sentence, or every
            # probability is zero) -- skip rather than divide by zero.
            if Z == 0.0:
                continue

            for e_j in dest_sent_eg:
                pair = (f_i, e_j)
                # Fractional count: share of f_i attributed to e_j.
                c = t_k[pair] / Z
                # Counts are created on the fly, starting from zero.
                count_comb[pair] = count_comb.get(pair, 0.0) + c
                count_e[e_j] = count_e.get(e_j, 0.0) + c

    # M-step: re-estimate t(f|e) = count(f, e) / count(e)
    print('Updating t_k counts...', file=sys.stderr)
    for f_e_keys in count_comb:
        # f_e_keys[0] = f_i, f_e_keys[1] = e_j
        t_k[f_e_keys] = count_comb[f_e_keys] / count_e[f_e_keys[1]]

Currently on training epoch 1..


# Make predictions using this trained model..

In [None]:
# Decode: for every source word pick the destination word with the highest
# translation probability and emit one line of "i-j" pairs per sentence pair,
# where i is the source position and j the chosen destination position.
print('Aligning...', file=sys.stderr)
print('Source | Destination', file=sys.stderr)
for src_sent_eg, dest_sent_eg in zip(src_sent, dest_sent):
    # With an empty destination sentence there is nothing to align to; the
    # newline is still written so output lines stay in step with the corpus.
    if dest_sent_eg:
        for i, f_i in enumerate(src_sent_eg):
            bestp = 0
            bestj = 0
            for j, e_j in enumerate(dest_sent_eg):
                # Pairs missing from the table are treated as probability 0.
                p = t_k.get((f_i, e_j), 0.0)
                if p > bestp:
                    bestp = p
                    # Remember the destination *position*, not the word.
                    bestj = j
            sys.stdout.write('{}-{} '.format(i, bestj))
    sys.stdout.write('\n')