# IBM 1 and 2 

f is the source language

e is the target language


data representation: (source, target, alignment)


In [1]:
import random
import numpy as np
import math
import time
from aer import *
import codecs
from abc import ABC
import pickle as cPickle
import os
from scipy.special import psi, gammaln


# READING

TRAINING_DIRECTORY = 'data/training/'
TRAINING_ENGLISH_FILENAME = 'hansards.36.2.e'
TRAINING_FRENCH_FILENAME = 'hansards.36.2.f'
TRAINING_PAIR_FILENAME = 'training_pairs'

VALIDATION_DIRECTORY = 'data/validation/'
VALIDATION_ENGLISH_FILENAME = 'dev.e'
VALIDATION_FRENCH_FILENAME = 'dev.f'
VALIDATION_PAIR_FILENAME = 'validation_pairs'
VALIDATION_ALIGNMENTS_FILENAME = 'dev.wa.nonullalign'

TEST_DIRECTORY = 'data/testing/'
TEST_ENGLISH_FILENAME = 'test/test.e'
TEST_FRENCH_FILENAME = 'test/test.f'
TEST_ALIGNMENTS_FILENAME = 'answers/test.wa.nonullalign'

# WRITING

OUTPUT_DIR = "deliverables/"

TEST_ALIGNMENTS_OUTPUT_IBM1 = "ibm1.mle.naacl"
TEST_ALIGNMENTS_OUTPUT_IBM1B = "ibm1.vb.naacl"
TEST_ALIGNMENTS_OUTPUT_IBM2 = "ibm2.mle.naacl"



## Read data

In [2]:
def get_AER(predictions, test):
    metric = AERSufficientStatistics()

    for gold, pred in zip(test, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)

    return metric.aer()

# todo add test
def read_data(english_file, french_file, save = 0):
    with open(english_file) as f:
        sentences_english = f.read().splitlines()
    with open(french_file) as f:
        sentences_french = f.read().splitlines()

    paired = []
    for i, sentence_english in enumerate(sentences_english):
        paired.append([("null " + sentence_english).split(" ")[0:-1],
                       sentences_french[i].split(" ")[0:-1]])
    
    if save:
        cPickle.dump(training_data, open(str(TRAINING_DIRECTORY + TRAINING_PAIR_FILENAME), "wb"))
        cPickle.dump(validation_data, open(str(VALIDATION_DIRECTORY + VALIDATION_PAIR_FILENAME), "wb"))

    return paired

def get_validation_alignments(path = VALIDATION_DIRECTORY + VALIDATION_ALIGNMENTS_FILENAME):
    validation_alignments = read_naacl_alignments(path)
    return validation_alignments

def get_vocabulary_size( data):
    frenchWords = []
    for pair in (data):
        for word in pair[1]:
            frenchWords.append(word)
    return len(frenchWords)

def save_as_naacl(alignments, model_name):
    converted = ""
    for i, sentence in enumerate(alignments):
        sentence = sorted(list(sentence))
        for alignment in sentence:
            converted += str(i+1) + ' ' + str(alignment[0]) + ' ' + str(alignment[1]) + ' ' + 'S' + '\n'

    filepath = OUTPUT_DIR + model_name
    
    file = open(filepath, "w")
    file.write(converted)
    return 

## IBM base


In [3]:
class IBM_base(ABC):
    UNIFORM_INIT = "uniform"
    RANDOM_INIT = "random"

    init_method = None
    model_name = None
    t = []
    train_data = []
    val_data = []
    val_alignments = []

    def train(self):
        pass

    def get_alignments(self):
        pass

    def evaluate(self):
        pass
    
    def calculate_log_likelihood(self):
        pass
    
    def evaluate_train_epoch(self):
        pass

    def set_t(self, t):
        self.t = t
        
    def empty_init(self, train_data):
        t = {}
        for i, pair in enumerate(train_data):
            for english_word in pair[0]:
                if english_word not in t:
                    t[english_word] = {}
                for french_word in pair[1]:
                    t[english_word][french_word] = 0

        return t

    def uniform_init(self, t):
        # Uniform init of the translation probabilities
        new_t = {}

        for key in t:
            new_t[key] = {}
            vocab_size = len((t[key].keys()))

            for sec_key in t[key]:
                new_t[key][sec_key] = 1.0 / vocab_size
        
        self.t = new_t

    def random_init(self, t):
        # Random init of the translation probabilities
        new_t = {}
        
        for key in t:
            new_t[key] = {}

            for sec_key in t[key]:
                new_t[key][sec_key] = random.random()

            normalizer = sum(new_t[key].values())
            new_t[key] = {k: v / normalizer for k, v in new_t[key].items()}
        
        self.t = new_t
            
    def init_empty_english_counts(self, t):
        empty_counts = {}
        english_empty_counts = {}

        for key in t:
            empty_counts[key] = {}
            english_empty_counts[key] = 0.0
            for sec_key in t[key]:
                empty_counts[key][sec_key] = 0.0

        return english_empty_counts, empty_counts




## IBM1

In [4]:
class IBM1(IBM_base):

    def __init__(self, train_data, val_data, val_alignments, init_method="uniform"):
        self.init_method = init_method
        self.model_name = "IBM1"
        self.train_data = train_data
        self.val_data = val_data
        self.val_alignments = val_alignments
        
        t = self.empty_init(train_data)
        
        if self.init_method == self.UNIFORM_INIT:
            self.uniform_init(t)
        elif self.init_method == self.RANDOM_INIT:
            self.random_init(t)
        else:
            print("Invalid init method, defaulting to uniform")
            self.init_method = self.UNIFORM_INIT
            self.uniform_init(t)

    def get_alignments(self, val_pairs, t):
        """Get the predicted alignments on sentence pairs from a trained ibm model 1 or 2"""
        alignments = []
        for k, val_pair in enumerate(val_pairs):
            alignments.append(set())
            for j, french_word in enumerate(val_pair[1]):
                max_prob = 0.0
                alignment = 0

                for i, english_word in enumerate(val_pair[0]):
                    if english_word in t:
                        if french_word in t[english_word]:
                            align_prob = t[english_word][french_word]

                    if align_prob > max_prob:
                        max_prob = align_prob
                        alignment = i
                if alignment is not 0:
                    alignments[k].add((alignment, j + 1))

        return alignments

    def evaluate_train_epoch(self, t):
        predictions = self.get_alignments(self.val_data, t)

        aer = get_AER(predictions, self.val_alignments)

        return aer
    
    def evaluate(self, t, data, alignments):
        predictions = self.get_alignments(data, t)
        
        aer = get_AER(predictions, alignments)
        
        return aer
        
    def calculate_log_likelihood(self, t):
        log_likelihood = 0
        alignments  = self.get_alignments(self.train_data, t)
        
        for k, alignment in enumerate(alignments):
            prob = 0
            e = self.train_data[k][0]
            f = self.train_data[k][1]
            
            for j, i in alignment:
                prob += np.log(t[e[j]][f[i - 1]])
            
            log_likelihood += prob
        
        return log_likelihood / len(self.train_data)
        
    def train(self, treshold, aer_epochs_treshold = 5):
        print("Started training " + self.model_name)

        log_likelihood = []
        aers = []
        
        t = self.t
        best_t = t

        number_of_sentences = len(self.train_data)
        min_aer = float('inf')
        epoch = 0

        english_empty_counts, empty_counts = self.init_empty_english_counts(t)
        # computing empty counts
        empty_counts = {}
        english_empty_counts = {}
        for key in t:
            empty_counts[key] = {}
            english_empty_counts[key] = 0.0
            for secKey in t[key]:
                empty_counts[key][secKey] = 0.0


        converged = False

        while not converged:
            start = time.time()
            log_like = 0
            epoch += 1

            counts = empty_counts
            english_counts = english_empty_counts

            # Expectation - step
            for pair in self.train_data:

                for j, french_word in enumerate(pair[1]):
                    normalizer = 0.0
                    for i, english_word in enumerate(pair[0]):
                        normalizer += t[english_word][french_word]

                    for i, english_word in enumerate(pair[0]):
                        delta = t[english_word][french_word] / normalizer
                        counts[english_word][french_word] += delta
                        english_counts[english_word] += delta
            
            # Maximization - step 
            for english_word in t:
                for french_word in t[english_word]:
                    t[english_word][french_word] = counts[english_word][french_word] / english_counts[english_word]

            
            log_likelihood.append(self.calculate_log_likelihood(t))

            aer = self.evaluate_train_epoch(t)
            aers.append(aer)

            if aer < min_aer:
                min_aer = aer
                best_t = t

            if epoch > aer_epochs_treshold:
                if len(log_likelihood) > 1:
                    diff = log_likelihood[-1] - log_likelihood[-2]
                    if diff < treshold:
                        converged = True

            end = time.time()
            print("epoch: ", epoch, " aer: ", aer, " loglikelihood: ", log_likelihood[-1], " time: ", end - start)

        self.t = t

        return t, best_t


## IBM2


In [5]:
class IBM2(IBM_base):
    IBM1_INIT = "ibm1_init"

    a = []

    def __init__(self, train_data, val_data, val_alignments, init_method="uniform", t=dict):
        self.init_method = init_method
        self.model_name = 'IBM2'
        self.train_data = train_data
        self.val_data = val_data
        self.val_alignments = val_alignments

        if init_method == self.UNIFORM_INIT:
            t = self.empty_init(train_data)
            self.uniform_init(t)
        elif init_method == self.RANDOM_INIT:
            t = self.empty_init(train_data)
            self.random_init(t)
        elif init_method == self.IBM1_INIT:
            self.set_t(t)
        else:
            print("Invalid init method, defaulting to uniform init")
            self.init_method = self.UNIFORM_INIT
            self.uniform_init(t)

    def set_a(self, a):
        self.a = a

    def init_a(self):
        ''' Initialize voger count parameter vector'''
        a_counts = {}
        a = {}

        for train_pair in self.train_data:
            I = len(train_pair[0])
            J = len(train_pair[1])

            for i, eng_word in enumerate(train_pair[0]):
                for j, french_word in enumerate(train_pair[1]):
                    a_counts[self.a_index(i, j, I, J)] = 0.0

        if self.init_method == self.UNIFORM_INIT:
            length = len(a_counts.keys())
            for key in a_counts:
                a[key] = 1.0 / length
        else:
            for key in a_counts:
                a[key] = random.random()

            normalizer = sum(a.values())
            a = {k: v / normalizer for k, v in a.items()}
            
        return a, a_counts


    def a_index(self, i, j, I, J):
        # get a index count
        return math.floor(i - (j + 1.0) * I / J)


    def get_alignments(self, val_pairs, t, a=dict):
        """Get the predicted alignments on sentence pairs from a trained ibm model 1 or 2"""
        alignments = []
        for k, val_pair in enumerate(val_pairs):
            alignments.append(set())
            I = len(val_pair[0])
            J = len(val_pair[1])

            for j, french_word in enumerate(val_pair[1]):
                max_prob = 0.0
                align_prob = float('-inf')
                alignment = 0

                for i, english_word in enumerate(val_pair[0]):
                    if english_word in t:
                        if french_word in t[english_word]:
                            align_prob = t[english_word][french_word] * a[self.a_index(i, j, I, J)]

                    if align_prob > max_prob:
                        max_prob = align_prob
                        alignment = i
                if alignment is not 0:
                    alignments[k].add((alignment, j + 1))

        return alignments
    
    def calculate_log_likelihood(self, t, a):
        log_likelihood = 0
        alignments  = self.get_alignments(self.train_data, t, a)
        
        for k, alignment in enumerate(alignments):
            prob = 0
            e = self.train_data[k][0]
            f = self.train_data[k][1]
            I = len(e)
            J = len(f)
            
            for i,j in alignment:
                a_index = self.a_index(i ,j - 1,I,J)
                a_value = a[a_index]
                prob += np.log(t[e[i]][f[j - 1]] * a_value)
            
            log_likelihood += prob
        
        return log_likelihood / len(self.train_data)

    def evaluate_train_epoch(self, t, a):
        predictions = self.get_alignments(self.val_data, t, a)

        aer = get_AER(predictions, self.val_alignments)

        return aer
    
    def evaluate(self, t, a, data, alignments):
        predictions = self.get_alignments(data, t, a)
        
        aer = get_AER(predictions, alignments)
        
        print("Test AER " + self.model_name + "-" + self.init_method + ": " + str(aer))
        print("")
        
        return aer

    def train(self, treshold, aer_epochs_treshold = 5):
        print("Started training " + self.model_name + "-" + self.init_method)
    
        log_likelihood = []
        aers = []

        t = self.t
        best_t = self.t
        a, a_counts = self.init_a()
        best_a = a

        number_of_sentences = len(self.train_data)
        min_aer = float('inf')
        epoch = 0

        english_empty_counts, empty_counts = self.init_empty_english_counts(t)

        converged = False

        while not converged:
            start = time.time()
            log_like = 0
            epoch += 1

            counts = empty_counts
            english_counts = english_empty_counts

            # Expectation - step
            for train_pair in self.train_data:
                I = len(train_pair[0])
                J = len(train_pair[1])


                for j, french_word in enumerate(train_pair[1]):
                    normalizer = 0.0

                    for i, english_word in enumerate(train_pair[0]):
                        a_index = self.a_index(i, j, I, J)
                        normalizer += t[english_word][french_word] * a[a_index]

                    for i, english_word in enumerate(train_pair[0]):
                        a_index = self.a_index(i, j, I, J)
                        delta = a[a_index] * t[english_word][french_word] / normalizer

                        a_counts[a_index] += delta
                        counts[english_word][french_word] += delta
                        english_counts[english_word] += delta

            # Maximization - step
            for english_key in t:
                for french_key in t[english_key]:
                    t[english_key][french_key] = counts[english_key][french_key] / english_counts[english_key]

            normalizer = sum(a_counts.values())
            a = {k: v / normalizer for k, v in a_counts.items()}

            log_likelihood.append(self.calculate_log_likelihood(t,a))

            aer = self.evaluate_train_epoch(t, a)
            aers.append(aer)

            if aer < min_aer:
                min_aer = aer
                best_a = a
                best_t = t

            if epoch > aer_epochs_treshold:
                if len(log_likelihood) > 1:
                    diff = log_likelihood[-1] - log_likelihood[-2]
                    if diff < treshold:
                        converged = True

            end = time.time()
            print("epoch: ", epoch, " aer: ", aer, " loglikelihood: ", log_likelihood[-1], " time: ", end - start)
        
        self.t = t
        self.a = a
        
        return t, a, best_t, best_a


## IBM Variational Bayes

In [6]:
class IBM1_bayesian(IBM_base):
    BAYESIAN_INIT = "bayesian"

    u = []
    alpha = 0
    french_words = 0

    def __init__(self, train_data, val_data, val_alignments, french_words = 0, alpha=0.0005, init_method="bayesian"):
        self.init_method = init_method
        self.model_name = 'IBM1_bayesian'
        self.train_data = train_data
        self.val_data = val_data
        self.val_alignments = val_alignments
        self.alpha = alpha
        self.french_words = french_words

        t = self.empty_init(train_data)

        if self.init_method == self.BAYESIAN_INIT:
            self.bayes_init(t)


    def bayes_init(self,t):
        trans = {}
        unseen = {}

        for key in t:
            trans[key] = {}
            unseen[key] = 1.0 / self.french_words

            for sec_key in t[key]:
                trans[key][sec_key] = 1.0 / self.french_words

        self.t = trans
        self.u = unseen

    def bayesian_maximization(self, counts, normalizer):
        return psi(counts + self.alpha) - psi(normalizer + self.alpha * self.french_words)

    def evaluate_train_epoch(self, t, u):
        predictions = self.get_alignments(self.val_data, t, u)

        aer = get_AER(predictions, self.val_alignments)

        return aer
    
    def evaluate(self, t, u, data, alignments):
        predictions = self.get_alignments(data, t, u)
        
        aer = get_AER(predictions, alignments)
        
        print("Test AER " + self.model_name + "-" + str(self.alpha) + ": " + str(aer))
        print("")
        
        return aer

    def train(self, treshold, aer_epochs_treshold = 5):
        print("Started training " + self.model_name + "-" + str(self.alpha))

        log_likelihood = []
        aers = []
        min_aer = float('inf')
        epoch = 0

        t = self.t
        u = self.u
        best_t = self.t
        best_u = self.u

        number_of_sentences = len(self.train_data)

        converged = False

        english_empty_counts, empty_counts = self.init_empty_english_counts(t)

        while not converged:
            start = time.time()
            epoch += 1
            log_like = 0

            counts = empty_counts
            english_counts = english_empty_counts

            for train_pair in self.train_data:
                for j, french_word in enumerate(train_pair[1]):
                    normalizer = 0.0

                    for i, english_word in enumerate(train_pair[0]):
                        normalizer += t[english_word][french_word]

                    log_like += np.log(normalizer)

                    for i, english_word in enumerate(train_pair[0]):
                        delta = t[english_word][french_word] / normalizer
                        counts[english_word][french_word] += delta
                        english_counts[english_word] += delta

            for english_word in t:
                u[english_word] = self.bayesian_maximization(0, english_counts[english_word])
                for french_word in t[english_word]:
                    t[english_word][french_word] = self.bayesian_maximization(counts[english_word][french_word], english_counts[english_word])
            
            log_likelihood.append(self.calculate_log_likelihood(log_like, t, counts))

            aer = self.evaluate_train_epoch(t, u)
            aers.append(aer)

            if aer < min_aer:
                min_aer = aer
                best_u = u
                best_t = t

            if epoch > aer_epochs_treshold:
                if len(log_likelihood) > 1:
                    diff = log_likelihood[-1] - log_likelihood[-2]
                    if diff < treshold:
                        converged = True

            end = time.time()
            print("epoch: ", epoch, " aer: ", aer, " loglikelihood: ", log_likelihood[-1], " time: ", end - start)
        
        self.t = t
        self.u = u
        
        return t, u, best_t, best_u

    def get_alignments(self, val_pairs, t, u=dict):
        """Get the predicted alignments on sentence pairs from a trained ibm model 1 or 2"""
        alignments = []
        for k, val_pair in enumerate(val_pairs):
            alignments.append(set())
            I = len(val_pair[0])
            J = len(val_pair[1])

            for j, french_word in enumerate(val_pair[1]):
                max_prob = 0.0
                align_prob = float('-inf')
                alignment = 0

                for i, english_word in enumerate(val_pair[0]):
                    if english_word in t:
                        if french_word in t[english_word]:
                            align_prob = t[english_word][french_word]
                        else:
                            align_prob = u[english_word]
                    if align_prob > max_prob:
                        max_prob = align_prob
                        alignment = i
                if alignment is not 0:
                    alignments[k].add((alignment, j + 1))

        return alignments

    def calculate_log_likelihood(self, log_like, t, counts):
        alpha = self.alpha
        gamma_alpha = gammaln(alpha)

        for english_word, english_probs in t.items():
            lamb = 0

            for french_word, french_probs in english_probs.items():
                t[english_word][french_word] = np.exp(french_probs)
                count = counts[english_word][french_word]

                log_like += (french_probs * (-count) + gammaln(alpha + count) - gamma_alpha)
                lamb += count

            lamb += self.french_words * alpha
            log_like += gammaln(alpha * self.french_words) - gammaln(lamb)

        return log_like / len(self.train_data)



# IBM Models experiments

## Read in data and define global constants

In [7]:
AER_EPOCHS_TRESHOLD = 5

train_pairs = read_data(TRAINING_DIRECTORY + TRAINING_ENGLISH_FILENAME, TRAINING_DIRECTORY + TRAINING_FRENCH_FILENAME)
val_pairs = read_data(VALIDATION_DIRECTORY + VALIDATION_ENGLISH_FILENAME, VALIDATION_DIRECTORY + VALIDATION_FRENCH_FILENAME)
test_pairs = read_data(TEST_DIRECTORY + TEST_ENGLISH_FILENAME, TEST_DIRECTORY + TEST_FRENCH_FILENAME)

val_alignments = get_validation_alignments()
test_alignments = get_validation_alignments(TEST_DIRECTORY + TEST_ALIGNMENTS_FILENAME)

## IBM1 TRAINING

In [8]:
TRESHOLD = 1

ibm1 = IBM1(train_pairs, val_pairs, val_alignments)
t_ibm1, best_t_ibm1 = ibm1.train(TRESHOLD, AER_EPOCHS_TRESHOLD)

ibm1.evaluate(best_t_ibm1, test_pairs, test_alignments)

Started training IBM1
epoch:  1  aer:  0.37238095238095237  loglikelihood:  -70.10828042092986  time:  190.25556135177612
epoch:  2  aer:  0.3262955854126679  loglikelihood:  -55.27673877324344  time:  183.75935697555542
epoch:  3  aer:  0.31892411143131605  loglikelihood:  -49.65922203069056  time:  183.7144718170166
epoch:  4  aer:  0.3150816522574448  loglikelihood:  -46.65037438364726  time:  185.4859380722046
epoch:  5  aer:  0.3112391930835735  loglikelihood:  -44.79100360891826  time:  183.15603828430176
epoch:  6  aer:  0.3089509143407122  loglikelihood:  -43.48387748603292  time:  177.70188283920288
epoch:  7  aer:  0.3108758421559191  loglikelihood:  -42.50796801564615  time:  193.95269203186035


0.2966276668960771

## IBM2 TRAINING

In [9]:
TRESHOLD = 2
init_methods = ["ibm1_init", "uniform", "random1", "random2", "random3"]
results = {"uniform" : {}, "random1": {}, "random2": {}, "random3": {}, "ibm1_init": {}}


for init_method in init_methods:
    if "random" in init_method:
        random_iteration = init_method[-1]
        init_method = "random"
    else:
        random_iteration = ""
        
    ibm2 = IBM2(train_pairs, val_pairs, val_alignments, init_method, best_t_ibm1)
    t, a, best_t, best_a = ibm2.train(TRESHOLD, AER_EPOCHS_TRESHOLD)
    
    test_aer = ibm2.evaluate(best_t, best_a, test_pairs, test_alignments)

    results[init_method + random_iteration]['a'] = a
    results[init_method + random_iteration]['t'] = t
    results[init_method + random_iteration]['best_t'] = best_t
    results[init_method + random_iteration]['best_a'] = best_a
    results[init_method + random_iteration]['test_aer'] = test_aer

Started training IBM2-ibm1_init
epoch:  1  aer:  0.35838150289017345  loglikelihood:  -101.77492484493682  time:  385.15889501571655
epoch:  2  aer:  0.31160115052732507  loglikelihood:  -94.89890847527597  time:  358.75532817840576
epoch:  3  aer:  0.2545454545454545  loglikelihood:  -91.038803991066  time:  346.1818251609802
epoch:  4  aer:  0.25982742090124644  loglikelihood:  -88.47934333418031  time:  341.8149130344391
epoch:  5  aer:  0.2569511025886865  loglikelihood:  -86.61207143846023  time:  340.77851390838623
epoch:  6  aer:  0.25047984644913623  loglikelihood:  -85.26053332121208  time:  344.48517203330994
Test AER IBM2-ibm1_init: 0.22195395360869186

Started training IBM2-uniform
epoch:  1  aer:  0.2857142857142857  loglikelihood:  -133.66222045813393  time:  374.9105188846588
epoch:  2  aer:  0.24282982791586993  loglikelihood:  -107.83203042557234  time:  370.7394459247589
epoch:  3  aer:  0.23823246878001925  loglikelihood:  -97.63920258332907  time:  371.2373161315918

## IBM Bayesian TRAINING

In [10]:
TRESHOLD = 1
french_words = get_vocabulary_size(train_pairs)
results = {}

for alpha in [0.0005, 0.005, 0.05]:
    results[alpha] = {}
    
    ibm1_bayesian = IBM1_bayesian(train_pairs, val_pairs, val_alignments, french_words, alpha)
    t_bayesian, u_bayesian, best_t_bayesian, best_u_bayesian = ibm1_bayesian.train(TRESHOLD, AER_EPOCHS_TRESHOLD)
    test_aer = ibm1_bayesian.evaluate(best_t_bayesian, best_u_bayesian, test_pairs, test_alignments)
    
    results[alpha]["t"] = t_bayesian
    results[alpha]["u"] = u_bayesian
    results[alpha]["best_t_bayesian"] = best_t_bayesian
    results[alpha]["best_u_bayesian"] = best_u_bayesian
    results[alpha]["test_aer"] = test_aer
    

Started training IBM1_bayesian-0.0005
epoch:  1  aer:  0.37845567206863684  loglikelihood:  -440.2746923840363  time:  179.78928422927856
epoch:  2  aer:  0.3621517771373679  loglikelihood:  -265.80112017365946  time:  248.16629004478455
epoch:  3  aer:  0.35796545105566224  loglikelihood:  -255.21125918693096  time:  244.26867389678955
epoch:  4  aer:  0.35700575815738966  loglikelihood:  -254.06164601621165  time:  249.41818284988403
epoch:  5  aer:  0.3541266794625719  loglikelihood:  -255.0378803136446  time:  252.44468903541565
epoch:  6  aer:  0.35158501440922185  loglikelihood:  -256.73724036676913  time:  229.5512731075287
Test AER IBM1_bayesian-0.0005: 0.32210254200775523

Started training IBM1_bayesian-0.005
epoch:  1  aer:  0.41522157996146436  loglikelihood:  -336.80185148632097  time:  178.47593903541565
epoch:  2  aer:  0.39633558341369335  loglikelihood:  -177.51317332334577  time:  308.02882194519043
epoch:  3  aer:  0.3876567020250723  loglikelihood:  -171.797370320308

## Train best models + NAACL generation 

### IBM1 

In [12]:
alignments = ibm1.get_alignments(test_pairs, best_t_ibm1)
save_as_naacl(alignments, TEST_ALIGNMENTS_OUTPUT_IBM1)


### IBM2

In [13]:
TRESHOLD = 2

ibm2 = IBM2(train_pairs, val_pairs, val_alignments, "uniform")
t, a, best_t, best_a = ibm2.train(TRESHOLD, AER_EPOCHS_TRESHOLD)

alignments = ibm2.get_alignments(test_pairs, best_t, best_a)
save_as_naacl(alignments, TEST_ALIGNMENTS_OUTPUT_IBM2)


Started training IBM2-uniform
epoch:  1  aer:  0.2857142857142857  loglikelihood:  -133.66222045813393  time:  386.3716850280762
epoch:  2  aer:  0.24282982791586993  loglikelihood:  -107.83203042557234  time:  356.21409368515015
epoch:  3  aer:  0.23823246878001925  loglikelihood:  -97.63920258332907  time:  370.06926012039185
epoch:  4  aer:  0.2336538461538461  loglikelihood:  -92.61763210678565  time:  391.10724210739136
epoch:  5  aer:  0.23173076923076918  loglikelihood:  -89.55901670904626  time:  377.58235716819763
epoch:  6  aer:  0.2288461538461538  loglikelihood:  -87.51451239126011  time:  371.52227568626404
epoch:  7  aer:  0.2283236994219653  loglikelihood:  -86.00957334626911  time:  361.2159032821655


### IBM1 Bayesian

In [14]:
TRESHOLD = 1
french_words = get_vocabulary_size(train_pairs)

ibm1_bayesian = IBM1_bayesian(train_pairs, val_pairs, val_alignments, french_words, 0.0005)
t_bayesian, u_bayesian, best_t_bayesian, best_u_bayesian = ibm1_bayesian.train(TRESHOLD, AER_EPOCHS_TRESHOLD)

alignments = ibm1_bayesian.get_alignments(test_pairs, best_t_bayesian, best_u_bayesian)
save_as_naacl(alignments, TEST_ALIGNMENTS_OUTPUT_IBM1B)

Started training IBM1_bayesian-0.0005
epoch:  1  aer:  0.37845567206863684  loglikelihood:  -440.2746923840363  time:  180.38955330848694
epoch:  2  aer:  0.3621517771373679  loglikelihood:  -265.80112017365946  time:  230.85887098312378
epoch:  3  aer:  0.35796545105566224  loglikelihood:  -255.21125918693096  time:  247.99844408035278
epoch:  4  aer:  0.35700575815738966  loglikelihood:  -254.06164601621165  time:  272.77419424057007
epoch:  5  aer:  0.3541266794625719  loglikelihood:  -255.0378803136446  time:  252.38278603553772
epoch:  6  aer:  0.35158501440922185  loglikelihood:  -256.73724036676913  time:  247.5081343650818


### IBM1 LL, AER TEST

In [15]:
test_aer_1_log = ibm1.evaluate(t, test_pairs, test_alignments)
test_aer_1_aer = ibm1.evaluate(best_t_ibm1, test_pairs, test_alignments)

### IBM2 LL, AER TEST

In [16]:
test_aer_2_log = ibm2.evaluate(t, a, test_pairs, test_alignments)
test_aer_2_aer = ibm2.evaluate(best_t, best_a, test_pairs, test_alignments)


Test AER IBM2-uniform: 0.20706065587955347

Test AER IBM2-uniform: 0.20706065587955347



### IBM 1 B, AER TEST

In [17]:
test_aer_b_ll = ibm1_bayesian.evaluate(t_bayesian, u_bayesian, test_pairs, test_alignments)
test_aer_b_aer = ibm1_bayesian.evaluate(best_t_bayesian, best_u_bayesian, test_pairs, test_alignments)


Test AER IBM1_bayesian-0.0005: 0.32210254200775523

Test AER IBM1_bayesian-0.0005: 0.32210254200775523

