# IBM 1 and 2, Variational Bayes

Throughout this notebook, `f` denotes the source language (French) and `e` the target language (English). Every sentence pair is stored as a tuple in the following order:
`(source, target)`, i.e. `(french_sentence, english_sentence)`

### 1. First we collect all imports and set the configuration (number of iterations and data paths)

In [None]:
from collections import Counter, defaultdict
from aer import read_naacl_alignments, AERSufficientStatistics
from scipy.special import digamma, loggamma

import random
import codecs
import math
import tqdm
import pprint
import numpy as np

# Number of training iterations.
# NOTE(review): S is not referenced anywhere else in the visible notebook; the
# training cells pass the literal 10 instead -- confirm whether S should be used.
S = 10

# Relative paths to the parallel corpus files: the ".e" files hold the English
# side and the ".f" files the French side of the training and validation sets.
training_english_path = "training/hansards.36.2.e"
training_french_path = "training/hansards.36.2.f"
validation_english_path = "validation/dev.e"
validation_french_path = "validation/dev.f"

### 2. Read in the data

In [None]:
def _load_tokenized(path):
    """Read a UTF-8 text file and split every line into whitespace-separated tokens."""
    with codecs.open(path, 'r', 'utf8') as handle:
        return [sentence.split() for sentence in handle.readlines()]


training_english = _load_tokenized(training_english_path)
training_french = _load_tokenized(training_french_path)

# Pair the sentences as (french, english) and put a NULL token in front of
# every English training sentence.
training_data = [
    (french, ["NULL"] + english)
    for french, english in zip(training_french, training_english)
]

validation_english = _load_tokenized(validation_english_path)
validation_french = _load_tokenized(validation_french_path)

# Validation pairs are stored as read, without a NULL token.
validation_data = list(zip(validation_french, validation_english))

### 3. The IBM1 model (trainable with EM or Variational Bayes)

In [None]:
class IBM1:
    """IBM Model 1 word-alignment model, trainable with EM or Variational Bayes.

    The model keeps a translation table ``t`` such that ``t[e][f]`` is the
    probability of the French word ``f`` given the English word ``e``.
    Sentence pairs are tuples ``(french_sentence, english_sentence)`` where
    each sentence is a list of tokens.
    """

    def __init__(self, training_data, validation_data, valid_align_path, training_iterations, training, alpha=-1):
        """Store the data and settings and initialise the translation table.

        Args:
            training_data: list of (french, english) token-list pairs used for training
            validation_data: list of (french, english) token-list pairs used by ``test``
            valid_align_path: path of the gold alignment file read by ``test``
            training_iterations: number of training iterations run by ``train``
            training: "VB" selects Variational Bayes; any other value selects EM
            alpha: Dirichlet prior parameter for VB; -1 means "not set", in which
                case a warning is printed and the updates fall back to EM
        """
        self.data = training_data
        self.validation_data = validation_data
        self.training_iterations = training_iterations
        self.valid_align_path = valid_align_path
        self.t = self.init_t()
        if training == "VB" and alpha == -1:
            print("Please reinitialise with an alpha value >= 0.")
        elif training == "VB":
            print("Initialised Variational Bayes model.")
        else:
            print("Initialised Expectation Maximisation model.")
        self.training = training
        self.alpha = alpha

    def train(self):
        """Train IBM1 for ``training_iterations`` rounds of EM or VB.

        Before every iteration the training log likelihood and the validation
        AER (via ``test``) are printed. Each iteration collects expected
        alignment counts over the training data and then re-estimates ``t``.
        """
        for s in range(self.training_iterations):
            ll = self.log_likelihood()
            print("Log likelihood: {}".format(ll))
            self.test()
            print("Iteration {}".format(s + 1))
            c_1 = defaultdict(Counter)  # expected counts of (e, f) links
            c_2 = Counter()             # expected counts of e, summed over f

            for f_sentence, e_sentence in tqdm.tqdm(self.data):
                for f in f_sentence:
                    scores = [self.t[e][f] for e in e_sentence]
                    sentence_prob = sum(scores)
                    # A zero (or NaN) normaliser carries no information and
                    # would otherwise raise a ZeroDivisionError.
                    if not sentence_prob > 0:
                        continue
                    for e, score in zip(e_sentence, scores):
                        delta = score / sentence_prob
                        c_1[e][f] += delta
                        c_2[e] += delta

            # after collecting the counts, re-estimate t
            self.update_t(c_1, c_2)

    def init_t(self):
        """Initialise the translation probabilities uniformly.

        For every English word ``e``, each distinct French word that co-occurs
        with it in a training pair gets the same probability.

        Returns:
            defaultdict(Counter) mapping e -> {f: probability}
        """
        # Sets avoid storing one list entry per token pair in the corpus.
        cooccurring = defaultdict(set)
        for f_sentence, e_sentence in self.data:
            french_types = set(f_sentence)
            if not french_types:
                continue
            for e in e_sentence:
                cooccurring[e].update(french_types)

        t = defaultdict(Counter)
        for e, french_words in cooccurring.items():
            uniform = 1.0 / len(french_words)
            for f in french_words:
                t[e][f] = uniform
        return t

    def update_t(self, c_1, c_2):
        """Re-estimate the translation probabilities in place from expected counts.

        With VB training (and alpha set) the update is
        ``exp(digamma(c_1[e][f] + alpha) - digamma(c_2[e] + len(c_1[e]) * alpha))``;
        otherwise it is the EM maximum-likelihood estimate ``c_1[e][f] / c_2[e]``.

        Args:
            c_1: mapping e -> {f: expected count of e and f being aligned}
            c_2: mapping e -> expected count of e, summed over all f
        """
        use_vb = self.training == "VB" and self.alpha != -1
        # Every co-occurring (e, f) pair appears exactly once in c_1, so one
        # pass over the counts replaces a pass over every token pair in the data.
        for e, row in c_1.items():
            total = c_2[e]
            if use_vb:
                normaliser = digamma(total + len(row) * self.alpha)
                for f, count in row.items():
                    self.t[e][f] = math.exp(digamma(count + self.alpha) - normaliser)
            else:
                # No expected mass for this word: keep its current estimates.
                if not total > 0:
                    continue
                for f, count in row.items():
                    self.t[e][f] = count / total

    def _prob(self, e, f):
        """Return t[e][f], or 0 for unseen words, without adding entries to ``t``."""
        row = self.t.get(e)
        if row is None:
            return 0
        return row.get(f, 0)

    def log_likelihood(self):
        """Calculate the log likelihood of the training data under the model.

        The sum over alignments factorises per French word, so each sentence
        contributes ``sum_f log(sum_e t[e][f]) - len(f) * log(1 + len(e))``.
        The sum is accumulated in log space so that long sentences do not
        underflow to zero. Sentences with zero (or NaN) probability are skipped.

        Returns:
            float
        """
        log_likelihood = 0.0
        for all_f, all_e in self.data:
            # NOTE(review): if the English sentences already contain the NULL
            # token, 1 + len(all_e) may count it twice -- confirm the intent.
            sentence_ll = -len(all_f) * math.log(1 + len(all_e))
            for f in all_f:
                total = sum(self._prob(e, f) for e in all_e)
                if not total > 0:
                    sentence_ll = None
                    break
                sentence_ll += math.log(total)
            if sentence_ll is not None:
                log_likelihood += sentence_ll
        return log_likelihood

    def align(self, f_sentence, e_sentence):
        """Align every French word to its most probable English word.

        Ties are resolved in favour of the last English position.

        Args:
            f_sentence: list of French tokens
            e_sentence: list of English tokens

        Returns:
            list of (english_position, french_position) tuples, both 1-based;
            empty when ``e_sentence`` is empty
        """
        if not e_sentence:
            return []
        alignment = []
        for i, f in enumerate(f_sentence):
            best_j = None
            best_score = -1
            for j, e in enumerate(e_sentence):
                score = self._prob(e, f)
                if score >= best_score:
                    best_score = score
                    best_j = j
            if best_j is not None:
                alignment.append((best_j + 1, i + 1))
        return alignment

    def test(self):
        """Print the alignment error rate (AER) on the validation data."""
        # 1. Read in gold alignments
        gold_sets = read_naacl_alignments(self.valid_align_path)

        # 2. Predict the alignment links for every validation sentence pair
        predictions = [set(self.align(f, e)) for f, e in self.validation_data]

        # 3. Compute AER: accumulate sufficient statistics over the corpus
        metric = AERSufficientStatistics()
        for gold, pred in zip(gold_sets, predictions):
            metric.update(sure=gold[0], probable=gold[1], predicted=pred)
        print(metric.aer())


### 4. Train our IBM1 model with EM

In [None]:
# Build an IBM1 model that uses Expectation Maximisation and run 10 training
# iterations; the log likelihood and AER are printed before each iteration.
model_em = IBM1(
    training_data=training_data,
    validation_data=validation_data,
    valid_align_path='validation/dev.wa.nonullalign',
    training_iterations=10,
    training="EM",
)
model_em.train()

### 5. Train the IBM1 model with Variational Bayes

In [None]:
# Build an IBM1 model that uses Variational Bayes with a Dirichlet parameter of
# 0.05 and run 10 training iterations.
model_vb = IBM1(
    training_data=training_data,
    validation_data=validation_data,
    valid_align_path='validation/dev.wa.nonullalign',
    training_iterations=10,
    training="VB",
    alpha=0.05,
)
model_vb.train()

In [None]:
# Check whether the learned translation table makes sense: print the 25 most
# probable (english, french) word pairs of the Variational Bayes model.
# `t` maps each English word to a Counter over French words, so it is flattened
# into a single Counter keyed by (english, french) before ranking.
flat_t = Counter({(e, f): p for e, row in model_vb.t.items() for f, p in row.items()})
pprint.pprint(flat_t.most_common(25))