In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import optparse, sys, os, logging
from collections import defaultdict
from itertools import islice
import time
#from align_yabing import *

In [4]:
# Directory that holds the corpus, and the file-name prefix shared by its
# per-language files.
opts_datadir = "data"
opts_fileprefix = "hansards"

In [5]:
# File-name suffixes of the French and English corpus files.
opts_french = "fr"
opts_english = "en"

In [6]:
# Sentence-count setting for the run.
opts_num_sents = 100_000

In [7]:
# Paths of the French and English corpus files, each of the form
# <opts_datadir>/<opts_fileprefix>.<language suffix>.
f_data, e_data = (
    f"{os.path.join(opts_datadir, opts_fileprefix)}.{suffix}"
    for suffix in (opts_french, opts_english)
)

In [9]:
# Load the parallel corpus as a list of [french_tokens, english_tokens] pairs.
# Line i of the French file is paired with line i of the English file; every
# line is stripped and split on whitespace, and at most opts_num_sents pairs
# are read.  Both files are opened in a `with` block so their handles are
# closed as soon as the corpus is in memory.
with open(f_data, encoding="utf8") as f_file, open(e_data, encoding="utf8") as e_file:
    bitext = [[sentence.strip().split() for sentence in pair]
              for pair in islice(zip(f_file, e_file), opts_num_sents)]

In [10]:
# Progress banner, written to stderr rather than stdout.
sys.stderr.write("Training with Expectation Maximization...\n")

Training with Expectation Maximization...


In [11]:
%%time
# Build the vocabularies and per-word sentence frequencies from the corpus.
#   f       -- set of all distinct French words
#   e       -- set of all distinct English words
#   f_count -- for each French word, the number of sentences it occurs in
#              (a word is counted at most once per sentence)
#   N       -- number of sentence pairs
f = set()
e = set()
f_count = defaultdict(int)
for f_sent, e_sent in bitext:
    f_types = set(f_sent)
    # update() grows the sets in place; rebinding with union() would copy the
    # whole vocabulary once per sentence.
    f.update(f_types)
    e.update(e_sent)
    for f_i in f_types:
        # defaultdict(int) supplies the initial 0, so no membership test is needed.
        f_count[f_i] += 1
N = len(bitext)

* $k = 0$<br>
* Initialize $t_0$ **## Easy choice: initialize uniformly ##**<br>
* repeat <br>
    * $k$ += 1 <br>
    * Initialize all counts to zero <br>
    * for each $(\textbf{f}, \textbf{e})$ in ${\cal D}$ <br>
        * for each $f_i$ in $\textbf{f}$ <br>
            * $Z$ = 0 **## Z commonly denotes a normalization term ##** <br>
            * for each $e_j$ in $\textbf{e}$ <br>
                * $Z$ += $t_{k-1}(f_i \mid e_j)$ <br>
            * for each $e_j$ in $\textbf{e}$ <br>
                * `c` = $ t_{k-1}(f_i \mid e_j) / Z $ <br>
                * count($f_i$, $e_j$) += `c` <br>
                * count($e_j$) += `c` <br>
    * for each ($f$, $e$) in count <br>
        * Set new parameters: $t_k(f \mid e)$ =  count($f,e$) / count($e$) <br>
* until convergence **## See below for convergence tests ##** <br>

In [12]:
# EM training of the translation table: theta[(f_i, e_j)] holds t(f_i | e_j).

# Number of EM passes over the corpus. The loop runs this fixed budget; no
# convergence test is performed.
num_iterations = 5

# initialize theta uniformly: any (f, e) pair not yet stored defaults to
# 1 / (number of distinct French words).
num_f = len(f_count)
theta = defaultdict(lambda: 1. / num_f)

for k in range(1, num_iterations + 1):
    tic = time.time()
    sys.stderr.write(f"Iteration {k}.................................\n")

    # Expected counts for this pass, accumulated under the current theta.
    e_count = defaultdict(float)
    fe_count = defaultdict(float)
    for f_sent, e_sent in bitext:
        for f_i in f_sent:
            # Look up t(f_i | e_j) once per English token and reuse the values
            # for both the normaliser Z and the fractional counts.
            probs = [theta[(f_i, e_j)] for e_j in e_sent]
            # Z is accumulated left to right with plain float addition.
            Z = 0
            for p in probs:
                Z += p
            for e_j, p in zip(e_sent, probs):
                c = p / Z
                fe_count[(f_i, e_j)] += c
                e_count[e_j] += c

    # Set the new parameters: t(f | e) = count(f, e) / count(e).
    for (f_i, e_j), pair_count in fe_count.items():
        theta[(f_i, e_j)] = pair_count / e_count[e_j]

    toc = time.time()
    sys.stderr.write(f"Iteration {k} finished. Time cost: {toc-tic}\n")

Iteration 1.................................
Iteration 2.................................
Iteration 3.................................
Iteration 4.................................
Iteration 5.................................


* for each $(\textbf{f}, \textbf{e})$ in ${\cal D}$
    * for each $f_i$ in $\textbf{f}$
        * `bestp` = 0
        * `bestj` = 0
        * for each $e_j$ in $\textbf{e}$
            * if $t(f_i \mid e_j)$ > `bestp`
                * `bestp` = $t(f_i \mid e_j)$
                * `bestj` = $j$
        * align $f_i$ to $e_{\texttt{bestj}}$

In [13]:
# Progress banner, written to stderr rather than stdout.
sys.stderr.write("Aligning...\n")

Aligning...


In [18]:
%%capture --no-stderr dice_a
# Decode alignments: each French token is linked to the English position whose
# word has the highest t(f_i | e_j) in the same sentence pair.  One line is
# written to stdout per sentence pair, made of "i-j " items where i is the
# French position and j the chosen English position.
# The loop variables are named f_sent / e_sent so they do not overwrite the
# vocabulary sets `f` and `e`.
for f_sent, e_sent in bitext:
    for i, f_i in enumerate(f_sent):
        bestp = 0
        bestj = 0
        for j, e_j in enumerate(e_sent):
            p = theta[(f_i, e_j)]
            # Strict '>' keeps the earliest English position on ties.
            if p > bestp:
                bestp = p
                bestj = j
        sys.stdout.write(f"{i}-{bestj} ")
    sys.stdout.write("\n")

In [22]:
# Dump the captured alignment output to the local file dice.a.
# `encoding` must be given a value (a bare `encoding=` is a SyntaxError);
# utf8 matches the encoding used to read the corpus files.
# str(dice_a) is the text captured by the `%%capture ... dice_a` cell.
with open('dice.a', 'w', encoding="utf8") as fh:
    fh.write(str(dice_a))

In [1]:
#%run check-alignments.py -i dice.a

In [26]:
%run score-alignments.py -n 0 -i dice.a

Precision = 0.597603
Recall = 0.774889
AER = 0.341724
