Imports

In [1]:
import random
import string
import math
import re

from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.translate import bleu_score as bleu
from nltk.translate import IBMModel1, AlignedSent
from tqdm.notebook import tqdm
from zipfile import ZipFile

In [2]:
def numberToBase(n, b, pos_count=1):
    digits = []
    while n or len(digits) < pos_count:
        digits.append(int(n % b))
        n //= b
    return digits[::-1]

# NLP Final Project

## Exploration of Machine Translation Techniques using Movie Subtitles dataset

Arnaud Ruymaekers, S5298338

---

Description: 

I would like to explore developping 3 different techniques to perform Machine Translation. 
I would like to implement and compare implementations of a Statistical, Rule-Based and Neural Machine Translation.
I will attempt to implement these techniques from scratch (not using libraries to do the whole thing) to understand how they work on a deeper level.
I plan to implement this in python and to use as dataset sentence correspondances from movies subtitles EN <-> IT coming from opensubtitles.org .

Feedback:

If you will develop 3 different techniques, the project will be for sure hard. As a B-plan, you might downgrade to developing 2 techniques only, 
to make sure to stay in about 7 to 10 days of work

---

### Introduction (TODO)



## Datasets Prep

In [3]:
line_count_total = 35_216_229
file_name = 'OpenSubtitles.en-it.'
languages = ['en', 'it']

### Text Subtitles

In [4]:
def extract_file(languages=['en', 'it'], line_count=None, from_line=0, tokenize=True) -> dict:
        
    if line_count is None:
        line_count = line_count_total
        
    assert (from_line+line_count <= line_count_total), f'line_count + from_line should be under {line_count_total} (it is currently {line_count+from_line})'
    
    file_lines = {}
    
    with ZipFile('en-it.txt.zip') as zf:
        
        for lang in languages:
            lines = []
            with zf.open('OpenSubtitles.en-it.' + lang, 'r') as f:

                for i, line in tqdm(enumerate(f), total=from_line+line_count, desc=f'Reading {lang.upper()} language file'):
                    if i < from_line:
                        continue
                    elif i < from_line+line_count:
                        decoded_line = line.decode("utf-8").replace('\n', '')
                        lines.append(word_tokenize(decoded_line) if tokenize else decoded_line)
                    else:
                        break
                        
            file_lines[lang] = lines

    return file_lines

In [5]:
# Extracting 100k sentences for now
sentences = extract_file(languages, 100_000, tokenize=False)

Reading EN language file:   0%|          | 0/100000 [00:00<?, ?it/s]

Reading IT language file:   0%|          | 0/100000 [00:00<?, ?it/s]

In [6]:
# Printing some samples
for i in range(5):
    print(f'Sample {i+1}:')
    print('\t' + sentences['en'][i])
    print('\t\t=> ' + sentences['it'][i])

Sample 1:
	Permaculture is a design science based on three simple ethics:
		=> La permacultura è un metodo di progettazione basato su tre semplici principi etici:
Sample 2:
	care for the earth
		=> cura della terra
Sample 3:
	care for people
		=> cura delle persone
Sample 4:
	share the surplus
		=> Condividi il superfluo
Sample 5:
	Permaculture also has core principles They guide us in creating sustainable abundance
		=> La permacultura ha anche principi cardine le linee guida per la creazione di abbondanza sostenibile


Sample 1:
	Permaculture is a design science based on three simple ethics:
		=> La permacultura è un metodo di progettazione basato su tre semplici principi etici:
Sample 2:
	care for the earth
		=> cura della terra
Sample 3:
	care for people
		=> cura delle persone
Sample 4:
	share the surplus
		=> Condividi il superfluo
Sample 5:
	Permaculture also has core principles They guide us in creating sustainable abundance
		=> La permacultura ha anche principi cardine le linee guida per la creazione di abbondanza sostenibile

### Tokenization

In [7]:
def tokenize_sentences(sentences):
    tok_sentences = {}
    for lang, lang_sentences in sentences.items():
        tok_sentences[lang] = [word_tokenize(sentence) for sentence in tqdm(lang_sentences, desc=f'Tokenizing {lang.upper()} doc')]
    return tok_sentences

In [8]:
raw_tok_sentences = tokenize_sentences(sentences)

Tokenizing EN doc:   0%|          | 0/100000 [00:00<?, ?it/s]

Tokenizing IT doc:   0%|          | 0/100000 [00:00<?, ?it/s]

In [9]:
# Printing some samples
for i in range(5):
    print(f'Sample {i+1}:')
    print('\t[' + ', '.join(raw_tok_sentences['en'][i]) + ']')
    print('\t\t=> [' + ', '.join(raw_tok_sentences['it'][i]) + ']')

Sample 1:
	[Permaculture, is, a, design, science, based, on, three, simple, ethics, :]
		=> [La, permacultura, è, un, metodo, di, progettazione, basato, su, tre, semplici, principi, etici, :]
Sample 2:
	[care, for, the, earth]
		=> [cura, della, terra]
Sample 3:
	[care, for, people]
		=> [cura, delle, persone]
Sample 4:
	[share, the, surplus]
		=> [Condividi, il, superfluo]
Sample 5:
	[Permaculture, also, has, core, principles, They, guide, us, in, creating, sustainable, abundance]
		=> [La, permacultura, ha, anche, principi, cardine, le, linee, guida, per, la, creazione, di, abbondanza, sostenibile]


### Filtering tokens

In [10]:
for en_sent, it_sent in zip(*[raw_tok_sentences[lang] for lang in languages]):
    print(en_sent)
    print(it_sent)
    break

['Permaculture', 'is', 'a', 'design', 'science', 'based', 'on', 'three', 'simple', 'ethics', ':']
['La', 'permacultura', 'è', 'un', 'metodo', 'di', 'progettazione', 'basato', 'su', 'tre', 'semplici', 'principi', 'etici', ':']


In [11]:
def clean_sentence(sentence):    
    cleaned_sentence = []
    for word in sentence:
        lower_word = word.lower()
        punc_less_word = re.sub(r'[^\w\s]', '', lower_word)
        
        if len(punc_less_word) > 0:
            cleaned_sentence.append(punc_less_word)
            
    return cleaned_sentence


def clean_all_sentences(sentences):
    tok_sentences = {'en': [], 'it':[]}
    for en_sent, it_sent in tqdm(zip(*[raw_tok_sentences[lang] for lang in languages]), total=len(raw_tok_sentences['en']), desc='Filtering sentences'):
        filt_en_sent = clean_sentence(en_sent)
        filt_it_sent = clean_sentence(it_sent)
        
        if len(filt_en_sent) > 0 and len(filt_it_sent) > 0:
            tok_sentences['en'].append(filt_en_sent)
            tok_sentences['it'].append(filt_it_sent)
            
    return tok_sentences

In [12]:
tok_sentences = clean_all_sentences(raw_tok_sentences)

Filtering sentences:   0%|          | 0/100000 [00:00<?, ?it/s]

### Vocabulary Extraction (UNUSED)

In [13]:
def extract_vocab(sentences):
    vocabs = {'en':set(), 'it':set()}
    
    for lang, lang_sentences in sentences.items():
        for sentence in tqdm(lang_sentences, desc=f'Vocab extraction for {lang.upper()}'):
            vocabs[lang] |= set(sentence)
        print(f'Vocab size: {len(vocabs[lang])}\n')
        
    return vocabs

In [14]:
vocabs = extract_vocab(tok_sentences)

Vocab extraction for EN:   0%|          | 0/99866 [00:00<?, ?it/s]

Vocab size: 25652



Vocab extraction for IT:   0%|          | 0/99866 [00:00<?, ?it/s]

Vocab size: 37470



### Filtering to short sentences (UNUSED)

In [15]:
def filter_sent_to_len(sentences, l=15):
    short_sentences = {'en': [], 'it': []}

    n = len(sentences['en'])
    new_n = 0
    
    for i in tqdm(range(n), desc='Filtering to short sentences'):
        en_sent = sentences['en'][i]
        it_sent = sentences['it'][i]

        if len(en_sent) <= l and len(it_sent) <= l:
            short_sentences['en'].append(en_sent)
            short_sentences['it'].append(it_sent)
            new_n += 1
            
    print(f'There are {new_n} sentences left with length {l} or less')
    
    return short_sentences

In [16]:
short_tok_sentences = filter_sent_to_len(tok_sentences, 10)

Filtering to short sentences:   0%|          | 0/99866 [00:00<?, ?it/s]

There are 78513 sentences left with length 10 or less


## Language model

### Retrieving N-gram counts

In [17]:
def build_ngram_counts(sentences, n_s=[1,2,3,4]):
    ngram_counts = {}
    
    for lang, lang_sentences in sentences.items():
        lang_ngram_counts = defaultdict(lambda: defaultdict(lambda: 0))
        for sentence in tqdm(lang_sentences, desc=f'Retrieving n-gram counts for {lang.upper()} doc'):
            sentence = (['<start>'] * (max(n_s) - 1)) + sentence + ['<end>']
            for n in n_s:
                for i in range(len(sentence)-n+1):
                    ngram = tuple(sentence[i:i+n])
                    lang_ngram_counts[n][ngram] += 1
                    
        ngram_counts[lang] = lang_ngram_counts
        
    return ngram_counts

In [18]:
ngram_counts =  build_ngram_counts(tok_sentences)

Retrieving n-gram counts for EN doc:   0%|          | 0/99866 [00:00<?, ?it/s]

Retrieving n-gram counts for IT doc:   0%|          | 0/99866 [00:00<?, ?it/s]

### Computing N-gram probabilities

In [19]:
def build_ngram_probs(ngram_counts, lang='en'):
    probs = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for n, counts in tqdm(ngram_counts.items(), desc=f'Building n-gram probabilities for {lang.upper()}'):
        
        if n > 1:
            for ngram, count in tqdm(counts.items(), leave=False):
                if ngram_counts[n-1][tuple(ngram[:-1])] == 0.0:
                    print(ngram[:-1])
                probs[tuple(ngram[:-1])][ngram[-1]] = count / ngram_counts[n-1][tuple(ngram[:-1])]
                
        else:
            total = sum(list(counts.values()))
            for ngram, count in tqdm(counts.items(), leave=False):
                probs[tuple()][ngram[0]] = count / total
            
    return probs

In [20]:
ngram_probs = {}
for lang in languages:
    ngram_probs[lang] = build_ngram_probs(ngram_counts[lang], lang)

Building n-gram probabilities for EN:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/25654 [00:00<?, ?it/s]

  0%|          | 0/206287 [00:00<?, ?it/s]

  0%|          | 0/425218 [00:00<?, ?it/s]

  0%|          | 0/532251 [00:00<?, ?it/s]

Building n-gram probabilities for IT:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/37472 [00:00<?, ?it/s]

  0%|          | 0/242613 [00:00<?, ?it/s]

  0%|          | 0/436752 [00:00<?, ?it/s]

  0%|          | 0/506352 [00:00<?, ?it/s]

### Language Model classes

In [21]:
class LanguageModel:
    def sequence_probability(self, sequence):
        pass
    
    def next_word_from_sequence(self, sequence):
        pass

In [22]:
class UnigramModel(LanguageModel):
    def __init__(self, counts):
        self.probabilities = defaultdict(lambda: 0.0)
        
        total = sum(list(counts[1].values()))
        for ngram, count in tqdm(counts[1].items(), desc='Generating unigrams probabilities'):
            self.probabilities[ngram[0]] = count / total
            
    def sequence_probability(self, sequence):
        if isinstance(sequence, list):
            sequence = sequence[-1]
        return self.probabilities[sequence]
    
    def next_word_from_sequence(self, sequence):
        return self.probabilities
    
unigram_models = {
    'en': UnigramModel(ngram_counts['en']),
    'it': UnigramModel(ngram_counts['it'])
}

Generating unigrams probabilities:   0%|          | 0/25654 [00:00<?, ?it/s]

Generating unigrams probabilities:   0%|          | 0/37472 [00:00<?, ?it/s]

In [23]:
class NgramModel(LanguageModel):
    def __init__(self, counts, n):
        self.probabilities = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.n = n

        for ngram, count in tqdm(counts[n].items(), desc=f'Generating {n}-gram probabilities'):
            self.probabilities[tuple(ngram[:-1])][ngram[-1]] = count / counts[n-1][tuple(ngram[:-1])]
            
    def sequence_probability(self, sequence:list):
        return self.probabilities[tuple(sequence[-n:-1])][-1]
    
    def next_word_from_sequence(self, sequence):
        return self.probabilities[-(n-1):]
    
# 2-gram
bigram_models = {
    'en': NgramModel(ngram_counts['en'], 2),
    'it': NgramModel(ngram_counts['it'], 2)
}

# 3-gram
trigram_models = {
    'en': NgramModel(ngram_counts['en'], 3),
    'it': NgramModel(ngram_counts['it'], 3)
}

# 4-gram
quadrigram_models = {
    'en': NgramModel(ngram_counts['en'], 4),
    'it': NgramModel(ngram_counts['it'], 4)
}

Generating 2-gram probabilities:   0%|          | 0/206287 [00:00<?, ?it/s]

Generating 2-gram probabilities:   0%|          | 0/242613 [00:00<?, ?it/s]

Generating 3-gram probabilities:   0%|          | 0/425218 [00:00<?, ?it/s]

Generating 3-gram probabilities:   0%|          | 0/436752 [00:00<?, ?it/s]

Generating 4-gram probabilities:   0%|          | 0/532251 [00:00<?, ?it/s]

Generating 4-gram probabilities:   0%|          | 0/506352 [00:00<?, ?it/s]

In [24]:
class BackoffModel(LanguageModel):
    def __init__(self, counts, n, ngram_models, discount=0.6):
        self.counts = counts
        self.n = n
        self.ngram_models = ngram_models
    
    def sequence_probability(self, sequence:list, investigating_n=None):
        if not investigating_n:
            investigating_n = self.n
        
        if self.counts[investigating_n][sequence[-n:]] > 0 or len(sequence) < 2:
            return ngram_models[investigating_n-1].sequence_probability(sequence)
        else:
            return self.sequence_probability(sequence[-(investigating_n-1):], investigating_n=(investigating_n-1))
    
    def next_word_from_sequence(self, sequence):
        return self.ngram_models[n-1](sequence)

backoff_models = {}

for lang in languages:
    ngram_models = [unigram_models[lang], bigram_models[lang], trigram_models[lang]]
    backoff_models[lang] = BackoffModel(ngram_counts, 3, ngram_models)

In [None]:
# def find_lambdas(sentences, ngrams, max_ngram, lang='en', full_print=False):
#     assert max_ngram > 1, 'max_ngram should be at least 2'
#     assert max_ngram <= 10, 'max_gram cant be above 10' 
    
#     l_options = [(i+1)/10 for i in range(10-(max_ngram-1))]
#     combinations = len(l_options)**(max_ngram-1)
    
#     best_lambdas = None
#     best_prob = float('-inf')
    
#     for comb in tqdm(range(combinations), desc=f'Checking lambda sets for {lang.upper()}'):
#         # Compute lambda option
#         indices = numberToBase(comb, len(l_options), (max_ngram-1))
        
#         lambdas = [l_options[ind] for ind in indices]
#         last_l = round((1.0 - sum(lambdas)), 1)
        
#         if last_l < 0.1:
#             continue
        
#         lambdas += [last_l]
        
#         # print(f'{lambdas} = {round(sum(lambdas), 1)}')
        
#         # Computing sentence set probabilities
#         total_prob = 0.0
#         prob_count = 0
    
#         for sent in tqdm(sentences, desc=f'Lamdba set {lambdas}', leave=full_print):
#             extended_sent = (['<start>'] * (max_ngram-1)) + sent + ['<end>']
            
#             sent_prob = 1
#             for i in range((max_ngram-1), len(sent) + (max_ngram-1) + 1):
#                 sent_prob *= sum([l*ngrams[tuple(extended_sent[i-j:i])][extended_sent[i]] for j, l in enumerate(lambdas)])
                
#             total_prob += sent_prob
#             prob_count += 1
        
#         # Deciding whether to lambda set is better than previous best
#         average_prob = total_prob / prob_count
        
#         if average_prob > best_prob:
#             best_prob = average_prob
#             best_lambdas = lambdas
                
#         if full_print:
#             print(average_prob)
            
#     return best_lambdas

# ngram_lambdas = {}

# for lang in languages:
#     lambdas = find_lambdas(tok_sentences['en'], ngram_probs['en'], 3)
#     print(f'Best lambdas found: {lambdas}')
#     ngram_lambdas[lang] = lambdas
    
# ngram_lambdas

## Translation Model

### IBM Model 1

- estimating t-values

In [42]:
def ibm_model1(tok_sentences, source_lang='en', target_lang='it', max_epochs=20, t=None, show_sub_progress=True):
    source_sentences = tok_sentences[source_lang]
    target_sentences = tok_sentences[target_lang]

    n = len(source_sentences)
    
    # Translation table
    if not t:
        t = defaultdict(lambda: defaultdict(lambda: random.random()))
    else:
        t = t.copy()

    for epoch in tqdm(range(max_epochs), desc='Model training'):

        counts_target_source = defaultdict(lambda: 0)
        counts_target = defaultdict(lambda: 0)


        for k in tqdm(range(n), desc=f'Epoch {epoch+1}', leave=show_sub_progress):
            source_sent = source_sentences[k]
            target_sent = [None] + target_sentences[k]

            # Loop through source sentence words
            for source_word in source_sent:

                target_t_sum = sum([(t[source_word][target_word]) for target_word in target_sent])

                # Loop through target sentence words
                for target_word in target_sent:

                    delta = t[source_word][target_word] / target_t_sum

                    # Counts updates
                    counts_target_source[(target_word, source_word)] += delta
                    counts_target[target_word] += delta

        # Update of ts
        diff_sum = 0
        diff_count = 0
        for (target_word, source_word) in counts_target_source:
            new_t = counts_target_source[(target_word, source_word)] / counts_target[target_word]

            diff_sum += abs(new_t - t[source_word][target_word])
            diff_count += 1

            t[source_word][target_word] = new_t

        # Computing avg_diff of the t values and decision of convergence
        avg_diff = diff_sum/diff_count
        
        if show_sub_progress:
            print(f'\t - Average difference in t-values: {avg_diff}\n')

        if avg_diff < 0.0001:
            print('Early exit because change between average t-table value changes is lower than 10-4')
            break
    
    return t

In [43]:
t = ibm_model1(tok_sentences, 'en', 'it', 10, show_sub_progress=True)

Model training:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.48281345017381705



Epoch 2:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.007517830520437741



Epoch 3:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.005363587539161923



Epoch 4:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.0031112834255788853



Epoch 5:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.0018187764202964743



Epoch 6:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.001143428512975153



Epoch 7:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.0007715040933502066



Epoch 8:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.0005512419148789678



Epoch 9:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.0004113286976775883



Epoch 10:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in t-values: 0.0003171680192572844



In [44]:
# Printing sample values
source_word = list(t.keys())[0]
for i, target_word in enumerate(t[source_word]):
    print(f'{source_word} > {target_word} - {t[source_word][target_word]}')
    if i >= 10:
        break

permaculture > None - 5.069516994067443e-38
permaculture > la - 2.127513906465088e-32
permaculture > permacultura - 0.8721630570216184
permaculture > è - 2.466527574822481e-31
permaculture > un - 1.3480397932470805e-33
permaculture > metodo - 4.619957815484537e-11
permaculture > di - 3.392237325314037e-32
permaculture > progettazione - 4.834074748069685e-05
permaculture > basato - 2.5293415424010165e-13
permaculture > su - 5.6962096646072605e-27
permaculture > tre - 2.4920819342961294e-26


#### NLTK implementation comparisson

In [46]:
# bitext = []
# for i, source_sent in enumerate(tok_sentences['en']):
#     bitext.append(AlignedSent(source_sent, tok_sentences['it'][i]))

# nltk_ibm1 = IBMModel1(bitext, 10)

In [47]:
# # Printing sample values
# source_word = 'Permaculture'
# for i, target_word in enumerate(nltk_ibm1.translation_table[source_word]):
#     print(f'{source_word} > {target_word} - {nltk_ibm1.translation_table[source_word][target_word]}')
#     if i >= 10:
#         break

### IBM Model 2

- Estimating q-values

In [48]:
def ibm_model2(tok_sentences, source_lang='en', target_lang='it', max_epochs=20, t=None, q=None):
    
    # Source // corpus
    source_sentences = tok_sentences[source_lang]
    target_sentences = tok_sentences[target_lang]
    
    n = len(source_sentences)
    
    # Translation table
    if not t:
        t = defaultdict(lambda: defaultdict(lambda: random.random()))
    else:
        t = t.copy()
    
    # Alignment table
    if not q:
        q = defaultdict(lambda: defaultdict(lambda: random.random()))
    else:
        q = q.copy()
    
    # Do the iterations
    for epoch in range(max_epochs):

        counts_target_source = defaultdict(lambda: 0)
        counts_target = defaultdict(lambda: 0)
        counts_j_i_l_m = defaultdict(lambda: 0)
        counts_i_l_m = defaultdict(lambda: 0)

        # Iterate the corpus
        for k in tqdm(range(n), desc=f'Epoch {epoch+1}'):
            source_sent = [None] + source_sentences[k]
            target_sent = ['UNUSED'] + target_sentences[k]

            m = len(source_sent) - 1
            l = len(target_sent) - 1

            # Loop through source sentence words
            for i in range(1, m+1):
                source_word = source_sent[i]

                target_t_q_sum = sum([(q[(i,l,m)][j] * t[source_word][target_sent[j]]) for j in range(0, l+1)])

                # Loop through target sentence words
                for j in range(0, l+1):
                    target_word = target_sent[j]

                    delta = (q[(i,l,m)][j] * t[source_word][target_word]) / target_t_q_sum

                    # Counts updates
                    counts_target_source[(target_word, source_word)] += delta
                    counts_target[target_word] += delta
                    counts_j_i_l_m[(j,i,l,m)] += delta
                    counts_i_l_m[(i,l,m)] += delta

        # Update of ts and qs
        for (target_word, source_word) in counts_target_source:
            t[source_word][target_word] = counts_target_source[(target_word, source_word)] / counts_target[target_word]

        diff_sum = 0
        diff_count = 0
        for (j,i,l,m) in counts_j_i_l_m:
            new_q = counts_j_i_l_m[(j,i,l,m)] / counts_i_l_m[(i,l,m)]

            diff_sum += abs(new_q - q[(i,l,m)][j])
            diff_count += 1

            q[(i,l,m)][j] = new_q

        # Computing avg_diff of the t values and decision of convergence
        avg_diff = diff_sum/diff_count
        print(f'\t - Average difference in q-values: {avg_diff}\n')

        if avg_diff < 0.0001:
            break
            
    return (t,q)

In [49]:
(_,q) = ibm_model2(tok_sentences, 'en', 'it', 10, t)

Epoch 1:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.4690395302357117



Epoch 2:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.03201975331755284



Epoch 3:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.009113779279865033



Epoch 4:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.004385103305592243



Epoch 5:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.0026033528885994876



Epoch 6:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.0017267162205747183



Epoch 7:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.0012231865199606863



Epoch 8:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.0009065247439692799



Epoch 9:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.0006982660139370228



Epoch 10:   0%|          | 0/99866 [00:00<?, ?it/s]

	 - Average difference in q-values: 0.000562269209468171



In [50]:
# Printing sample values
key_0 = list(q.keys())[0]
for i, j in enumerate(q[key_0]):
    print(f'{key_0} > {j} - {q[key_0][j]}')
    if i >= 10:
        break

(1, 13, 10) > 0 - 0.0002442271908711633
(1, 13, 10) > 1 - 0.6384164863664822
(1, 13, 10) > 2 - 0.12619653962393293
(1, 13, 10) > 3 - 0.1054804613552266
(1, 13, 10) > 4 - 0.0010699306558235504
(1, 13, 10) > 5 - 0.05090313257603431
(1, 13, 10) > 6 - 0.010280483800826018
(1, 13, 10) > 7 - 0.0001481743321103478
(1, 13, 10) > 8 - 4.3147107098094865e-07
(1, 13, 10) > 9 - 0.004104445922785928
(1, 13, 10) > 10 - 0.04773139541152379


## Decoding

In [51]:
class Hypothesis:
    def __init__(self,
                 translated_sequence=['<start>','<start>'],
                 probability=1.0,
                 already_translated=[]
                ):
        self.translated_sequence = translated_sequence
        self.probability = probability
        self.already_translated = already_translated
    
    def add_word(self, word, word_prob, from_index):
        return Hypothesis(
                 translated_sequence = self.translated_sequence + [word],
                 probability = self.probability * word_prob,
                 already_translated = self.already_translated + [from_index]
        )
    
    def __str__(self):
        return f'{self.probability}:' + ' '.join(self.translated_sequence[2:])

    
def get_ngram_prob(ngram_probs, sequence, discount=0.6, min_prob=0.0):
    prob = ngram_probs[tuple(sequence[:-1])][sequence[-1]]
    if prob > min_prob or len(sequence) < 2:
        return prob
    else:
        return discount * get_ngram_prob(ngram_probs, sequence[1:], discount, min_prob)


def beam_translate(sent, t, q, ngram_probs, beta, min_prob=0.0, max_options=None):
    
    source_sent_length = len(sent)
    
    # Retrieving all translation probabilities for the words in the source sequence
    translation_probs = [t[word] for word in sent]
    translation_probs = [dict(filter(lambda x: x[1]>min_prob, translation.items())) for translation in translation_probs]
    if max_options:
        translation_probs = [dict(sorted(translation.items(), key = lambda item: item[1], reverse = True)[:max_options]) for translation in translation_probs]
    
    # Retrieving distorition probs
    distorted_translation_prob = [[{ word: (word_prob * q[(i+1, source_sent_length, source_sent_length)][j+1])
                                    for word, word_prob in translation_dict.items()} 
                                   for i, translation_dict in enumerate(translation_probs)] 
                                  for j in range(source_sent_length)]
    
    # Defining the hypotheses list with an initial empty hypothesis
    current_hypotheses = [Hypothesis()]
    
    for j in range(source_sent_length):
        
        # translation_probs = distorted_translation_prob[j]
        new_hypotheses = []
        
        for hypothesis in current_hypotheses:
            for i, translation_dict in enumerate(distorted_translation_prob[j]):
                if i in hypothesis.already_translated:
                    continue
                
                for word, translation_prob in translation_dict.items():
                    
                    # language model probabilities (tri-gram) 
                    lang_prob = get_ngram_prob(ngram_probs, hypothesis.translated_sequence[-2:] + [word], discount=0.6, min_prob=min_prob)

                    # Combine probabilities and generate new hypothesis
                    new_hypothesis = hypothesis.add_word(
                        word=word,
                        word_prob=(translation_prob * lang_prob),
                        from_index=i
                    )
                    new_hypotheses.append(new_hypothesis)
        
        # Sorting options by probability
        new_hypotheses = sorted(new_hypotheses, key=lambda hyp: hyp.probability, reverse=True)

        # Taking "beta" best translations
        current_hypotheses = new_hypotheses[:beta]
                
    return current_hypotheses

In [52]:
# Sample output
hyp = beam_translate(tok_sentences['en'][0], t, q, ngram_probs['it'], beta=10, min_prob=1.0e-24, max_options=20)

In [53]:
print(' '.join(tok_sentences['en'][0]))
print('\t-> ' + ' '.join(hyp[0].translated_sequence[2:]))
print('\t-> (ref) ' + ' '.join(tok_sentences['it'][0]))

permaculture is a design science based on three simple ethics
	-> su un disegno scienza tre semplici basato permacultura equivale etici
	-> (ref) la permacultura è un metodo di progettazione basato su tre semplici principi etici


## Evaluation

In [54]:
def evaluate_translation(ref_translations:list, translations:list, max_n=4, display=True):
    assert len(ref_translations) == len(translations), 'There should be as many reference translations as translations'
    
    weights_set = [tuple([1/(n) for i in range(n)]) for n in range(1,max_n+1)]
    
    scores = [bleu.corpus_bleu([[ref] for ref in ref_translations], translations, weights=weights) for weights in weights_set]

    if display:
        print('Translation scores:')
        for i in range(max_n):
            print(f'\t- BLEU-{i+1}: {scores[i]}')
    
    return scores

In [55]:
source_sentences = tok_sentences['en'][:2000]
ref_translated_sentences = tok_sentences['it'][:2000]

translated_sentences = []

for sent in tqdm(source_sentences, desc='Translation'):
    hypotheses = beam_translate(sent, t, q, ngram_probs['it'], beta=5, min_prob=1.0e-7, max_options=20)
    
    best_hypothesis_sentence = hypotheses[0].translated_sequence[2:]
    
    translated_sentences.append(best_hypothesis_sentence)

Translation:   0%|          | 0/2000 [00:00<?, ?it/s]

In [125]:
evaluate_translation(ref_translated_sentences, translated_sentences, 4)

[(1.0,), (0.5, 0.5), (0.3333333333333333, 0.3333333333333333, 0.3333333333333333), (0.25, 0.25, 0.25, 0.25)]


[0.2660981092159196,
 0.14394392232122913,
 0.08641068038352694,
 0.053198782628144106]

## Full pipeline (TODO)

In [25]:
corpus_generators = {}

for lang in languages:
    corpus_generators[lang] = (row.split(' ') for row in open(f'{lang}_corpus_processed.txt', 'r'))

## Sources

DS:
- https://opus.nlpl.eu/OpenSubtitles.php
- http://www.opensubtitles.org/

General:
- https://machinetranslate.org/
- https://towardsdatascience.com/machine-translation-b0f0dbcef47c
- https://towardsdatascience.com/data-preprocessing-for-machine-translation-fcbedef0e26a

Evalutation:
- https://towardsdatascience.com/bleu-bilingual-evaluation-understudy-2b4eab9bcfd1

Statistical model: 
- http://www.cs.columbia.edu/~mcollins/courses/nlp2011/notes/ibm12.pdf
- https://en.wikipedia.org/wiki/IBM_alignment_models
- https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1162/handouts/
- https://www.youtube.com/watch?v=DuYkqCQEbpo&list=PLQrCiUDqDLG0lQX54o9jB4phJ-SLI6ZBQ
- https://www.nltk.org/api/nltk.translate.html
- https://web.stanford.edu/~jurafsky/slp3/3.pdf
- https://medium.com/mti-technology/n-gram-language-models-b125b9b62e58