Imports

In [100]:
import xml.etree.ElementTree as ET
import spacy
import random
import string
import math

# spaCy pipelines keyed by language code, loaded once at import time
nlp_models = dict(
    en=spacy.load('en_core_web_md'),
    it=spacy.load('it_core_news_md'),
)

from collections import defaultdict
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tree import Tree
from nltk.translate import bleu_score as bleu
from nltk.translate import IBMModel1, AlignedSent
from spacy import displacy
from tqdm.notebook import tqdm
from zipfile import ZipFile
from operator import itemgetter

# NLP Final Project - Neural Machine Translation

## Exploration of Machine Translation Techniques using Movie Subtitles dataset

Arnaud Ruymaekers, S5298338

---

Description: 

I would like to explore three different techniques for performing Machine Translation. 
I would like to implement and compare Statistical, Rule-Based and Neural Machine Translation systems.
I will attempt to implement these techniques from scratch (without relying on libraries that do the whole job) to understand how they work on a deeper level.
I plan to implement this in Python, using as a dataset EN <-> IT sentence correspondences from movie subtitles from opensubtitles.org.

Feedback:

If you will develop 3 different techniques, the project will be for sure hard. As a B-plan, you might downgrade to developing 2 techniques only, 
to make sure to stay in about 7 to 10 days of work

---

### Introduction (TODO)



## Datasets Prep

In [2]:
# Language codes handled by the notebook; each one is appended to `file_name`
# to select a member of the corpus archive.
languages = ['en', 'it']

# Common prefix of the per-language member names inside the archive.
file_name = 'OpenSubtitles.en-it.'

# Upper bound on the line window that extract_file accepts.
line_count_total = 35_216_229

### Text Subtitles

In [3]:
def extract_file(file_name, lang='en', line_count=None, from_line=0, tokenize=True,
                 archive_path='en-it.txt.zip') -> list:
    """Read a window of lines from one language file inside the corpus zip archive.

    Args:
        file_name: Member-name prefix inside the archive; ``lang`` is appended
            to it to form the full member name.
        lang: Language code suffix of the member to read. It is also shown
            upper-cased in the progress-bar label.
        line_count: Number of lines to return. ``None`` means every line from
            ``from_line`` up to ``line_count_total``.
        from_line: Zero-based index of the first line to return.
        tokenize: If true, each line is passed through ``word_tokenize`` and
            stored as a token list; otherwise the decoded string is stored.
        archive_path: Path of the zip archive to open.

    Returns:
        A list with one entry per line read, in file order. It holds fewer
        than ``line_count`` entries if the file ends before the window does.

    Raises:
        AssertionError: If the window is negative or extends past
            ``line_count_total``.
    """
    if line_count is None:
        # Default to "everything from from_line onwards"; defaulting to the
        # full total would fail the range check whenever from_line > 0.
        line_count = line_count_total - from_line

    stop_line = from_line + line_count

    assert from_line >= 0 and line_count >= 0, 'from_line and line_count must be non-negative'
    assert (stop_line <= line_count_total), f'line_count + from_line should be under {line_count_total} (it is currently {stop_line})'

    file_lines = []

    with ZipFile(archive_path) as zf:
        with zf.open(file_name + lang, 'r') as f:

            # The member is read as bytes; skipped lines still advance the progress bar.
            for i, line in tqdm(enumerate(f), total=stop_line, desc=f'Reading {lang.upper()} language file'):
                if i >= stop_line:
                    break
                if i < from_line:
                    continue
                # Strip the line terminator, including the '\r' of CRLF-encoded files.
                decoded_line = line.decode("utf-8").rstrip('\r\n')
                file_lines.append(word_tokenize(decoded_line) if tokenize else decoded_line)

    return file_lines

In [4]:
# Only the first 100k lines of each language file are loaded for now;
# they are kept as raw strings (tokenize=False).
sentences = {}
for lang in languages:
    sentences[lang] = extract_file(file_name, lang=lang, line_count=100_000, tokenize=False)

Reading EN language file:   0%|          | 0/100000 [00:00<?, ?it/s]

Reading IT language file:   0%|          | 0/100000 [00:00<?, ?it/s]

In [5]:
# Show the first five EN lines next to the IT lines at the same index
for i in range(5):
    print(f'Sample {i}:')
    print('\t', sentences['en'][i], sep='')
    print('\t\t=> ', sentences['it'][i], sep='')

Sample 0:
	Permaculture is a design science based on three simple ethics:
		=> La permacultura è un metodo di progettazione basato su tre semplici principi etici:
Sample 1:
	care for the earth
		=> cura della terra
Sample 2:
	care for people
		=> cura delle persone
Sample 3:
	share the surplus
		=> Condividi il superfluo
Sample 4:
	Permaculture also has core principles They guide us in creating sustainable abundance
		=> La permacultura ha anche principi cardine le linee guida per la creazione di abbondanza sostenibile


### Tokenization

In [6]:
def tokenize_sentences(sentences, lang='en'):
    """Tokenize every sentence stored under ``sentences[lang]``.

    Args:
        sentences: Mapping from language code to a sequence of sentence strings.
        lang: Key selecting which sequence to tokenize; also shown upper-cased
            in the progress-bar label.

    Returns:
        A list with one ``word_tokenize`` result per input sentence, in order.
    """
    tokenized = []
    for text in tqdm(sentences[lang], desc=f'Tokenizing {lang.upper()} doc'):
        tokenized.append(word_tokenize(text))
    return tokenized

In [7]:
# Tokenized counterpart of `sentences`, one token list per line and language
tok_sentences = {}
for lang in languages:
    tok_sentences[lang] = tokenize_sentences(sentences, lang=lang)

Tokenizing EN doc:   0%|          | 0/100000 [00:00<?, ?it/s]

Tokenizing IT doc:   0%|          | 0/100000 [00:00<?, ?it/s]

In [8]:
# Show the token lists of the first five EN/IT lines
for i in range(5):
    print(f'Sample {i}:')
    print('\t[', ', '.join(tok_sentences['en'][i]), ']', sep='')
    print('\t\t=> [', ', '.join(tok_sentences['it'][i]), ']', sep='')

Sample 0:
	[Permaculture, is, a, design, science, based, on, three, simple, ethics, :]
		=> [La, permacultura, è, un, metodo, di, progettazione, basato, su, tre, semplici, principi, etici, :]
Sample 1:
	[care, for, the, earth]
		=> [cura, della, terra]
Sample 2:
	[care, for, people]
		=> [cura, delle, persone]
Sample 3:
	[share, the, surplus]
		=> [Condividi, il, superfluo]
Sample 4:
	[Permaculture, also, has, core, principles, They, guide, us, in, creating, sustainable, abundance]
		=> [La, permacultura, ha, anche, principi, cardine, le, linee, guida, per, la, creazione, di, abbondanza, sostenibile]


### Vocabulary Extraction

In [9]:
def extract_vocab(sentences, lang='en'):
    """Collect the distinct tokens that occur in ``sentences[lang]``.

    Args:
        sentences: Mapping from language code to a sequence of token sequences.
        lang: Key selecting which sequences to scan; also shown upper-cased
            in the progress-bar label.

    Returns:
        A sorted list of the unique tokens. Tokens must be mutually
        comparable (e.g. all strings).
    """
    vocab = set()
    for sentence in tqdm(sentences[lang], desc=f'Vocab extraction for {lang.upper()}'):
        vocab.update(sentence)
    # Set iteration order for strings changes between interpreter sessions
    # (hash randomization); sorting keeps the vocabulary order, and any index
    # derived from it, reproducible across kernel restarts.
    return sorted(vocab)

In [10]:
# Per-language vocabulary built from the tokenized sentences; the size is reported for each
vocabs = {}
for lang in languages:
    vocabs[lang] = extract_vocab(tok_sentences, lang=lang)
    print(f'Vocab size: {len(vocabs[lang])}\n')

Vocab extraction for EN:   0%|          | 0/100000 [00:00<?, ?it/s]

Vocab size: 32960



Vocab extraction for IT:   0%|          | 0/100000 [00:00<?, ?it/s]

Vocab size: 45096



## Model definition

## Sources

DS:
- https://opus.nlpl.eu/OpenSubtitles.php
- http://www.opensubtitles.org/

General:
- https://machinetranslate.org/
- https://towardsdatascience.com/machine-translation-b0f0dbcef47c
- https://towardsdatascience.com/data-preprocessing-for-machine-translation-fcbedef0e26a

Evaluation:
- https://towardsdatascience.com/bleu-bilingual-evaluation-understudy-2b4eab9bcfd1

Neural model:
- https://github.com/tensorflow/nmt#training--how-to-build-our-first-nmt-system
- https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html