### English-French Neural Machine Translation

In [205]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cpu')

In [267]:
# Language codes; they are used to build the data file name ('data/<src>-<target>.txt')
# and as the names of the Language vocabularies.
src_language = 'eng'
target_language = 'fra'

In [208]:
# Read the file and split into lines
# (each line holds a source sentence and its translation separated by a tab).
with open('data/%s-%s.txt' % (src_language, target_language), encoding='utf-8') as file:
    text_data = file.read().splitlines()
# Peek at the first five raw lines.
print(text_data[:5])

['Go.\tVa !', 'Run!\tCours\u202f!', 'Run!\tCourez\u202f!', 'Wow!\tÇa alors\u202f!', 'Fire!\tAu feu !']


The text is in Unicode, so we will apply the following preprocessing steps:
1. Turn Unicode characters into ASCII
2. Lowercase the text
3. Trim most punctuation

In [224]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(sent):
    """Strip diacritics from ``sent``.

    The string is decomposed with NFD normalisation and every combining mark
    (Unicode category ``Mn``) is dropped. All other characters are kept as they
    are, so characters without a decomposition are not converted.
    """
    decomposed = unicodedata.normalize('NFD', sent)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def preprocess_string(sent):
    """Normalise a raw sentence for word-level tokenisation.

    Steps, in order: lowercase and trim, strip accents with
    ``unicode_to_ascii``, insert a space in front of each ``.``, ``!`` or ``?``,
    replace every run of characters outside ``a-zA-Z.!?`` with one space, and
    trim again.
    """
    cleaned = unicode_to_ascii(sent.lower().strip())
    # \1 re-inserts the matched punctuation mark, now preceded by a space,
    # so that it becomes a token of its own.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Anything that is not a letter or one of . ! ? collapses to a single space.
    cleaned = re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)
    return cleaned.strip()

In [225]:
# NOTE(review): the return value of this call is discarded (it is neither the
# last expression of the cell nor printed), so the preprocessed text is not shown.
preprocess_string(text_data[3])
# Raw tab-separated line, followed by its two fields in reversed order.
print(text_data[3])
print(text_data[3].split('\t')[::-1])

Wow!	Ça alors !
['Ça alors\u202f!', 'Wow!']


In [226]:
def load_data(file_name, reverse=False):
    """Load a tab-separated parallel corpus from the ``data/`` directory.

    Args:
        file_name: name of the file inside ``data/``.
        reverse: when true, the fields of every line are returned in reversed
            order.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        that line's tab-separated fields after ``preprocess_string``.
    """
    print("Reading text file...")

    with open('data/%s' % (file_name), encoding='utf-8') as file:
        raw_lines = file.read().splitlines()

    pairs = []
    for raw_line in raw_lines:
        fields = [preprocess_string(field) for field in raw_line.split('\t')]
        pairs.append(fields[::-1] if reverse else fields)
    return pairs

In [212]:
# Load and preprocess every sentence pair; reverse=False keeps the file's column order.
pairs = load_data('eng-fra.txt', reverse=False)

Reading text file...


In [269]:
# Reserved vocabulary indexes for the special tokens.
SOS_token = 0
EOS_token = 1
UNK = 2
PAD = 3
# Every encoded sentence is padded or truncated to this many tokens.
MAX_SEQ_LEN = 12


class Language:
    """Word-level vocabulary plus sentence <-> index/tensor conversion.

    Indexes 0-3 are reserved for the special tokens SOS, EOS, UNK and PAD;
    real words are numbered from 4 upwards in the order they are first added.

    Args:
        lang_name: label stored on the instance.
        src: True for a source language (encoded sentences get a trailing
            EOS), False for a target language (encoded sentences are wrapped
            as SOS ... EOS).
    """

    def __init__(self, lang_name, src=True):
        self.lang_name = lang_name
        # Only real words live in word_to_index; the special tokens are
        # addressed through the module-level constants.
        self.word_to_index = {}
        self.index_to_word = {SOS_token: "SOS", EOS_token: "EOS", UNK: "UNK", PAD: "PAD"}
        self.word_to_count = {}
        self.vocab_size = len(self.index_to_word)
        self.src = src

    def add_word(self, word):
        """Register ``word`` (if new) and increment its occurrence count."""
        if word not in self.word_to_index:
            self.word_to_index[word] = self.vocab_size
            self.index_to_word[self.vocab_size] = word
            self.vocab_size += 1
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1

    def add_sentence(self, sentence):
        """Add every single-space-separated word of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.add_word(word)

    def sentence_to_indexes(self, sentence):
        """Map each word of ``sentence`` to its index; unknown words map to ``UNK``.

        The fallback must be the ``UNK`` constant: "UNK" is never stored in
        ``word_to_index``, so looking it up there raises ``KeyError`` for any
        out-of-vocabulary word.
        """
        return [self.word_to_index.get(word, UNK) for word in sentence.split(' ')]

    def indexes_to_sentence(self, indexes):
        """Join the words for ``indexes`` (special tokens included) with spaces."""
        return ' '.join([self.index_to_word[index] for index in indexes])

    def sentence_to_tensor(self, sentence):
        """Encode ``sentence`` as fixed-length index and mask tensors.

        Returns:
            ``(indexes, mask)``: two 1-D tensors of length ``MAX_SEQ_LEN``.
            ``indexes`` is ``torch.long`` and padded with ``PAD``; ``mask`` is
            ``torch.float32`` with 1 for real tokens (including SOS/EOS) and 0
            for padding.
        """
        indexes = self.sentence_to_indexes(sentence)
        if self.src:
            indexes = indexes + [EOS_token]
        else:
            # target language sentence
            indexes = [SOS_token] + indexes + [EOS_token]

        # Truncate first, then pad; an over-long sentence loses its tail
        # (including the EOS token) exactly as before.
        indexes = indexes[:MAX_SEQ_LEN]
        n_pad = MAX_SEQ_LEN - len(indexes)
        mask = [1] * len(indexes) + [0] * n_pad
        indexes = indexes + [PAD] * n_pad

        indexes = torch.tensor(indexes, dtype=torch.long)
        mask = torch.tensor(mask, dtype=torch.float32)
        return indexes, mask

    def tensor_to_sentence(self, idx_tensor):
        """Decode an index tensor back to text.

        For a tensor with more than one dimension only the first row is decoded.
        """
        if len(idx_tensor.shape) > 1:
            idxs = idx_tensor.tolist()[0]
        else:
            idxs = idx_tensor.tolist()
        return self.indexes_to_sentence(idxs)

In [227]:
# create language instances
src_lang = Language(src_language)
target_lang = Language(target_language)

for src, target in pairs:
    src_lang.add_sentence(src)
    target_lang.add_sentence(target) 

Since there are a lot of example sentences and we want to train something quickly, we’ll trim the data set to only relatively short and simple sentences. Here we keep only pairs in which both sentences have fewer than 10 tokens (ending punctuation counts as a token) and whose English sentence starts with a form such as “I am” or “He is” (accounting for apostrophes replaced earlier).

In [270]:
# A sentence is kept only when it has fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 10

# Accepted English sentence openings (apostrophes are already replaced by spaces).
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)


def filterPair(p, reverse=False):
    """Decide whether the sentence pair ``p`` is kept.

    Both sentences must have fewer than ``MAX_LENGTH`` tokens and the English
    sentence must start with one of ``eng_prefixes``. The English sentence is
    ``p[0]``, or ``p[1]`` when ``reverse`` is true.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    english_side = p[1 if reverse else 0]
    return english_side.startswith(eng_prefixes)


def filterPairs(pairs, reverse):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate, reverse):
            kept.append(candidate)
    return kept

The full process for preparing the data is:

Read text file and split into lines, split lines into pairs

Normalize text, filter by length and content

Make word lists from sentences in pairs

In [271]:
def create_dataset(src_lang, target_lang, reverse=False):
    """Load, filter and split the corpus, then build both vocabularies.

    Args:
        src_lang: name given to the source ``Language``.
        target_lang: name given to the target ``Language`` (built with
            ``src=False``).
        reverse: forwarded to ``load_data`` and ``filterPairs``.

    Returns:
        ``(input_lang, output_lang, pair_split)`` where ``pair_split`` maps
        ``'train'``, ``'val'`` and ``'test'`` to consecutive slices
        (80% / 10% / remainder) of the filtered pairs. The vocabularies are
        built from the training slice only.
    """
    all_pairs = load_data('eng-fra.txt', reverse)

    input_lang = Language(src_lang)
    output_lang = Language(target_lang, src=False)

    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs, reverse)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    # Split sizes; the test split receives whatever the integer rounding leaves over.
    n_total = len(kept_pairs)
    n_train = int(0.8*n_total)
    n_val = int(0.1*n_total)
    n_test = n_total - n_train - n_val
    print(f"{n_train=}, {n_val=}, {n_test=}")

    val_end = n_train + n_val
    pair_split = {
        'train': kept_pairs[:n_train],
        'val': kept_pairs[n_train:val_end],
        'test': kept_pairs[val_end:],
    }

    print("Counting words...")
    print("Creating source and target language vocab using pair_split['train']...")
    for src_sentence, target_sentence in pair_split['train']:
        input_lang.add_sentence(src_sentence)
        output_lang.add_sentence(target_sentence)

    print("Counted words:")
    for language in (input_lang, output_lang):
        print(language.lang_name, language.vocab_size)
    return input_lang, output_lang, pair_split

In [272]:
# Build the vocabularies and the train/val/test split, then show one random training pair.
src_lang, target_lang, pair_split = create_dataset('eng', 'fra', reverse=False)
print(random.choice(pair_split["train"]))

Reading text file...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
n_train=8479, n_val=1059, n_test=1061
Counting words...
Creating source and target language vocab using pair_split['train']...
Counted words:
eng 2184
fra 3526
['he s already married .', 'il est deja marie .']


#### Tokenization: Words to indexes
A seq2seq task takes an input sequence (source sequence) and outputs another sequence (target sequence). In our case, the input is a sentence in English and the output is the corresponding translated sentence in French. These sentences need to be converted into numbers (integers) before they can be fed into a neural network. For this, we will use **word-level tokenization**, i.e. a *word to integer* index mapping.

We need some special tokens to indicate start (SOS) and end (EOS) of a sentence. For the input sequence (source seq), the model needs to know when the input has ended and for the target sequence, the model needs to know when to start and when to end.

So, we will append the EOS token to the end of the input sentence and wrap the target sentence with the SOS token (at the beginning) and the EOS token (at the end).

In [273]:

def sent_pair_to_tensor_pair(pair, src_lang, target_lang):
    """Encode a (source, target) sentence pair.

    ``pair[0]`` is encoded with ``src_lang.sentence_to_tensor`` and ``pair[1]``
    with ``target_lang.sentence_to_tensor``.

    Returns:
        ``((src_indexes, src_mask), (target_indexes, target_mask))``.
    """
    src_indexes, src_mask = src_lang.sentence_to_tensor(pair[0])
    target_indexes, target_mask = target_lang.sentence_to_tensor(pair[1])
    encoded_src = (src_indexes, src_mask)
    encoded_target = (target_indexes, target_mask)
    return encoded_src, encoded_target

def tensor_pair_to_sent_pair(pair, src_lang, target_lang):
    """Decode a (source, target) pair of index tensors back to sentences.

    ``pair[0]`` is decoded with ``src_lang.tensor_to_sentence`` and ``pair[1]``
    with ``target_lang.tensor_to_sentence``.

    Returns:
        ``(source_sentence, target_sentence)``.
    """
    src_tensor, target_tensor = pair[0], pair[1]
    return (
        src_lang.tensor_to_sentence(src_tensor),
        target_lang.tensor_to_sentence(target_tensor),
    )

In [277]:
# Round-trip check: encode one random training pair to tensors, then decode it back.
pair = random.choices(pair_split["train"], k=1)[0]
print(pair)
# NOTE(review): the same encoding is printed twice.
print(sent_pair_to_tensor_pair(pair, src_lang, target_lang))
print(sent_pair_to_tensor_pair(pair, src_lang, target_lang))

# Decode only the index tensors (element 0 of each (indexes, mask) tuple).
p1, p2 = sent_pair_to_tensor_pair(pair, src_lang, target_lang)
print(tensor_pair_to_sent_pair((p1[0],p2[0]), src_lang, target_lang))

['i m innocent .', 'je suis ingenu .']
((tensor([  4,   5, 269,   6,   1,   3,   3,   3,   3,   3,   3,   3]), tensor([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.])), (tensor([  0,   8,  13, 463,   7,   1,   3,   3,   3,   3,   3,   3]), tensor([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.])))
((tensor([  4,   5, 269,   6,   1,   3,   3,   3,   3,   3,   3,   3]), tensor([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.])), (tensor([  0,   8,  13, 463,   7,   1,   3,   3,   3,   3,   3,   3]), tensor([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.])))
('i m innocent . EOS PAD PAD PAD PAD PAD PAD PAD', 'SOS je suis ingenu . EOS PAD PAD PAD PAD PAD PAD')


In [303]:

def build_batch(split, batch_size=4):
    """Sample a random mini-batch of encoded sentence pairs.

    Reads the notebook-level ``pair_split``, ``src_lang``, ``target_lang`` and
    ``device``.

    Args:
        split: ``"train"`` or ``"val"``; any other value selects the test split.
        batch_size: number of pairs drawn (uniformly, with replacement).

    Returns:
        A dict of tensors on ``device``:
            ``enc_inp``      - source indexes, trimmed to the longest source
                               sentence in the batch (counting EOS).
            ``enc_inp_mask`` - matching mask, 1 for real tokens and 0 for padding.
            ``dec_inp``      - target indexes without the last column.
            ``dec_target``   - target indexes without the first column, i.e.
                               ``dec_inp`` shifted left by one position.
        No target-side mask is returned.
    """
    if split == "train":
        pairs = pair_split["train"]
    elif split == "val":
        pairs = pair_split["val"]
    else:
        pairs = pair_split["test"]

    # Draw batch_size sentence pairs uniformly at random, with replacement.
    batch_pairs = random.choices(pairs, k=batch_size)

    src_batch = []
    src_mask = []
    target_batch = []

    max_src_seq_len = 0
    max_target_seq_len = 0

    for pair in batch_pairs:
        # +1 accounts for EOS on the source side, +2 for SOS and EOS on the target side.
        max_src_seq_len = max(max_src_seq_len, len(pair[0].split())+1)
        max_target_seq_len = max(max_target_seq_len, len(pair[1].split())+2)

        input_tensors, target_tensors = sent_pair_to_tensor_pair(pair, src_lang, target_lang)
        src_batch.append(input_tensors[0])
        src_mask.append(input_tensors[1])
        # The target mask (target_tensors[1]) is not part of the returned
        # batch, so it is not collected.
        target_batch.append(target_tensors[0])

    # Every encoded sentence has the same fixed length; drop the columns that
    # lie beyond the longest sentence of this batch.
    enc_inp = torch.stack(src_batch)[:, :max_src_seq_len].to(device)
    enc_inp_mask = torch.stack(src_mask)[:, :max_src_seq_len].to(device)

    target_batch = torch.stack(target_batch)[:, :max_target_seq_len].to(device)

    # Teacher forcing: the decoder reads tokens [0, T-1) and predicts tokens [1, T).
    dec_inp = target_batch[:,:-1]
    dec_target = target_batch[:,1:]

    batch = {"enc_inp":enc_inp,
            "enc_inp_mask":enc_inp_mask,
            "dec_inp":dec_inp,
            "dec_target":dec_target}
    return batch

In [304]:
# Display one sample training batch of four sentence pairs.
build_batch("train", batch_size=4)

{'enc_inp': tensor([[  16,   17,   44,  462,  599,    6,    1],
         [   4,   18,  753,    6,    1,    3,    3],
         [ 131,   80,  217, 1130,    6,    1,    3],
         [   4,    5,  527,  352,  471,    6,    1]]),
 'enc_inp_mask': tensor([[1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1., 1., 1.]]),
 'dec_inp': tensor([[   0,   26,   27,   68,  637, 1111,    7],
         [   0,    8,   13,  331,    7,    1,    3],
         [   0,  212, 2243,  115,    7,    1,    3],
         [   0,    8,   13,  964,  645, 1200,    7]]),
 'dec_target': tensor([[  26,   27,   68,  637, 1111,    7,    1],
         [   8,   13,  331,    7,    1,    3,    3],
         [ 212, 2243,  115,    7,    1,    3,    3],
         [   8,   13,  964,  645, 1200,    7,    1]])}

#### Seq2Seq English-French Machine Translation Transformer Model