### English-French Neural Machine Translation

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the CUDA device when torch reports one is available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression: the notebook displays the selected device below the cell.
device

device(type='cpu')

In [41]:
# Indices reserved for the "SOS" and "EOS" marker tokens. They match the two
# entries pre-seeded in Language.index_to_word, so added words start at index 2.
SOS_token = 0
EOS_token = 1


class Language:
    """Vocabulary of a single language.

    Keeps a word -> index map, the inverse index -> word map, and a per-word
    occurrence counter. Indices 0 and 1 are pre-assigned to the "SOS" and
    "EOS" markers, so the first word added receives index 2.
    """

    def __init__(self, lang_name):
        """Create an empty vocabulary labelled ``lang_name``."""
        self.lang_name = lang_name
        self.word_to_index = {}
        self.index_to_word = {0: "SOS", 1: "EOS"}
        self.word_to_count = {}
        # Number of known tokens, counting the two markers seeded above.
        self.vocab_size = len(self.index_to_word)

    def add_sentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.add_word(token)

    def add_word(self, word):
        """Count one occurrence of ``word``; give it the next free index if unseen."""
        is_new = word not in self.word_to_index
        if is_new:
            next_index = self.vocab_size
            self.word_to_index[word] = next_index
            self.index_to_word[next_index] = word
            self.vocab_size = next_index + 1

        # The occurrence counter is updated for new and known words alike.
        if word in self.word_to_count:
            self.word_to_count[word] += 1
        else:
            self.word_to_count[word] = 1

In [4]:
# Language codes; together they form the corpus file name 'data/<src>-<target>.txt'.
src_lang = 'eng'
target_lang = 'fra'

In [10]:
# Read the whole corpus file (UTF-8) and split it into a list of lines.
# Judging by the preview printed below, each line holds two sentences separated by a tab.
with open('data/%s-%s.txt' % (src_lang, target_lang), encoding='utf-8') as file:
    text_data = file.read().splitlines()
# Preview the first five raw lines.
print(text_data[:5])

['Go.\tVa !', 'Run!\tCours\u202f!', 'Run!\tCourez\u202f!', 'Wow!\tÇa alors\u202f!', 'Fire!\tAu feu !']


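The raw text contains non-ASCII Unicode characters (for example accented letters), so we apply the following preprocessing steps:
1. Convert Unicode characters to plain ASCII by stripping accents
2. Lowercase the text
3. Remove most punctuation, keeping only `.`, `!` and `?`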

In [36]:
def unicode_to_ascii(sent):
    """Strip accents from ``sent`` to turn a Unicode string into plain ASCII.

    The string is decomposed with NFD normalisation and every combining mark
    (Unicode category 'Mn') is dropped; all other characters are kept as they
    are. Thanks to https://stackoverflow.com/a/518232/2809427
    """
    decomposed = unicodedata.normalize('NFD', sent)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_string(sent):
    """Normalise a sentence for tokenisation.

    Lowercases and trims the input, strips accents via ``unicode_to_ascii``,
    puts a space in front of each '.', '!' or '?', and collapses every run of
    other non-letter characters into a single space.
    """
    cleaned = unicode_to_ascii(sent.lower().strip())
    # In the replacement, \1 stands for the punctuation mark captured by the
    # group, so each '.', '!' or '?' is re-emitted with a space in front of it.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Any run of characters outside a-z, A-Z, '.', '!' and '?' becomes one space.
    cleaned = re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)
    return cleaned.strip()

In [37]:
# NOTE(review): the return value of this call is discarded. It is not the cell's
# last expression, so nothing is displayed for it; wrap it in print() to show it.
preprocess_string(text_data[3])
# The raw line exactly as read from the file.
print(text_data[3])
# The same line split on the tab, with the resulting fields in reversed order.
print(text_data[3].split('\t')[::-1])

Wow!	Ça alors !
['Ça alors\u202f!', 'Wow!']


In [39]:
def read_data(src_lang, target_lang, reverse=False):
    """Load the corpus file and return two empty vocabularies plus the sentence pairs.

    Reads 'data/<src_lang>-<target_lang>.txt', splits every line on tabs and
    runs ``preprocess_string`` on each field.

    Args:
        src_lang: code of the language in the first column of the file.
        target_lang: code of the language in the second column of the file.
        reverse: when True, each pair is flipped and the two Language objects
            are swapped, so ``target_lang`` becomes the input language.

    Returns:
        ``(input_lang, output_lang, pairs)``. The Language objects are freshly
        created here and no words are added to them by this function.
    """
    print("Reading text file...")

    corpus_path = 'data/%s-%s.txt' % (src_lang, target_lang)
    with open(corpus_path, encoding='utf-8') as file:
        lines = file.read().splitlines()

    # One list of preprocessed fields per line of the file.
    pairs = []
    for line in lines:
        fields = line.split('\t')
        pairs.append([preprocess_string(field) for field in fields])

    if not reverse:
        return Language(src_lang), Language(target_lang), pairs

    # Reversed direction: flip every pair and swap the language roles.
    flipped = [pair[::-1] for pair in pairs]
    return Language(target_lang), Language(src_lang), flipped

In [44]:
# reverse=True flips each pair and swaps the Language objects, so input_lang is
# built from target_lang ('fra') and output_lang from src_lang ('eng').
input_lang, output_lang, pairs = read_data(src_lang, target_lang, reverse=True)

Reading text file...


In [48]:
# Peek at the first two preprocessed pairs and the two Language objects.
# Language defines no __repr__, so the objects are shown with the default repr.
# read_data never calls add_sentence, so both vocabularies are still empty here.
pairs[:2], input_lang, output_lang

([['va !', 'go .'], ['cours !', 'run !']],
 <__main__.Language at 0x7f83334b7b50>,
 <__main__.Language at 0x7f83334bdca0>)