In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
%matplotlib inline

use_cuda = torch.cuda.is_available()

In [3]:
import unicodedata
import re
SOS_token = 0
EOS_token = 1

class Lang(object):
    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.word2count = {}
        self.idx2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2
        
    def add_sent(self, sent):
        for word in sent.split():
            self.add_word(word)
            
    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.n_words
            self.word2count[word] = 1
            self.idx2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def unicode2ascii(s):
    return ''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])

def norm_string(s):
    s = unicode2ascii(s.lower().strip(' '))
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

with open('data/eng-fra.txt', encoding='utf-8') as f:
    pairs = [[norm_string(s) for s in line.split('\t')] for line in f]

len(pairs), pairs[:5]

(135842,
 [['go .', 'va ! '],
  ['run !', 'cours ! '],
  ['run !', 'courez ! '],
  ['wow !', 'ca alors ! '],
  ['fire !', 'au feu ! ']])

In [4]:
MAX_LEN = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

pairs = [pair for pair in pairs if len(pair[0].split(' ')) <= MAX_LEN and \
                                   len(pair[1].split(' ')) <= MAX_LEN and \
                                   pair[0].startswith(eng_prefixes)]
print(len(pairs), pairs[:5])

in_lang = Lang('eng')
out_lang = Lang('fra')
for pair in pairs:
    in_lang.add_sent(pair[0])
    out_lang.add_sent(pair[1])
print(in_lang.name, in_lang.n_words)
print(out_lang.name, out_lang.n_words)

11111 [['i m .', 'j ai ans . '], ['i m ok .', 'je vais bien . '], ['i m ok .', 'ca va . '], ['i m fat .', 'je suis gras . '], ['i m fat .', 'je suis gros . ']]
eng 2998
fra 4582


### GRU: gated recurrent unit

Math formulas: 
$$
\begin{array}{ll}
r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\
\end{array}
$$

In [6]:
class EncoderRNN(nn.Module):
    def __init__(self, in_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.emb = nn.Embedding(in_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, ins, hidden):
        emb_ins = self.emb(ins)
        outs = emb_ins.view(1, 1, -1)
        for i in range(self.n_layers):
            outs, hidden = self.gru(outs, hidden)
        return outs, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return hidden.cuda()
        else:
            return hidden
        
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, out_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.emb = nn.Embedding(out_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.outs = nn.Linear(hidden_size, out_size)
        
    def forward(self, ins, hidden):
        emb_outs = self.emb(ins)
        

In [7]:
nn.GRU?