In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import ast

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
import numpy as np

use_cuda = torch.cuda.is_available()

In [9]:
class Words:
    def __init__(self):
        self.SOS_token = 0
        self.EOS_token = 1
        self.word2index = {}
        self.index2word = {self.SOS_token: "SOS", self.EOS_token: "EOS"}
        self.word2count = {}
        self.n_words = 2
        
    def __addArray(self, array):
        for word in array:
            self.__addWord(word)
            
    def addArrayOfTuples(self, array_of_tuples):
        for pair in array_of_tuples:
            self.__addArray(pair[0])
            self.__addArray(pair[1])
    
    def __addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [10]:
words = Words()
words.addArrayOfTuples(clean_tuples)

In [11]:
class EncoderRNN(nn.Module):
    """
    Simple encoder network that embeds the character and then feeds through a GRU
    """
    def __init__(self, input_size, hidden_size, batch_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden
    
    def initHidden(self):
        result = torch.zeros(1, self.batch_size, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result
        
class AttnDecoderRNN(nn.Module):
    """
    Attn Decoder
    1. Need max length because learning which input words to attend to
    And thus need to know the maximum number of words could attend to
    2. The attn_weights tell us how much to weight each input word - in this case French,
       In order to predict the english word.
    """
    def __init__(self, input_size, hidden_size, batch_size,
                 dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(hidden_size, hidden_size)
        # note input and output same size
        self.linear = nn.Linear(hidden_size, input_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.attn_layer = nn.Linear(2 * self.hidden_size, MAX_LENGTH)
        self.out_layer = nn.Linear(self.hidden_size, input_size)
        self.attn_combined_layer = nn.Linear(2 * self.hidden_size, self.hidden_size)
    
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        embedded = self.dropout(embedded)
        attn = self.attn_layer(torch.cat((embedded[0], hidden[0]),dim=1))
        attn_weights = self.softmax(attn)
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs) #shape: bx1xh
        attn_combined = torch.cat((embedded[0], attn_applied[:,0,:]), 1)
        attn_combined = self.relu(self.attn_combined_layer(attn_combined).unsqueeze(0))
        output, hidden = self.gru(attn_combined, hidden)
        output = self.softmax(self.out_layer(output[0]))
        return output, hidden, attn_weights
    
    def initHidden(self):
        result = torch.zeros(1, self.batch_size, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result        

In [12]:
teacher_forcing_ratio = 0.5

def train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
         decoder_optimizer, criterion, batch_size, SOS_token=words.SOS_token, max_length=MAX_LENGTH):
    
    encoder_hidden = encoder.initHidden()
    loss = 0
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_variable.size()[1]
    target_length = target_variable.size()[1]
    
    encoder_outputs = torch.zeros((batch_size, MAX_LENGTH, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else decoder_input
    
    # Here we are feeding in the english words to get the final hidden state 
    # for the decoder
    for i in range(input_length):
        encoder_ouput, encoder_hidden = encoder.forward(input_variable[:,i,:], encoder_hidden)
        encoder_outputs[:,i,:] = encoder_ouput[0]
        
    decoder_hidden = encoder_hidden
    decoder_input = torch.LongTensor([[SOS_token]]*batch_size)
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    # Here we take the final hidden state from the encoder
    # And feed it to decoder
    # We also give decoder the word to predict the next word starting with SOS token
    # If use teacher forcing then give it the truth, otherwise give it prediction
    if use_teacher_forcing:
        for i in range(target_length):
            decoder_output, decoder_hidden, attn_weights = decoder.forward(decoder_input, 
                                                                           decoder_hidden,
                                                                          encoder_outputs)

            loss += criterion(decoder_output, target_variable[:,i,0])
            decoder_input = target_variable[:,i,:]
            
    else:
        for i in range(target_length):
            decoder_output, decoder_hidden, attn_weights = decoder.forward(decoder_input, 
                                                                           decoder_hidden,
                                                                           encoder_outputs)
            loss += criterion(decoder_output, target_variable[:,i,0])
            topv, topi = decoder_output.data.topk(1)
            decoder_input = topi
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

                
    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_length

In [13]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [15]:
def trainIters(encoder, decoder, data_loader, epochs, batch_size, print_every=1000,
               plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()
    iter = 1
    n_iters = len(data_loader) * epochs
    for epoch in range(1, epochs + 1):
        for i_batch, sample_batched in enumerate(data_loader):
            
            
            loss = train(sample_batched[0], sample_batched[1], encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion,
                        batch_size)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))
            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
            iter = iter + 1
            
    showPlot(plot_losses)

In [None]:
class CustomDataset(Dataset):
    def __init__(self, data, words, max_length):
        self.data = data
        self.words = words
        self.max_length = max_length

    def __getitem__(self, index):
        row = self.data[index]
        training_pairs = self.tensorFromPair(row)
        return (training_pairs[0], training_pairs[1])
    
    def indexesFromSentence(self, sentence):
        return [self.words.word2index[word] for word in sentence]

    def tensorFromSentence(self, sentence):
        indexes = self.indexesFromSentence(sentence)
        indexes.append(self.words.EOS_token)
        # make it 1 column with number of rows equal to words in sentence
        result = torch.LongTensor(indexes).view(-1, 1)
        pad_amount = self.max_length - result.size(0)
        if pad_amount > 0:
            result = F.pad(result, (0,0,0,pad_amount), value=self.words.EOS_token).data
        result = result.cuda() if use_cuda else result
        return result

    def tensorFromPair(self, pair):
        input_variable = self.tensorFromSentence(pair[0])
        output_variable = self.tensorFromSentence(pair[1])
        return (input_variable, output_variable)
        
    def __len__(self):
        return len(self.data)

batch_size = 256
training_dataset = CustomDataset(clean_tuples, words, MAX_LENGTH) # add 1 for EOS
training_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True,
                                drop_last=True)

In [None]:
hidden_size = 256
encoder = EncoderRNN(words.n_words, hidden_size, batch_size)
decoder = AttnDecoderRNN(words.n_words, hidden_size, batch_size)

if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
trainIters(encoder, decoder, training_dataloader, 20, batch_size, print_every=50,
          plot_every=5)



0m 30s (- 170m 13s) (50 0%) 13.8939
1m 0s (- 169m 27s) (100 0%) 15.5410
1m 29s (- 168m 10s) (150 0%) 13.3282
1m 59s (- 167m 59s) (200 1%) 12.5567
2m 29s (- 167m 38s) (250 1%) 13.8714
2m 59s (- 167m 3s) (300 1%) 13.2945
3m 29s (- 166m 28s) (350 2%) 13.5684
3m 59s (- 165m 54s) (400 2%) 13.2567
4m 29s (- 165m 26s) (450 2%) 13.4829
4m 59s (- 164m 58s) (500 2%) 12.1789
5m 28s (- 164m 21s) (550 3%) 12.4293
5m 58s (- 163m 50s) (600 3%) 13.0382
6m 29s (- 163m 30s) (650 3%) 11.9227
6m 59s (- 163m 0s) (700 4%) 12.3920
7m 29s (- 162m 38s) (750 4%) 12.1851
7m 58s (- 162m 2s) (800 4%) 12.1708
8m 28s (- 161m 32s) (850 4%) 11.2850
8m 58s (- 161m 4s) (900 5%) 10.7802
9m 28s (- 160m 33s) (950 5%) 11.7264
9m 58s (- 159m 59s) (1000 5%) 11.6277
10m 28s (- 159m 25s) (1050 6%) 11.6710
10m 58s (- 158m 57s) (1100 6%) 12.7896
11m 28s (- 158m 28s) (1150 6%) 11.8657
11m 57s (- 157m 57s) (1200 7%) 12.1697
12m 27s (- 157m 27s) (1250 7%) 11.3007
12m 57s (- 156m 55s) (1300 7%) 12.2772
13m 27s (- 156m 29s) (1350 7%) 

In [None]:
torch.save(encoder.state_dict(), "../models/attn_encoder.state")
torch.save(decoder.state_dict(), "../models/attn_decoder.state")
print("Model Saved")

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    
    encoder_hidden = encoder.initHidden()
    input_variable = variableFromSentence(input_lang, sentence)
    input_length = input_variable.size()[0]

    for i in range(input_length):
        encoder_ouput, encoder_hidden = encoder.forward(input_variable[i], encoder_hidden)
        
    decoder_hidden = encoder_hidden
    decoder_input = torch.LongTensor([[SOS_token]])
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoded_words = []
    for i in range(MAX_LENGTH):
        decoder_output, decoder_hidden = decoder.forward(decoder_input, decoder_hidden)
        topv, topi = decoder_output.data.topk(1)
        top_predicted = topi[0][0]

        decoder_input = torch.LongTensor([[top_predicted]])
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        if top_predicted == EOS_token:
            decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(output_lang.index2word[top_predicted])
    return decoded_words

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print(">", pair[0])
        print("=", pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        print("<", " ".join(output_words))
        print('')

In [None]:
evaluateRandomly(encoder, decoder)

In [None]:
evaluate(encoder, decoder, "i m going to teach .")