TODO
- Import
- Read data (vocab, sentences)
- Build model using Luong attention
- Train model
- Evaluating model
- Compute BLEU score

In [1]:
use_cuda = True
batch_size = 1
learning_rate = 0.001
MAX_LENGTH = 50

# Import library

In [2]:
from __future__ import print_function

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

import time
import math
import random
import unicodedata
import string
import re

import scripts.text
import utils

# Load data

In [3]:
data_path = './processed-data/id.1000/'
en_vocab_path = data_path + 'train.10k.en.vocab'
de_vocab_path = data_path + 'train.10k.de.vocab'

In [4]:
en_words, en_vocab, _ = scripts.text.load_vocab(en_vocab_path)
de_words, de_vocab, _ = scripts.text.load_vocab(de_vocab_path)

# Loading vocab file ./processed-data/id.1000/train.10k.en.vocab ...
  num words = 1000
# Loading vocab file ./processed-data/id.1000/train.10k.de.vocab ...
  num words = 1000


In [5]:
# Read train data
en_train_sentences = []
with open(data_path + 'train.10k.en', 'r') as f:
    for line in f:
        en_train_sentences.append(map(lambda x: int(x), line.split()))
        
de_train_sentences = []
with open(data_path + 'train.10k.de', 'r') as f:
    for line in f:
        de_train_sentences.append(map(lambda x: int(x), line.split()))

In [6]:
# Read validation data
en_valid_sentences = []
with open(data_path + 'valid.100.en', 'r') as f:
    for line in f:
        en_valid_sentences.append(map(lambda x: int(x), line.split()))
        
de_valid_sentences = []
with open(data_path + 'valid.100.de', 'r') as f:
    for line in f:
        de_valid_sentences.append(map(lambda x: int(x), line.split()))

# Build model

## Using RNNs + Attention

In [7]:
class EncoderRNN(nn.Module):
    """
        Model's encoder using RNN.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_size = embedding_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers)

    def forward(self, input_sentence, hidden):
        sentence_len = len(input_sentence)
        
        embedded = self.embedding(input_sentence)
        embedded = embedded.view(sentence_len, batch_size, -1)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))
        if use_cuda:
            hidden = hidden.cuda()
        return hidden

In [None]:
class DecoderRNN(nn.Module):
    """
        Model's decoder using RNN.
    """

    def __init__(self, embedding_size, hidden_size, output_size, num_layers=1):
        super(DecoderRNN, self).__init__()

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_vector, hidden):
        output = self.embedding(input_vector).view(1, batch_size, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.log_softmax(self.out(output[0]))
        return output, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))
        if use_cuda:
            hidden = hidden.cuda()
        return hidden

In [8]:
class Attention(nn.Module):
    """
        Attention class.
    """
    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attention, self).__init__()

        self.method = method
#         self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attention = nn.Linear(self.hidden_size, hidden_size)
            
        elif self.method == 'concat':
            self.attention = nn.Linear(self.hidden_size * 2, hidden_size)
            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size))
            
    def forward(self, hidden, encoder_outputs):
        sequence_len = len(encoder_outputs)
        
        # Create variable to store attention weights
        attention_energies = Variable(torch.zeros(sequence_len)) # B x 1 x S
        if use_cuda:
            attention_energies = attention_energies.cuda()
        
        # Calculate energies for each encoder output
        for i in range(sequence_len):
            attention_energies[i] = self.score(hidden, encoder_outputs[i])
        
        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x sequence length
        return F.softmax(attention_energies).view(1, 1, sequence_len)
    
    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            energy = hidden.dot(energy)
            return energy
        elif self.method == 'general':
            energy = self.attention(encoder_output)
            energy = hidden.dot(energy)
            return energy
        elif self.method == 'concat':
            energy = self.attention(torch.cat((hidden, encoder_output), 1))
            energy = self.other.dot(energy)
            return energy

In [9]:
class AttentionDecoderRNN(nn.Module):
    """
        Decoder using Attention mechanism.
    """
    def __init__(self, attention_model, hidden_size, output_size, num_layers=1,
                 dropout_p=0.1):
        super(AttentionDecoderRNN, self).__init__()
        
        # Keep parameters for reference
        self.attention_model = attention_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, num_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size * 2, output_size)
        
        # Choose attention model
        if attention_model != 'none':
            self.attention = Attention(attention_model, hidden_size)
        
    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        
        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
        # Combine embedded input word and last context, run through RNN
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
        rnn_output, hidden = self.gru(rnn_input, last_hidden)
        
        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
        attention_weights = self.attention(rnn_output.squeeze(0), encoder_outputs)
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
        
        # Final output layer (next word prediction) using the RNN hidden state and context vector
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1) # B x S=1 x N -> B x N
        output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, context, hidden, attention_weights

# Training model

## Using RNN + Attention

### Checking the model

In [10]:
encoder_test = EncoderRNN(10, 10, 10, 1)
decoder_test = AttentionDecoderRNN('general', 10, 10, 1)
print(encoder_test)
print(decoder_test)

EncoderRNN(
  (embedding): Embedding(10, 10)
  (rnn): GRU(10, 10)
)
AttentionDecoderRNN(
  (embedding): Embedding(10, 10)
  (gru): GRU(20, 10, dropout=0.1)
  (out): Linear(in_features=20, out_features=10, bias=True)
  (attention): Attention(
    (attention): Linear(in_features=10, out_features=10, bias=True)
  )
)


In [11]:

encoder_hidden = encoder_test.init_hidden()
word_input = Variable(torch.LongTensor([1, 2, 3]))
if use_cuda:
    encoder_test.cuda()
    word_input = word_input.cuda()
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

word_inputs = Variable(torch.LongTensor([1, 2, 3]))
decoder_attns = torch.zeros(1, 3, 3).cuda()
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))

if use_cuda:
    decoder_test.cuda()
    word_inputs = word_inputs.cuda()
    decoder_context = decoder_context.cuda()

for i in range(3):
    decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[i], decoder_context, decoder_hidden, encoder_outputs)
    print(decoder_output.size(), decoder_hidden.size(), decoder_attn.size())
    decoder_attns[0, i] = decoder_attn.squeeze(0).data

torch.Size([1, 10]) torch.Size([1, 1, 10]) torch.Size([1, 1, 3])
torch.Size([1, 10]) torch.Size([1, 1, 10]) torch.Size([1, 1, 3])
torch.Size([1, 10]) torch.Size([1, 1, 10]) torch.Size([1, 1, 3])




### Define training

In [10]:
teacher_forcing_ratio = 0.5
clip = 5.0
MAX_LENGTH = 50

In [11]:
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    # Zero gradient
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Prepare input for decoder and output variables
    decoder_input = Variable(torch.LongTensor([[de_vocab['<s>']]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    decoder_hidden = encoder_hidden  # Use last hidden from the encoder

    if use_cuda:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        # Teacher forcing: use the ground-truth target as the next input
        for d_i in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[d_i])
            decoder_input = target_variable[d_i]
    else:
        # Without teacher forcing use its own predictions as the next input
        for d_i in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_context, decoder_hidden, encoder_outputs)
#             print(decoder_output)
            loss += criterion(decoder_output, target_variable[d_i])
            # Pick most likely word index (highest value) from output (greedy search)
            top_value, top_index = decoder_output.data.topk(1)
            n_i = top_index[0][0]
#             print(n_i)
#             print(torch.LongTensor([n_i]))
            decoder_input = Variable(torch.LongTensor([[n_i]])) # Chosen word is next input
            
            if use_cuda:
                decoder_input = decoder_input.cuda()

            # Stop at end of sentence (not necessary when using known targers)
            if n_i == en_vocab['</s>']:
                break
    # Backpropagation
    loss.backward()
    nn.utils.clip_grad_norm(encoder.parameters(), clip)
    nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_length

### Run training

In [12]:
embedding_size = 500
hidden_size = 500
num_layers = 1
dropout_p = 0.00

# Initialize models
encoder = EncoderRNN(len(en_vocab), embedding_size, hidden_size, num_layers)
decoder = AttentionDecoderRNN('dot', hidden_size, len(de_vocab), num_layers)

# Move models to GPU
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
# Initialize parameters and criterion
# learning_rate = 0.0001
# encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate, momentum=0.9)
# decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate, momentum=0.9)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [13]:
# Configuring training
num_epochs = 1
plot_every = 100
print_every = 100

# Keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0 # Reset every print every
plot_loss_total = 0 # Reset every plot every

In [14]:
# Convert all sentences to Variable
if use_cuda:
    for i in range(len(en_train_sentences)):
        en_train_sentences[i] = Variable(torch.LongTensor(en_train_sentences[i]).view(-1, 1)).cuda()
        de_train_sentences[i] = Variable(torch.LongTensor(de_train_sentences[i]).view(-1, 1)).cuda()
else:
    for i in range(len(en_train_sentences)):
        en_train_sentences[i] = Variable(torch.LongTensor(en_train_sentences[i]).view(-1, 1))
        de_train_sentences[i] = Variable(torch.LongTensor(de_train_sentences[i]).view(-1, 1))

if use_cuda:
    for i in range(len(en_valid_sentences)):
        en_valid_sentences[i] = Variable(torch.LongTensor(en_valid_sentences[i]).view(-1, 1)).cuda()
        de_valid_sentences[i] = Variable(torch.LongTensor(de_valid_sentences[i]).view(-1, 1)).cuda()
else:
    for i in range(len(en_valid_sentences)):
        en_valid_sentences[i] = Variable(torch.LongTensor(en_valid_sentences[i]).view(-1, 1))
        de_valid_sentences[i] = Variable(torch.LongTensor(de_valid_sentences[i]).view(-1, 1))

In [15]:
start = time.time()
for epoch in range(0, num_epochs):
    #start epoch
    # Shuffle
    indexes = np.arange(0, len(en_train_sentences))
    np.random.shuffle(indexes)
    step = 1
    num_steps = math.ceil(len(en_train_sentences) / batch_size)
    for index in indexes:
        input_variable = en_train_sentences[index]
        target_variable = de_train_sentences[index]
        loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
                     decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if step == 0:
            step += 1
            continue
        
        if step % print_every == 0 or step == num_steps:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = 'Epoch %s/%s, Time: %s, Step: %d/%d, train_loss: %.4f' % (epoch, num_epochs,
                                                                utils.time_since(start, step / num_steps),
                                                                step,
                                                                num_steps, print_loss_avg)
            print(print_summary)
        
        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total
        step += 1
    # end epoch
    # evaluate on validation set
    valid_total_loss = 0
    for i in range(len(en_valid_sentences)):
        input_variable = en_valid_sentences[i]
        output_varible = de_valid_sentences[i]
        valid_loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
                     decoder_optimizer, criterion)
        valid_total_loss += valid_loss
    print('Validation loss: %.4f' % (valid_total_loss / len(en_valid_sentences)))
        

RuntimeError: copy from NoneType to torch.cuda.FloatTensor isn't implemented

### Evaluating the model

In [None]:
def evaluate(sentence, max_length=MAX_LENGTH):
    input_variable = Variable(torch.LongTensor(scripts.text.to_id(sentence.split(), en_vocab)))
    if use_cuda:
        input_variable = input_variable.cuda()
    
    input_length = len(input_variable)
    
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    
    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([[de_vocab['<s>']]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    decoder_hidden = encoder_hidden
    
    if use_cuda:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()
    
    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)
    # Run through decoder
    for d_i in range(max_length):
        decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input,
                    decoder_context, decoder_hidden, encoder_outputs)
        decoder_attentions[d_i, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data
        # Pick most likely word index (highest value) from output (greedy search)
        top_value, top_index = decoder_output.data.topk(1)
        n_i = top_index[0][0]

        decoded_words += scripts.text.to_text([n_i], de_words)

        # Stop at end of sentence (not necessary when using known targers)
        if n_i == de_vocab['</s>']:
            break

        decoder_input = Variable(torch.LongTensor([[n_i]])) # Chosen word is next input

        if use_cuda:
            decoder_input = decoder_input.cuda()

            
    return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

In [None]:
def evaluate_sentence(s):
    valid_sentence = s
    
    output_words, decoder_attention = evaluate(valid_sentence)
    output_sentence = ' '.join(output_words)
    
    print('>', valid_sentence)
#     print('=', pair[1])
    print('<', output_sentence)
    print('')

In [None]:
evaluate_sentence('a')