TODO
- Import
- Read data (vocab, sentences)
- Build model
- Train model

In [1]:
# Global run configuration; these names are read directly by the model classes
# and the training code further down.
learning_rate = 1e-3  # step size handed to the optimizers
batch_size = 1        # batch dimension used when reshaping inputs and hidden states
use_cuda = True       # when True, models and tensors are moved to the GPU via .cuda()

# Import libraries

In [2]:
from __future__ import print_function

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

import time
import math
import random
import unicodedata
import string
import re

import scripts.text
import utils

# Load data

In [3]:
# Directory holding the preprocessed corpus, and the two vocabulary files inside it.
data_path = './processed-data/id.1000/'
en_vocab_path = '{0}train.10k.en.vocab'.format(data_path)
de_vocab_path = '{0}train.10k.de.vocab'.format(data_path)

In [4]:
# Load both vocabularies with the project helper; its third return value is discarded.
# Later cells index en_vocab / de_vocab by token string (e.g. de_vocab['<s>']) and use
# their len() as the model vocabulary sizes. de_words is passed to scripts.text.to_text,
# presumably to map predicted ids back to words -- confirm against scripts/text.py.
en_words, en_vocab, _ = scripts.text.load_vocab(en_vocab_path)
de_words, de_vocab, _ = scripts.text.load_vocab(de_vocab_path)

# Loading vocab file ./processed-data/id.1000/train.10k.en.vocab ...
  num words = 1000
# Loading vocab file ./processed-data/id.1000/train.10k.de.vocab ...
  num words = 1000


In [5]:
# Read train data: one sentence per line, each line a whitespace-separated list of
# integer token ids. Every sentence is materialized as a real list (not a lazy
# `map` object) so it can later be handed to torch.LongTensor on Python 3 as well
# as Python 2.
with open(data_path + 'train.10k.en', 'r') as f:
    en_train_sentences = [[int(token) for token in line.split()] for line in f]

with open(data_path + 'train.10k.de', 'r') as f:
    de_train_sentences = [[int(token) for token in line.split()] for line in f]

In [6]:
# Read validation data: same format as the training files (one sentence per line,
# whitespace-separated integer token ids). Sentences are stored as real lists rather
# than lazy `map` objects so torch.LongTensor accepts them on Python 3 too.
with open(data_path + 'valid.100.en', 'r') as f:
    en_valid_sentences = [[int(token) for token in line.split()] for line in f]

with open(data_path + 'valid.100.de', 'r') as f:
    de_valid_sentences = [[int(token) for token in line.split()] for line in f]

# Build model

## Using RNNs

In [7]:
class EncoderRNN(nn.Module):
    """
        GRU-based encoder: embeds a sequence of token ids and feeds the
        embeddings through a (possibly multi-layer) GRU.

        Reads the notebook-level globals `batch_size` and `use_cuda`.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1):
        """
            input_size: number of entries in the embedding table.
            embedding_size: width of an embedding vector (the GRU input size).
            hidden_size: width of the GRU hidden state.
            num_layers: number of stacked GRU layers.
        """
        super(EncoderRNN, self).__init__()

        # Remember the hyper-parameters; init_hidden() needs two of them.
        self.input_size, self.embedding_size = input_size, embedding_size
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers)

    def forward(self, input_sentence, hidden):
        """Encode one sentence and return (GRU outputs, final hidden state)."""
        steps = len(input_sentence)
        # Lay the embeddings out as (steps, batch_size, features) for the GRU.
        vectors = self.embedding(input_sentence).view(steps, batch_size, -1)
        return self.rnn(vectors, hidden)

    def init_hidden(self):
        """Return a zero hidden state of shape (num_layers, batch_size, hidden_size)."""
        zeros = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        hidden = Variable(zeros)
        return hidden.cuda() if use_cuda else hidden

In [8]:
class DecoderRNN(nn.Module):
    """
        Model's decoder using RNN.

        Decodes one token per call: the input id is embedded, passed through a
        ReLU and a (possibly multi-layer) GRU, then projected to log-probabilities
        over `output_size` classes.

        Reads the notebook-level globals `batch_size` and `use_cuda`.
    """

    def __init__(self, embedding_size, hidden_size, output_size, num_layers=1):
        """
            embedding_size: width of an embedding vector (the GRU input size).
            hidden_size: width of the GRU hidden state.
            output_size: number of output classes; also the embedding table size.
            num_layers: number of stacked GRU layers.
        """
        super(DecoderRNN, self).__init__()

        # Stored so init_hidden() can size its tensor; without these attributes
        # init_hidden() raised AttributeError.
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_vector, hidden):
        """
            Run a single decoding step.

            input_vector: id(s) of the current input token; reshaped to one
                time step of `batch_size` items.
            hidden: GRU hidden state from the previous step (or from the encoder).

            Returns (log-probabilities for this step, updated hidden state).
        """
        output = self.embedding(input_vector).view(1, batch_size, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        # output[0] drops the length-1 time dimension before the projection.
        output = self.log_softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        """Return a zero hidden state of shape (num_layers, batch_size, hidden_size)."""
        hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))
        if use_cuda:
            hidden = hidden.cuda()
        return hidden

# Training model

## Using RNNs

### Checking the model

In [None]:
# Build a tiny encoder/decoder pair (10-entry vocabularies, 2 GRU layers) and print
# their module structure as a sanity check.
encoder_test = EncoderRNN(input_size=10, embedding_size=10, hidden_size=20, num_layers=2)
decoder_test = DecoderRNN(embedding_size=10, hidden_size=20, output_size=10, num_layers=2)
print(encoder_test)
print(decoder_test)

In [None]:
# Smoke test: push a 3-token sentence through the encoder, then feed 4 tokens one at
# a time through the decoder, starting from the encoder's final hidden state.
encoder_hidden = encoder_test.init_hidden()
word_input = Variable(torch.LongTensor([1, 2, 3]))
word_inputs = Variable(torch.LongTensor([1, 2, 3, 4]))

if use_cuda:
    # Models and inputs must live on the same device as the hidden state.
    encoder_test.cuda()
    decoder_test.cuda()
    word_input, word_inputs = word_input.cuda(), word_inputs.cuda()

encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)
decoder_hidden = encoder_hidden

for i in range(4):
    decoder_output, decoder_hidden = decoder_test(word_inputs[i], decoder_hidden)
    print(decoder_output)
    print(decoder_hidden)

### Define training

In [9]:
# Training hyper-parameters.
MAX_LENGTH = 50              # default for the max_length argument of train() / evaluate()
clip = 5.0                   # gradient-norm threshold for the (commented-out) clipping in train()
teacher_forcing_ratio = 0.5  # probability that train() feeds ground-truth tokens to the decoder

In [10]:
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """
        Run one optimization step on a single (source, target) sentence pair.

        input_variable / target_variable: LongTensor Variables of token ids,
            indexed by position along dimension 0.
        encoder / decoder: the seq2seq modules; decoding starts from the
            encoder's final hidden state and the target '<s>' token.
        encoder_optimizer / decoder_optimizer: each is zeroed, then stepped once.
        criterion: loss applied at every step to (decoder output, target id).
        max_length: accepted for interface compatibility; not used here.

        With probability `teacher_forcing_ratio` the ground-truth token is fed as
        the next decoder input; otherwise the decoder's own greedy prediction is
        fed and decoding stops early once it predicts the target '</s>'.

        Returns the summed per-step loss divided by the target length (a float).
        An empty target returns 0.0 without touching the model.
    """
    target_length = target_variable.size()[0]
    if target_length == 0:
        # Nothing to predict: `loss` would stay the int 0 and .backward() would fail.
        return 0.0

    # Zero gradient
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # Run words through encoder; only its final hidden state is needed.
    encoder_hidden = encoder.init_hidden()
    _, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Prepare input for decoder and output variables
    decoder_input = Variable(torch.LongTensor([[de_vocab['<s>']]]))
    if use_cuda:
        decoder_input = decoder_input.cuda()
    decoder_hidden = encoder_hidden  # Use last hidden from the encoder

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        # Teacher forcing: use the ground-truth target as the next input
        for d_i in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_variable[d_i])
            decoder_input = target_variable[d_i]
    else:
        # Without teacher forcing use its own predictions as the next input.
        # Predictions are target-vocabulary ids, so the end marker must be looked
        # up in de_vocab (the source vocabulary may assign '</s>' a different id).
        eos_id = de_vocab['</s>']
        for d_i in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_variable[d_i])

            # Pick most likely word index (highest value) from output (greedy search).
            # int(...) yields a plain Python int whether topk returns Python
            # numbers (old PyTorch) or 0-dim tensors (newer PyTorch).
            _, top_index = decoder_output.data.topk(1)
            n_i = int(top_index[0][0])

            # Stop at end of sentence (not necessary when using known targets)
            if n_i == eos_id:
                break

            decoder_input = Variable(torch.LongTensor([[n_i]]))  # Chosen word is next input
            if use_cuda:
                decoder_input = decoder_input.cuda()

    # Backpropagation. Gradient clipping (nn.utils.clip_grad_norm with `clip`) is
    # left disabled, as it was in the original code.
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    # loss.data[0] fails on 0-dim tensors in newer PyTorch; .item() is absent in old ones.
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
    # Normalized by the full target length, even when decoding stopped early.
    return loss_value / target_length

### Run training

In [11]:
# Model hyper-parameters (dropout_p is defined but not passed to either model).
embedding_size = hidden_size = 500
num_layers = 4
dropout_p = 0.00

# Initialize models: the encoder is sized by the source vocabulary (en_vocab),
# the decoder by the target vocabulary (de_vocab).
encoder = EncoderRNN(input_size=len(en_vocab), embedding_size=embedding_size,
                     hidden_size=hidden_size, num_layers=num_layers)
decoder = DecoderRNN(embedding_size=embedding_size, hidden_size=hidden_size,
                     output_size=len(de_vocab), num_layers=num_layers)

# Move models to GPU
if use_cuda:
    encoder, decoder = encoder.cuda(), decoder.cuda()

# Loss and one Adam optimizer per module. An SGD (momentum=0.9, lr=0.0001) setup was
# tried previously and is no longer used.
criterion = nn.NLLLoss()
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

In [12]:
# Configuring training
num_epochs = 1
plot_every = 100
print_every = 100

# Keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0 # Reset every print every
plot_loss_total = 0 # Reset every plot every

In [None]:
def _to_variable(ids):
    """
        Turn one sentence (an iterable of int token ids) into a LongTensor
        Variable of shape (len, 1), moved to the GPU when `use_cuda` is set.

        A sentence that is already a Variable is returned untouched, so this
        cell can be re-run without failing on already-converted data.
    """
    if isinstance(ids, Variable):
        return ids
    # list(...) also accepts lazy iterables such as Python 3 `map` objects.
    sentence = Variable(torch.LongTensor(list(ids)).view(-1, 1))
    return sentence.cuda() if use_cuda else sentence


# Convert all sentences to Variable. Slice assignment updates each list in place,
# so the four global names keep referring to the same list objects.
for sentences in (en_train_sentences, de_train_sentences,
                  en_valid_sentences, de_valid_sentences):
    sentences[:] = [_to_variable(ids) for ids in sentences]

In [None]:
def _validation_loss(input_variable, target_variable, encoder, decoder, criterion):
    """
        Teacher-forced loss for one (source, target) pair, averaged over the
        target length. No backward pass or optimizer step is performed, so the
        model is not trained on validation data.
    """
    target_length = target_variable.size()[0]
    if target_length == 0:
        return 0.0

    encoder_hidden = encoder.init_hidden()
    _, decoder_hidden = encoder(input_variable, encoder_hidden)

    decoder_input = Variable(torch.LongTensor([[de_vocab['<s>']]]))
    if use_cuda:
        decoder_input = decoder_input.cuda()

    loss = 0
    for d_i in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_variable[d_i])
        decoder_input = target_variable[d_i]

    # loss.data[0] fails on 0-dim tensors in newer PyTorch; .item() is absent in old ones.
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return loss_value / target_length


start = time.time()
# Start from clean accumulators so re-running this cell (e.g. after an interrupt)
# does not mix in losses from a previous run.
print_loss_total = 0
plot_loss_total = 0
steps_since_print = 0

for epoch in range(0, num_epochs):
    # start epoch
    # Shuffle: visit the training pairs in a fresh random order every epoch.
    indexes = np.arange(0, len(en_train_sentences))
    np.random.shuffle(indexes)
    # float(...) keeps this a true division on Python 2 as well.
    num_steps = int(math.ceil(len(en_train_sentences) / float(batch_size)))

    for step, index in enumerate(indexes, 1):
        input_variable = en_train_sentences[index]
        target_variable = de_train_sentences[index]
        loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
                     decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        steps_since_print += 1

        if step % print_every == 0 or step == num_steps:
            # Average over the steps actually accumulated since the last print;
            # the final window of an epoch may be shorter than print_every.
            print_loss_avg = print_loss_total / steps_since_print
            print_loss_total = 0
            steps_since_print = 0
            print_summary = 'Epoch %s/%s, Time: %s, Step: %d/%d, train_loss: %.4f' % (
                epoch, num_epochs,
                utils.time_since(start, step / float(num_steps)),
                step, num_steps, print_loss_avg)
            print(print_summary)

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0  # reset, otherwise every plot point keeps growing
    # end epoch

    # Evaluate on the validation set. Each source sentence is paired with its own
    # reference, and the loss is computed without updating the model.
    valid_total_loss = 0
    for i in range(len(en_valid_sentences)):
        valid_total_loss += _validation_loss(en_valid_sentences[i], de_valid_sentences[i],
                                             encoder, decoder, criterion)
    # max(..., 1) avoids a ZeroDivisionError when there are no validation sentences.
    print('Validation loss: %.4f' % (valid_total_loss / max(len(en_valid_sentences), 1)))
        

Epoch 0/1, Time: 0m 18s (- 31m 3s), Step: 100/10000, train_loss: 4.8068
Epoch 0/1, Time: 0m 36s (- 29m 43s), Step: 200/10000, train_loss: 4.7536
Epoch 0/1, Time: 0m 51s (- 27m 32s), Step: 300/10000, train_loss: 4.5545
Epoch 0/1, Time: 1m 6s (- 26m 27s), Step: 400/10000, train_loss: 4.5978
Epoch 0/1, Time: 1m 22s (- 26m 14s), Step: 500/10000, train_loss: 4.4612
Epoch 0/1, Time: 1m 38s (- 25m 44s), Step: 600/10000, train_loss: 4.6629
Epoch 0/1, Time: 1m 55s (- 25m 37s), Step: 700/10000, train_loss: 4.6192
Epoch 0/1, Time: 2m 12s (- 25m 25s), Step: 800/10000, train_loss: 4.4722
Epoch 0/1, Time: 2m 31s (- 25m 31s), Step: 900/10000, train_loss: 4.3995
Epoch 0/1, Time: 2m 47s (- 25m 11s), Step: 1000/10000, train_loss: 4.5654


### Evaluating the model

In [None]:
def evaluate(sentence, max_length=MAX_LENGTH):
    """
        Greedily translate one whitespace-tokenized source sentence.

        sentence: source text; it is split on whitespace and mapped to ids with
            scripts.text.to_id and en_vocab.
        max_length: maximum number of decoding steps.

        Uses the notebook-level `encoder` and `decoder`. Returns the list of
        decoded target words as produced by scripts.text.to_text (including the
        word for '</s>' when decoding stopped on it). An input that maps to no
        ids returns an empty list.
    """
    input_ids = list(scripts.text.to_id(sentence.split(), en_vocab))
    if len(input_ids) == 0:
        # The encoder cannot process a zero-length sequence.
        return []

    input_variable = Variable(torch.LongTensor(input_ids))
    if use_cuda:
        input_variable = input_variable.cuda()

    # Only the encoder's final hidden state is needed to start decoding.
    encoder_hidden = encoder.init_hidden()
    _, decoder_hidden = encoder(input_variable, encoder_hidden)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([[de_vocab['<s>']]]))
    if use_cuda:
        decoder_input = decoder_input.cuda()

    decoded_words = []
    eos_id = de_vocab['</s>']

    # Run through decoder
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)

        # Pick most likely word index (highest value) from output (greedy search).
        # int(...) yields a plain Python int whether topk returns Python numbers
        # (old PyTorch) or 0-dim tensors (newer PyTorch).
        _, top_index = decoder_output.data.topk(1)
        n_i = int(top_index[0][0])
        decoded_words += scripts.text.to_text([n_i], de_words)

        # Stop at end of sentence
        if n_i == eos_id:
            break

        decoder_input = Variable(torch.LongTensor([[n_i]]))  # Chosen word is next input
        if use_cuda:
            decoder_input = decoder_input.cuda()

    return decoded_words

In [None]:
def evaluate_sentence(s):
    """Translate `s` with evaluate() and print the source ('>') and the output ('<')."""
    translation = ' '.join(evaluate(s))

    print('>', s)
    print('<', translation)
    print('')

In [None]:
# Translate a sample sentence with the trained model; prints the input and the output.
evaluate_sentence('i am a student and he is a teacher')

In [None]:
# Translate a second, longer sample sentence; prints the input and the output.
evaluate_sentence('luck is no excuse and who has luck is successful')