TODO
- Import
- Read data (vocab, sentences)
- Build model
- Train model

In [1]:
use_cuda = True
batch_size = 1
learning_rate = 0.0001

# Import library

In [2]:
from __future__ import print_function

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

import time
import math
import random
import unicodedata
import string
import re

import scripts.text
import utils

# Load data

In [3]:
data_path = './processed-data/id.1000/'
en_vocab_path = data_path + 'train.10k.en.vocab'
de_vocab_path = data_path + 'train.10k.de.vocab'

In [4]:
en_words, en_vocab, _ = scripts.text.load_vocab(en_vocab_path)
de_words, de_vocab, _ = scripts.text.load_vocab(de_vocab_path)

# Loading vocab file ./processed-data/id.1000/train.10k.en.vocab ...
  num words = 1000
# Loading vocab file ./processed-data/id.1000/train.10k.de.vocab ...
  num words = 1000


In [5]:
# Read train data
en_train_sentences = []
with open(data_path + 'train.10k.en', 'r') as f:
    for line in f:
        en_train_sentences.append(map(lambda x: int(x), line.split()))
        
de_train_sentences = []
with open(data_path + 'train.10k.de', 'r') as f:
    for line in f:
        de_train_sentences.append(map(lambda x: int(x), line.split()))

In [6]:
# Read validation data
en_valid_sentences = []
with open(data_path + 'valid.100.en', 'r') as f:
    for line in f:
        en_valid_sentences.append(line.split())
        
de_valid_sentences = []
with open(data_path + 'valid.100.de', 'r') as f:
    for line in f:
        de_valid_sentences.append(line.split())

# Build model

## Using RNNs

In [7]:
class EncoderRNN(nn.Module):
    """
        Model's encoder using RNN.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_size = embedding_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers)

    def forward(self, input_sentence, hidden):
        sentence_len = len(input_sentence)
        
        embedded = self.embedding(input_sentence)
        embedded = embedded.view(sentence_len, batch_size, -1)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))
        if use_cuda:
            hidden = hidden.cuda()
        return hidden

In [8]:
class DecoderRNN(nn.Module):
    """
        Model's decoder using RNN.
    """

    def __init__(self, embedding_size, hidden_size, output_size, num_layers=1):
        super(DecoderRNN, self).__init__()

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_vector, hidden):
        output = self.embedding(input_vector).view(1, batch_size, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.log_softmax(self.out(output[0]))
        return output, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))
        if use_cuda:
            hidden = hidden.cuda()
        return hidden

# Training model

## Using RNN

### Checking the model

In [None]:
encoder_test = EncoderRNN(10, 10, 20, 2)
decoder_test = DecoderRNN(10, 20, 10, 2)
print(encoder_test)
print(decoder_test)

In [None]:
encoder_hidden = encoder_test.init_hidden()
word_input = Variable(torch.LongTensor([1, 2, 3]))
if use_cuda:
    encoder_test.cuda()
    word_input = word_input.cuda()
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

word_inputs = Variable(torch.LongTensor([1, 2, 3, 4]))
decoder_hidden = encoder_hidden

if use_cuda:
    decoder_test.cuda()
    word_inputs = word_inputs.cuda()

for i in range(4):
    decoder_output, decoder_hidden = decoder_test(word_inputs[i], decoder_hidden)
    print(decoder_output)
    print(decoder_hidden)

### Define training

In [9]:
teacher_forcing_ratio = 0.0
clip = 5.0
MAX_LENGTH = 40

In [10]:
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    # Zero gradient
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Prepare input for decoder and output variables
    decoder_input = Variable(torch.LongTensor([[en_vocab['<s>']]]))
    decoder_hidden = encoder_hidden  # Use last hidden from the encoder

    if use_cuda:
        decoder_input = decoder_input.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        # Teacher forcing: use the ground-truth target as the next input
        for d_i in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output[0], target_variable[d_i])
            decoder_input = target_variable[d_i]
    else:
        # Without teacher forcing use its own predictions as the next input
        for d_i in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
#             print(decoder_output)
            loss += criterion(decoder_output, target_variable[d_i])
            # Pick most likely word index (highest value) from output (greedy search)
            top_value, top_index = decoder_output.data.topk(1)
            n_i = top_index[0][0]
#             print(n_i)
#             print(torch.LongTensor([n_i]))
            decoder_input = Variable(torch.LongTensor([[n_i]])) # Chosen word is next input
            
            if use_cuda:
                decoder_input = decoder_input.cuda()

            # Stop at end of sentence (not necessary when using known targers)
            if n_i == en_vocab['</s>']:
                break
    # Backpropagation
    loss.backward()
    nn.utils.clip_grad_norm(encoder.parameters(), clip)
    nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / target_length

### Run training

In [11]:
embedding_size = 200
hidden_size = 250
num_layers = 1
dropout_p = 0.05

# Initialize models
encoder = EncoderRNN(len(en_vocab), embedding_size, hidden_size, num_layers)
decoder = DecoderRNN(embedding_size, hidden_size, len(de_vocab), num_layers)

# Move models to GPU
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
# Initialize parameters and criterion
learning_rate = 0.0001
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [None]:
# Configuring training
num_epochs = 1
plot_every = 100
print_every = 100

# Keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0 # Reset every print every
plot_loss_total = 0 # Reset every plot every

In [None]:
start = time.time()
for epoch in range(1, num_epochs + 1):
    indexes = np.arange(0, len(en_train_sentences))
    np.random.shuffle(indexes)
    step = 0
    num_steps = math.ceil(len(en_train_sentences) / batch_size)
    for index in indexes:
#         print(index)
#         print(np.array([en_train_sentences[index]]))
#         print(torch.from_numpy(np.array(en_train_sentences[index])))
#         break
        input_variable = Variable(torch.LongTensor(en_train_sentences[index]).view(-1, 1))
        target_variable = Variable(torch.LongTensor(de_train_sentences[index]).view(-1, 1))
        if use_cuda:
            input_variable = input_variable.cuda()
            target_variable = target_variable.cuda()
        loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer,
                     decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if step == 0:
            step += 1
            continue
        
        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (utils.time_since(start, step / num_steps), step,
                                                   step / num_steps * 100, print_loss_avg)
            print(print_summary)
        
        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total
        step += 1

0m 12s (- 20m 28s) (100 1%) 5.5692
0m 23s (- 18m 57s) (200 2%) 4.4408
0m 33s (- 18m 7s) (300 3%) 4.2847
0m 45s (- 18m 1s) (400 4%) 4.2927
0m 56s (- 17m 49s) (500 5%) 4.3076
1m 8s (- 17m 51s) (600 6%) 4.3222
1m 19s (- 17m 41s) (700 7%) 4.1791
1m 30s (- 17m 19s) (800 8%) 4.0260
1m 42s (- 17m 17s) (900 9%) 4.2129
1m 54s (- 17m 14s) (1000 10%) 4.0652
2m 6s (- 17m 0s) (1100 11%) 4.2021
2m 16s (- 16m 40s) (1200 12%) 4.2243
2m 28s (- 16m 31s) (1300 13%) 4.2773
2m 39s (- 16m 19s) (1400 14%) 4.2225
2m 50s (- 16m 7s) (1500 15%) 4.2131
3m 2s (- 15m 55s) (1600 16%) 4.1901
3m 13s (- 15m 42s) (1700 17%) 4.1597
3m 25s (- 15m 35s) (1800 18%) 4.1338
3m 36s (- 15m 21s) (1900 19%) 4.1140
3m 47s (- 15m 10s) (2000 20%) 4.1753
3m 59s (- 14m 59s) (2100 21%) 4.1355
4m 9s (- 14m 43s) (2200 22%) 4.0144
4m 18s (- 14m 25s) (2300 23%) 4.0329
4m 30s (- 14m 16s) (2400 24%) 4.0423
4m 41s (- 14m 3s) (2500 25%) 4.1340
4m 52s (- 13m 52s) (2600 26%) 4.1922
5m 2s (- 13m 38s) (2700 27%) 4.2754
5m 16s (- 13m 34s) (2800 28%)