In [1]:
from __future__ import unicode_literals, print_function, division
from vocab import vocab, END_TOKEN, START_TOKEN, PADDING_TOKEN, UNKNOWN_TOKEN
from helpers import readLines
from load_data import load_data
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import json

# Tokens per phrase. Passed to load_data and used as the number of rows in the
# encoder's position-embedding table.
PHRASE_SIZE = 100
# Phrases per batch. Passed to load_data and used as the default batch dimension
# of the initial hidden states.
BATCH_SIZE = 10

# Seed every RNG the notebook relies on (pair sampling, the teacher-forcing coin
# flip, weight initialisation) so that Restart & Run All reproduces the same run.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is available. To force CPU execution, replace the
# expression below with torch.device("cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ENCODER

In [2]:
class EncoderRNN(nn.Module):
  """GRU encoder that consumes one (type, value, position) index triple per step.

  For every time step the three index vectors are embedded separately, the
  embeddings are concatenated feature-wise, and the result is pushed through a
  single GRU step.
  """

  # Width of the learned position embedding.
  POSITION_EMBEDDING_SIZE = 10

  def __init__(self, type_vocab, value_vocab, hidden_size, embedding_size):
    """Create the embedding tables and the GRU.

    Args:
      type_vocab: sized container; len(type_vocab) is the number of type IDs.
      value_vocab: sized container; len(value_vocab) is the number of value IDs.
      hidden_size: size of the GRU hidden state.
      embedding_size: width of both the type and the value embedding.
    """
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.typeEmbedding = nn.Embedding(len(type_vocab), embedding_size, device=device)
    self.valueEmbedding = nn.Embedding(len(value_vocab), embedding_size, device=device)
    self.positionEmbedding = nn.Embedding(PHRASE_SIZE, self.POSITION_EMBEDDING_SIZE, device=device)

    # Created on the same device as the embeddings so the module is usable even
    # when the caller does not follow up with .to(device).
    self.gru = nn.GRU(
      embedding_size * 2 + self.POSITION_EMBEDDING_SIZE,
      self.hidden_size,
      device=device,
    )

  def forward(self, inputs, hidden):
    """Encode one time step for a whole batch.

    Args:
      inputs: index tensor whose first axis holds, in order, the type IDs, the
        value IDs and the position IDs; each row is a (batch,) vector.
      hidden: previous GRU hidden state, (1, batch, hidden_size).

    Returns:
      (output, hidden) exactly as returned by the GRU for this single step.
    """
    type_emb = self.typeEmbedding(inputs[0])
    value_emb = self.valueEmbedding(inputs[1])
    position_emb = self.positionEmbedding(inputs[2])

    # (batch, features) -> (1, batch, features). Adding the sequence axis with
    # unsqueeze keeps the batch size whatever it is, instead of assuming the
    # global BATCH_SIZE.
    step_input = torch.cat((type_emb, position_emb, value_emb), dim=1).unsqueeze(0)

    output, hidden = self.gru(step_input, hidden)
    return output, hidden

  def initHidden(self, batch_size=None):
    """Return an all-zero initial hidden state of shape (1, batch, hidden_size).

    Args:
      batch_size: batch dimension of the state; defaults to the global BATCH_SIZE.
    """
    if batch_size is None:
      batch_size = BATCH_SIZE
    return torch.zeros(1, batch_size, self.hidden_size, device=device)

# DECODER

In [3]:
class DecoderRNN(nn.Module):
  """GRU decoder that maps one step of token IDs to log-probabilities over the output vocabulary."""

  def __init__(self, hidden_size, output_size):
    """Create the embedding, GRU and output projection.

    Args:
      hidden_size: size of the GRU hidden state and of the token embedding.
      output_size: number of distinct output token IDs.
    """
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size, hidden_size, device=device)

    # Created on the same device as the embedding so the module is usable even
    # when the caller does not follow up with .to(device).
    self.gru = nn.GRU(hidden_size, hidden_size, device=device)
    self.out = nn.Linear(hidden_size, output_size, device=device)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    """Decode one time step for a whole batch.

    Args:
      input: (1, batch) tensor of token IDs for the current step.
      hidden: previous GRU hidden state, (1, batch, hidden_size).

    Returns:
      (log_probs, hidden): log_probs is (batch, output_size) and is suitable
      for nn.NLLLoss; hidden is the updated GRU state.
    """
    embedded = F.relu(self.embedding(input))

    gru_output, hidden = self.gru(embedded, hidden)
    # gru_output[0] drops the length-1 sequence axis -> (batch, hidden_size).
    log_probs = self.softmax(self.out(gru_output[0]))
    return log_probs, hidden

  def initHidden(self, batch_size=None):
    """Return an all-zero initial hidden state of shape (1, batch, hidden_size).

    Args:
      batch_size: batch dimension of the state; defaults to the global BATCH_SIZE.
    """
    if batch_size is None:
      batch_size = BATCH_SIZE
    return torch.zeros(1, batch_size, self.hidden_size, device=device)

# TIMING

In [4]:
import time
import math


def asMinutes(s):
  """Format a duration given in seconds as '<minutes>m <seconds>s'.

  Both parts are rendered with %d, so fractional seconds are truncated.
  """
  whole_minutes = math.floor(s / 60)
  leftover_seconds = s - whole_minutes * 60
  return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
  """Describe elapsed and estimated remaining time as '<elapsed> (- <remaining>)'.

  Args:
    since: start timestamp, as returned by time.time().
    percent: fraction of the work completed so far (e.g. 0.25 for a quarter).

  Returns:
    A string built from asMinutes() for both parts. When percent is not
    positive no estimate can be extrapolated, so the remaining part is '?'
    instead of raising ZeroDivisionError (or reporting a negative duration).
  """
  elapsed = time.time() - since
  if percent <= 0:
    return '%s (- ?)' % asMinutes(elapsed)

  # Linear extrapolation: if `elapsed` covered `percent` of the work, the
  # whole job takes elapsed / percent.
  estimated_total = elapsed / percent
  remaining = estimated_total - elapsed
  return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# INPUT

In [5]:
# Load the three vocabularies and the list of (input, target) tensor pairs used for training.
# NOTE(review): the meaning of the literals 50000 and 10000 is defined by load_data, which is
# not visible here; the logged output suggests 10000 is the number of pairs selected -- confirm
# and consider naming both as constants next to PHRASE_SIZE / BATCH_SIZE.
type_vocab, value_vocab, token_vocab, pairs = load_data(torch, device, 50000, BATCH_SIZE, PHRASE_SIZE, 10000)

selected 10000 pairs out of 39261 available
------------------------
loading data: 1000/10000 (10.0%)
loading data: 2000/10000 (20.0%)
loading data: 3000/10000 (30.0%)
loading data: 4000/10000 (40.0%)
loading data: 5000/10000 (50.0%)
loading data: 6000/10000 (60.0%)
loading data: 7000/10000 (70.0%)
loading data: 8000/10000 (80.0%)
loading data: 9000/10000 (90.0%)
loading data: 10000/10000 (100.0%)
------------------------
pairs:  10000
batch size: 10, phrase size: 100
input shape:  torch.Size([3, 10, 100])
output shape:  torch.Size([10, 100])


# TRAINING

In [6]:
# Probability that a batch is decoded with teacher forcing (the ground-truth
# token is fed back in) rather than with the decoder's own predictions.
teacher_forcing_ratio = 0.5

# NOTE: every decoder input is a (1, batch) tensor of token IDs -- one time step
# for the whole batch.

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
  """Run one optimisation step on a single batch.

  The encoder consumes the input one time step at a time; its final hidden
  state seeds the decoder, which is then unrolled over the target. With
  probability `teacher_forcing_ratio` the decoder is fed the ground-truth
  token at every step, otherwise its own top-1 prediction.

  Args:
    input_tensor: index tensor read as input_tensor[:, :, i] per time step
      (the last axis is time; the loader reports shape (3, batch, phrase)).
    target_tensor: (batch, target_length) tensor of target token IDs.
    encoder, decoder: the modules to train.
    encoder_optimizer, decoder_optimizer: optimizers over the respective modules.
    criterion: loss taking (decoder_output, target_step).

  Returns:
    float: the loss summed over all target steps, divided by target_length.

  Raises:
    ValueError: if target_tensor has no time steps (nothing to back-propagate).
  """
  batch_size = target_tensor.size(0)
  input_length = input_tensor.size(2)  # == PHRASE_SIZE
  target_length = target_tensor.size(1)
  if target_length == 0:
    raise ValueError("target_tensor must contain at least one time step")

  encoder_hidden = encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  # NOTE: attention would additionally need the per-step encoder outputs;
  # only the final hidden state is used for now.
  for i in range(input_length):
    _, encoder_hidden = encoder(input_tensor[:, :, i], encoder_hidden)

  # NOTE(review): the start token is looked up in type_vocab although the
  # decoder predicts over token_vocab -- confirm both vocabularies assign the
  # same ID to START_TOKEN, otherwise this should use token_vocab.
  start_id = type_vocab.getID(START_TOKEN)
  decoder_input = torch.tensor([[start_id] * batch_size], device=device)  # (1, batch)

  decoder_hidden = encoder_hidden

  use_teacher_forcing = random.random() < teacher_forcing_ratio

  loss = 0
  for di in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    step_target = target_tensor[:, di]
    loss += criterion(decoder_output, step_target)

    if use_teacher_forcing:
      # Teacher forcing: feed the ground-truth token as the next input.
      decoder_input = step_target.unsqueeze(0)
    else:
      # Free running: feed the decoder's own best guess, detached so that no
      # gradient flows through the sampled input.
      _, top_ids = decoder_output.topk(1)
      decoder_input = top_ids.detach().reshape(1, -1)
      # FIXME: decoding does not stop early at END_TOKEN; in a batch that
      # would need a per-sequence "finished" mask rather than a single break.

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length

In [7]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train encoder and decoder for n_iters randomly drawn batches.

    Args:
        encoder, decoder: modules to optimise (plain SGD, one optimizer each).
        n_iters: number of training steps; each step draws one entry of the
            global `pairs` at random (with replacement).
        print_every: print the running average loss every this many steps;
            0 disables printing.
        plot_every: record the running average loss every this many steps;
            0 disables recording.
        learning_rate: SGD learning rate for both optimizers.

    Returns:
        list of float: the average loss of each completed plot_every window,
        in order, ready to be plotted.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [random.choice(pairs) for _ in range(n_iters)]
    # NOTE(review): padded target positions are not excluded from the loss;
    # if targets are padded with PADDING_TOKEN, consider NLLLoss(ignore_index=...) -- confirm.
    criterion = nn.NLLLoss()

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if print_every and step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if plot_every and step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    return plot_losses


In [8]:

# The decoder starts from the encoder's final hidden state, so both modules
# must be built with the same hidden size.
HIDDEN_SIZE = 256
EMBEDDING_SIZE = 128
N_ITERS = 200

encoder = EncoderRNN(type_vocab, value_vocab, HIDDEN_SIZE, EMBEDDING_SIZE).to(device)
decoder = DecoderRNN(HIDDEN_SIZE, len(token_vocab)).to(device)

trainIters(encoder, decoder, N_ITERS, print_every=10, plot_every=1)

0m 5s (- 1m 53s) (10 5%) 26.0331


KeyboardInterrupt: 