In [178]:
from __future__ import unicode_literals, print_function, division
from vocab import vocab, END_TOKEN, START_TOKEN, PADDING_TOKEN, UNKNOWN_TOKEN
from helpers import readLines

# Sequence length and batch size used by the data pipeline and by both models.
PHRASE_SIZE = 100
BATCH_SIZE = 10

# One vocabulary per file under data/counts/.
type_vocab = vocab('data/counts/types.txt')
value_vocab = vocab('data/counts/values.txt')
token_vocab = vocab('data/counts/tokens.txt')

In [167]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import json

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution while debugging, assign device = "cpu" instead.)
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# ENCODER

In [188]:
class EncoderRNN(nn.Module):
  """Single-layer GRU encoder over (type, value, position) id triples.

  Each time step embeds a type id, a value id and a position id, concatenates
  the three embeddings and feeds the result to a GRU.

  Args:
    type_vocab: sized collection; len() gives the number of type ids.
    value_vocab: sized collection; len() gives the number of value ids.
    hidden_size: GRU hidden state size.
    embedding_size: width of both the type and the value embedding.
    position_embedding_size: width of the position embedding (default 10,
      the value that was previously hard-coded).
  """

  def __init__(self, type_vocab, value_vocab, hidden_size, embedding_size,
               position_embedding_size=10):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.typeEmbedding = nn.Embedding(len(type_vocab), embedding_size, device=device)
    self.valueEmbedding = nn.Embedding(len(value_vocab), embedding_size, device=device)
    # Positions range over 0..PHRASE_SIZE-1.
    self.positionEmbedding = nn.Embedding(PHRASE_SIZE, position_embedding_size, device=device)

    # Create the GRU on the same device as the embeddings so the module is
    # usable even when the caller does not follow up with .to(device).
    gru_input_size = embedding_size * 2 + position_embedding_size
    self.gru = nn.GRU(gru_input_size, self.hidden_size, device=device)

  def forward(self, inputs, hidden):
    """Encode one time step.

    Args:
      inputs: indexable with inputs[0] = type ids, inputs[1] = value ids,
        inputs[2] = position ids; in this notebook it is a (3, batch) slice.
      hidden: previous GRU hidden state, (1, batch, hidden_size).

    Returns:
      (output, hidden) as returned by the GRU for a length-1 sequence.
    """
    type_emb = self.typeEmbedding(inputs[0])
    value_emb = self.valueEmbedding(inputs[1])
    pos_emb = self.positionEmbedding(inputs[2])

    # Concatenate along the feature axis, then add the leading sequence axis.
    # unsqueeze(0) takes the batch size from the data instead of the global
    # BATCH_SIZE, so a differently sized batch cannot be silently reshaped.
    step_features = torch.cat((type_emb, pos_emb, value_emb), dim=1).unsqueeze(0)

    output, hidden = self.gru(step_features, hidden)
    return output, hidden

  def initHidden(self, batch_size=None):
    """Return a zero hidden state of shape (1, batch_size, hidden_size).

    batch_size defaults to the notebook-level BATCH_SIZE.
    """
    if batch_size is None:
      batch_size = BATCH_SIZE
    return torch.zeros(1, batch_size, self.hidden_size, device=device)

# DECODER

In [189]:
class DecoderRNN(nn.Module):
  """Single-layer GRU decoder producing log-probabilities over output ids.

  Args:
    hidden_size: width of the token embedding and of the GRU hidden state.
    output_size: number of output ids (size of the embedding table and of
      the final projection).
  """

  def __init__(self, hidden_size, output_size):
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size, hidden_size, device=device)

    # Build every layer on the same device as the embedding so the module
    # works even when the caller does not follow up with .to(device).
    self.gru = nn.GRU(hidden_size, hidden_size, device=device)
    self.out = nn.Linear(hidden_size, output_size, device=device)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    """Decode one time step.

    Args:
      input: token ids; in this notebook shaped (1, batch).
      hidden: previous GRU hidden state, (1, batch, hidden_size).

    Returns:
      (log_probs, hidden): log-softmax scores computed from the first
      sequence position of the GRU output, and the new hidden state.
    """
    embedded = F.relu(self.embedding(input))

    gru_out, hidden = self.gru(embedded, hidden)
    # gru_out[0] drops the length-1 sequence axis before the projection.
    log_probs = self.softmax(self.out(gru_out[0]))
    return log_probs, hidden

  def initHidden(self, batch_size=None):
    """Return a zero hidden state of shape (1, batch_size, hidden_size).

    batch_size defaults to the notebook-level BATCH_SIZE.
    """
    if batch_size is None:
      batch_size = BATCH_SIZE
    return torch.zeros(1, batch_size, self.hidden_size, device=device)

# TIMING

In [170]:
import time
import math


def asMinutes(s):
  """Format a duration given in seconds as '<minutes>m <seconds>s'."""
  whole_minutes = math.floor(s / 60)
  leftover_seconds = s - whole_minutes * 60
  return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
  """Report elapsed time and the estimated time remaining.

  Args:
    since: start timestamp, as returned by time.time().
    percent: fraction of the work completed so far (the training loop
      passes iteration / total iterations).

  Returns:
    A string '<elapsed> (- <remaining>)'. When percent is not positive no
    estimate can be extrapolated, so the remaining part is shown as '?'
    instead of raising ZeroDivisionError.
  """
  elapsed = time.time() - since
  if percent <= 0:
    return '%s (- %s)' % (asMinutes(elapsed), '?')

  # Linear extrapolation: total time = elapsed / fraction done.
  estimated_total = elapsed / percent
  remaining = estimated_total - elapsed
  return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# INPUT

In [185]:
# TODO: add infobox

articles_tables = {}
articles_text = {}

# Position ids 0..PHRASE_SIZE-1. The same list object is shared by every
# entry of articles_tables, so it must not be mutated.
POSITIONS = list(range(PHRASE_SIZE))

# Padding ids are loop-invariant; look them up once.
type_pad_id = type_vocab.getID(PADDING_TOKEN)
value_pad_id = value_vocab.getID(PADDING_TOKEN)

print('Loading tables...')

for line in readLines('data/clean/wikidata.json', -1):
  # Each line is a JSON object: {'article': TEXT, 'data': {'type': [values, ...]}}
  record = json.loads(line)
  article_key = record['article']

  # Flatten the table into parallel lists of type ids and value ids,
  # keeping at most PHRASE_SIZE (type, value) entries.
  # `type_name` (not `type`) keeps the builtin type() usable in later cells.
  type_ids, value_ids = [], []
  for type_name, values in record['data'].items():
    if len(type_ids) >= PHRASE_SIZE:
      break  # phrase is full: no need to visit the remaining types
    for value in values:
      if len(type_ids) >= PHRASE_SIZE:
        break
      type_ids.append(type_vocab.getID(type_name))
      value_ids.append(value_vocab.getID(value))

  # Right-pad both lists to exactly PHRASE_SIZE entries.
  missing = PHRASE_SIZE - len(type_ids)
  type_ids.extend([type_pad_id] * missing)
  value_ids.extend([value_pad_id] * missing)

  articles_tables[article_key] = (type_ids, value_ids, POSITIONS)

print('Loading articles...')

# Target sequences hold token ids, so they must be padded with the padding id
# of token_vocab (the original used type_vocab's padding id here).
token_pad_id = token_vocab.getID(PADDING_TOKEN)

for line in readLines('data/clean/article.json', -1):
  record = json.loads(line)

  # Keep at most PHRASE_SIZE tokens, then right-pad to exactly PHRASE_SIZE.
  token_ids = [token_vocab.getID(tok) for tok in record['data'][:PHRASE_SIZE]]
  token_ids.extend([token_pad_id] * (PHRASE_SIZE - len(token_ids)))

  articles_text[record['article']] = token_ids

print("tables: ", len(articles_tables))
print("articles: ", len(articles_text))

Loading tables...
Loading articles...
tables:  883980
articles:  394175


In [186]:
batch = []
pairs = []

for title, table in articles_tables.items():
  # Only articles that have both a table and a text are used.
  if title in articles_text:
    batch.append((table, articles_text[title]))

  if len(batch) != BATCH_SIZE:
    continue

  # Regroup the full batch field by field: 0 = types, 1 = values, 2 = positions.
  input_tensor = torch.tensor(
    [[list(entry[0][field]) for entry in batch] for field in range(3)],
    device=device,
  )
  target_tensor = torch.tensor([list(entry[1]) for entry in batch], device=device)

  pairs.append((input_tensor, target_tensor))
  batch = []

# Any entries still left in `batch` form an incomplete batch and are not used.
print("pairs: ", len(pairs))
print(f"batch size: {BATCH_SIZE}, phrase size: {PHRASE_SIZE}")
print("input shape: ", pairs[0][0].shape)
print("output shape: ", pairs[0][1].shape)


pairs:  39261
batch size: 10, phrase size: 100
input shape:  torch.Size([3, 10, 100])
output shape:  torch.Size([10, 100])


# TRAINING

In [173]:
teacher_forcing_ratio = 0.5

# NOTE: the per-step decoder inputs are shaped (1, BATCH_SIZE)

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
  """Run one optimisation step on a single batch.

  Args:
    input_tensor: encoder ids; indexed as input_tensor[:, :, i] per time step
      (in this notebook shaped (3, batch, phrase)).
    target_tensor: target token ids, (batch, target_length).
    encoder, decoder: the two models; the decoder starts from the encoder's
      final hidden state.
    encoder_optimizer, decoder_optimizer: optimizers stepped once each.
    criterion: loss applied to each decoder step's output and target column.

  Returns:
    The summed step losses divided by target_length, as a Python float.
  """
  batch_size = target_tensor.size(0)
  input_length = input_tensor.size(2)
  target_length = target_tensor.size(1)

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  # Encode the whole input sequence; only the final hidden state is kept.
  # NOTE: attention would additionally need the per-step encoder outputs.
  encoder_hidden = encoder.initHidden()
  for i in range(input_length):
    _, encoder_hidden = encoder(input_tensor[:, :, i], encoder_hidden)

  # The decoder embeds token ids, so the start symbol must be looked up in
  # token_vocab (the original used type_vocab here).
  start_id = token_vocab.getID(START_TOKEN)
  decoder_input = torch.tensor([[start_id] * batch_size], device=device)
  decoder_hidden = encoder_hidden

  # Decided once per batch: feed the gold token (teacher forcing) or the
  # decoder's own best guess as the next input.
  use_teacher_forcing = random.random() < teacher_forcing_ratio

  # NOTE(review): step losses are summed (not averaged) before backward(), so
  # the gradient scale grows with target_length; padding positions are scored
  # like any other token unless `criterion` is configured to ignore them.
  loss = 0
  for di in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    step_target = target_tensor[:, di]
    loss += criterion(decoder_output, step_target)

    if use_teacher_forcing:
      decoder_input = step_target.view(1, batch_size)
    else:
      _, topi = decoder_output.topk(1)
      # detach: the prediction is an input, not part of the gradient history
      decoder_input = topi.detach().view(1, batch_size)
      # FIXME: no early stop on END_TOKEN; every sequence is decoded to
      # target_length.

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length

In [174]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train encoder and decoder for n_iters randomly drawn batches.

    Batches are sampled with replacement from the notebook-level `pairs`
    list. Both models are optimised with plain SGD and NLLLoss.

    Args:
        encoder, decoder: models to train (updated in place).
        n_iters: number of training steps.
        print_every: print a progress line with the mean loss every this
            many steps.
        plot_every: record the mean loss every this many steps.
        learning_rate: SGD learning rate for both optimizers.

    Returns:
        The list of mean losses recorded every `plot_every` steps.
        (Previously this list was built and then discarded.)
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [random.choice(pairs) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    # `step` rather than `iter`, which would shadow the builtin.
    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    return plot_losses


In [190]:

# Model sizes: 256 hidden units, 128-dimensional type/value embeddings.
encoder = EncoderRNN(type_vocab, value_vocab, hidden_size=256, embedding_size=128).to(device)
decoder = DecoderRNN(hidden_size=256, output_size=len(token_vocab)).to(device)

# Short run of 200 steps; the trailing semicolon keeps the notebook from echoing a return value.
trainIters(encoder, decoder, n_iters=200, print_every=10, plot_every=1);

0m 3s (- 1m 11s) (10 5%) 9.6993
0m 7s (- 1m 7s) (20 10%) 16.3831
0m 11s (- 1m 3s) (30 15%) 20.3025
0m 14s (- 0m 59s) (40 20%) 18.3822
0m 18s (- 0m 55s) (50 25%) 16.6830
0m 22s (- 0m 51s) (60 30%) 19.9878
0m 25s (- 0m 47s) (70 35%) 22.6252
0m 29s (- 0m 44s) (80 40%) 21.8464
0m 33s (- 0m 40s) (90 45%) 21.3387
0m 36s (- 0m 36s) (100 50%) 20.0559
0m 40s (- 0m 33s) (110 55%) 18.6143
0m 44s (- 0m 29s) (120 60%) 18.4161
0m 47s (- 0m 25s) (130 65%) 18.0640
0m 51s (- 0m 22s) (140 70%) 19.8498
0m 55s (- 0m 18s) (150 75%) 16.6035
0m 58s (- 0m 14s) (160 80%) 16.0977
1m 2s (- 0m 10s) (170 85%) 21.4741
1m 5s (- 0m 7s) (180 90%) 18.3839
1m 9s (- 0m 3s) (190 95%) 20.0949
1m 13s (- 0m 0s) (200 100%) 23.2726
