In [233]:
from __future__ import unicode_literals, print_function, division
from vocab import vocab, END_TOKEN, START_TOKEN, PADDING_TOKEN, UNKNOWN_TOKEN
from helpers import readLines

type_vocab = vocab('data/counts/types.txt')
value_vocab = vocab('data/counts/values.txt')
token_vocab = vocab('data/counts/tokens.txt')

PHRASE_SIZE = 30
BATCH_SIZE = 5

In [234]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import json

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

# ENCODER

In [235]:
class EncoderRNN(nn.Module):
  def __init__(self, type_vocab, value_vocab, hidden_size, embedding_size):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.typeEmbedding = nn.Embedding(len(type_vocab), embedding_size, device=device)
    self.valueEmbedding = nn.Embedding(len(value_vocab), embedding_size, device=device)
    self.positionEmbedding = nn.Embedding(PHRASE_SIZE, 10, device=device)
    
    self.gru = nn.GRU(embedding_size * 2 + 10, self.hidden_size)

  def forward(self, inputs, hidden):
    E_type_out = self.typeEmbedding(inputs[0])
    E_value_out = self.valueEmbedding(inputs[1])
    E_pos_out = self.positionEmbedding(inputs[2])

    output = torch.cat((E_type_out, E_pos_out, E_value_out), dim=1).view(1, BATCH_SIZE, -1)

    output, hidden = self.gru(output, hidden)
    return output, hidden

  def initHidden(self):
    return torch.zeros(1, BATCH_SIZE, self.hidden_size, device=device)

# DECODER

In [236]:
class DecoderRNN(nn.Module):
  def __init__(self, hidden_size, output_size):
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    output = self.embedding(input)
    output = F.relu(output)
    output, hidden = self.gru(output, hidden)
    output = self.softmax(self.out(output[0]))
    return output, hidden

  def initHidden(self):
    return torch.zeros(1, 1, self.hidden_size, device=device)

# TRAINING

In [237]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
  encoder_hidden = encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length = input_tensor.size(0)
  target_length = target_tensor.size(0)

  # attention
  # encoder_outputs = torch.zeros(PHRASE_SIZE, encoder.hidden_size, device=device)

  loss = 0

  for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
    # attention
    # encoder_outputs[ei] = encoder_output[0, 0]

  decoder_input = torch.tensor([[type_vocab.getID(START_TOKEN)] * BATCH_SIZE], device=device)

  decoder_hidden = encoder_hidden

  use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  if use_teacher_forcing:
    # Teacher forcing: Feed the target as the next input
    for di in range(target_length):
      # attention
      # decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      loss += criterion(decoder_output, target_tensor[di])
      decoder_input = target_tensor[di]  # Teacher forcing

  else:
    # Without teacher forcing: use its own predictions as the next input
    for di in range(target_length):
      # attention
      # decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      topv, topi = decoder_output.topk(1)
      decoder_input = topi.squeeze().detach()  # detach from history as input

      loss += criterion(decoder_output, target_tensor[di])
      if decoder_input.item() == END_TOKEN:
        break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length

# TIMING

In [238]:
import time
import math


def asMinutes(s):
  m = math.floor(s / 60)
  s -= m * 60
  return '%dm %ds' % (m, s)


def timeSince(since, percent):
  now = time.time()
  s = now - since
  es = s / (percent)
  rs = es - s
  return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

# INPUT

In [254]:
# TODO: add infobox

articles_tables = {}
articles_text = {}
pairs = {}

POSITIONS = [i for i in range(PHRASE_SIZE)]

print('Loading tables...')

for line in readLines('data/clean/wikidata.json', -1):
  json_line = json.loads(line)
  KEY = json_line['article']
  # line = {'article': TEXT, 'data': {'type': [values, ...]}}
  
  VALUE = ( [], [] )
  i = 0
  for type in json_line['data']:
    for value in json_line['data'][type]:
      if i >= PHRASE_SIZE:
        break
      VALUE[0].append(type_vocab.getID(type))
      VALUE[1].append(value_vocab.getID(value))
      i += 1

  while(len(VALUE[0]) < PHRASE_SIZE):
    VALUE[0].append(type_vocab.getID(PADDING_TOKEN))
    VALUE[1].append(value_vocab.getID(PADDING_TOKEN))

  articles_tables[KEY] = (VALUE[0], VALUE[1], POSITIONS)

print('Loading articles...')

for line in readLines('data/clean/article.json', -1):
  json_line = json.loads(line)
  KEY = json_line['article']
  articles_text[KEY] = [token_vocab.getID(x) for x in json_line['data'][0:PHRASE_SIZE]]
  # articles_text[KEY] = articles_text[KEY]

  while(len(articles_text[KEY]) < PHRASE_SIZE):
    articles_text[KEY].append(type_vocab.getID(PADDING_TOKEN))

print("tables: ", len(articles_tables))
print("articles: ", len(articles_text))

Loading tables...
Loading articles...
883980
394175


In [255]:
batch = []
pairs = []

for article in articles_tables:
  if(article in articles_text):
    a = (articles_tables[article], articles_text[article])
    batch.append(a)

  if(len(batch) == BATCH_SIZE):
    tables = [b[0] for b in batch]
    texts = [b[1] for b in batch]

    input_tensor = torch.tensor([
      # types:
      [[t for t in table[0]] for table in tables],
      # valus:
      [[t for t in table[1]] for table in tables],
      # positions:
      [[t for t in table[2]] for table in tables]
    ])

    target_tensor = torch.tensor([[t for t in text] for text in texts])

    pairs.append((input_tensor, target_tensor))
    batch = []

print(len(pairs))

78523


# TRAIN ITERATIONS

In [241]:
# pair = [[[type, value], [type, value]], [text, text, text, text]]
def tensorsFromPair(pair):
  input_tensor = torch.tensor(pair[0][0:PHRASE_SIZE], device=device)
  target_tensor = torch.tensor(pair[1][0:PHRASE_SIZE], device=device)

  print(input_tensor.shape)
  print(target_tensor.shape)

  return (input_tensor, target_tensor)


def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    
    pairs = buildPairs(type_vocab, value_vocab, text_vocab, 10000)

    print(len(pairs))

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

trainIters(encoder, decoder, 1, print_every=1, plot_every=1)

NameError: name 'encoder' is not defined