In [101]:
from __future__ import unicode_literals, print_function, division
from vocab import vocab, END_TOKEN, START_TOKEN, PADDING_TOKEN, UNKNOWN_TOKEN
from helpers import readLines
from load_data import load_data_evaluate, load_data_training, getInputSizeAverage
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import json

ENCODER_INPUT_SIZE = 10 # dimensione dell'input dell'encoder (numero di triple tipo-valore-posizione in input)
DECODER_OUTPUT_SIZE = 35 # dimensione dell'output del decoder (lunghezza della frase in output)
BATCH_SIZE = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

base_path = "data"
# device = "cpu"

device: cuda


In [102]:
# from load_data import getInputSizeAverage

# print(getInputSizeAverage())

In [103]:
import time
import math
import datetime

def asMinutes(s):
  m = math.floor(s / 60)
  s -= m * 60
  return '%dm %ds' % (m, s)


def timeSince(since, percent):
  now = time.time()
  s = now - since
  es = s / (percent)
  rs = es - s
  return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def asMsecs(s):
  sec = math.floor(s)
  msec = s * 1000
  msec -= sec * 1000
  return '%ds %dms' % (sec, msec)


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
  plt.figure()
  fig, ax = plt.subplots()
  # this locator puts ticks at regular intervals
  loc = ticker.MultipleLocator(base=0.01)
  ax.yaxis.set_major_locator(loc)
  # ax.set_facecolor('pink')
  plt.plot(points)
  plt.show()

def getPlot(points):
  plt.figure()
  fig, ax = plt.subplots()
  # this locator puts ticks at regular intervals
  loc = ticker.MultipleLocator(base=0.01)
  ax.yaxis.set_major_locator(loc)
  # ax.set_facecolor('pink')
  plt.plot(points)
  return fig

def calc_avg_loss(prec_loss, curr_loss, alpha=0.95):
  if prec_loss == 0:
    return curr_loss
  return alpha * prec_loss + (1.0 - alpha) * curr_loss

In [104]:
type_vocab, value_vocab, token_vocab, pairs = load_data_training(
  torch=torch,
  device=device,
  vocab_size=5000,
  batch_size=BATCH_SIZE,
  input_size=ENCODER_INPUT_SIZE,
  output_size=DECODER_OUTPUT_SIZE,
  pair_amount=100,
  path=base_path
)

def split_data(pairs, train_size=0.8):
  train_size = int(train_size * len(pairs))
  train_pairs = pairs[:train_size]
  test_pairs = pairs[train_size:]
  return train_pairs, test_pairs

train_pairs, test_pairs = split_data(pairs)

# dropout percentuale
# criterion se serve logsoftmax

# togliere dropout nel test piccolo
# se funziona tutto attention weights padding

selected 100 pairs out of 130873 available
------------------------
loading data: 10/100 (10%)
loading data: 20/100 (20%)
loading data: 30/100 (30%)
loading data: 40/100 (40%)
loading data: 50/100 (50%)
loading data: 60/100 (60%)
loading data: 70/100 (70%)
loading data: 80/100 (80%)
loading data: 90/100 (90%)
loading data: 100/100 (100%)
------------------------
pairs: 100, total articles: 300
batch size: 3, input size: 10, output size: 35
input shape:  torch.Size([3, 3, 10])
output shape:  torch.Size([3, 35])


# ENCODER

In [105]:
class EncoderRNN(nn.Module):
  def __init__(self, type_vocab, value_vocab, hidden_size, embedding_size):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.typeEmbedding = nn.Embedding(len(type_vocab), embedding_size, device=device)
    self.valueEmbedding = nn.Embedding(len(value_vocab), embedding_size, device=device)
    self.positionEmbedding = nn.Embedding(ENCODER_INPUT_SIZE, 10, device=device)
    
    self.gru = nn.GRU(embedding_size * 2 + 10, self.hidden_size, device=device)
    self.dropout1 = nn.Dropout(0.1)
    # self.dropout2 = nn.Dropout(0.5)

  def forward(self, inputs, hidden):
    E_type_out = self.typeEmbedding(inputs[0]) # [BATCH, EMBEDDING]
    E_value_out = self.valueEmbedding(inputs[1]) # [BATCH, EMBEDDING]
    E_pos_out = self.positionEmbedding(inputs[2]) # [BATCH, EMBEDDING]

    output = torch.cat((E_type_out, E_pos_out, E_value_out), dim=1).view(1, BATCH_SIZE, -1) # [1, BATCH, EMBEDDING * 2 + 10]

    output = self.dropout1(output) # [1, BATCH, EMBEDDING * 2 + 10]

    output, hidden = self.gru(output, hidden) # [1, BATCH, EMBEDDING * 2 + 10] (entrambi)

    # output = self.dropout2(output)

    return output, hidden

  def initHidden(self):
    return torch.zeros(1, BATCH_SIZE, self.hidden_size, device=device)

# ATTENTION DECODER

In [106]:
class AttnCalc(nn.Module):
  def __init__(self, hidden_size):
    super(AttnCalc, self).__init__()
    self.decoderAttnLinear = nn.Linear(hidden_size, hidden_size)

    self.attnConv = nn.Conv2d(ENCODER_INPUT_SIZE, ENCODER_INPUT_SIZE, (hidden_size, hidden_size), stride=1, padding="same")
    self.cvgConv = nn.Conv2d(ENCODER_INPUT_SIZE, ENCODER_INPUT_SIZE, (1, hidden_size), stride=1, padding="same")

    self.v = nn.Parameter(torch.FloatTensor(BATCH_SIZE, hidden_size), requires_grad=True)

    self.tanhfeatures = nn.Tanh()

  def forward(self, hidden, encoder_outputs, coverage):
    # hidden = [1, BATCH, HIDDEN]
    # encoder_outputs = [ENCODER_INPUT_SIZE, HIDDEN]
    # coverage = [1, ENCODER_INPUT_SIZE]

    encoder_features = encoder_outputs.view(BATCH_SIZE, ENCODER_INPUT_SIZE, 1, -1) # [BATCH_SIZE, ENCODER_INPUT_SIZE, 1, HIDDEN]
    encoder_features = self.attnConv(encoder_features) #- [BATCH_SIZE, ENCODER_INPUT_SIZE, 1, HIDDEN]

    decoder_features = self.decoderAttnLinear(hidden) # [1, BATCH, HIDDEN]
    decoder_features = decoder_features.view(BATCH_SIZE, 1, 1, -1) #- [BATCH, 1, 1, HIDDEN]
    
    coverage_features = coverage.view(BATCH_SIZE, ENCODER_INPUT_SIZE, 1, -1) # [BATCH_SIZE, ENCODER_INPUT_SIZE, 1, 1]
    coverage_features = self.cvgConv(coverage_features) #- [BATCH_SIZE, ENCODER_INPUT_SIZE, 1, 1]

    # [1, ENCODER_INPUT_SIZE, 1, HIDDEN] + [BATCH, 1, 1, HIDDEN] + [1, ENCODER_INPUT_SIZE, 1, 1]
    attn_features = encoder_features + decoder_features + coverage_features # [BATCH, ENCODER_INPUT_SIZE, 1, HIDDEN]
    attn_features = attn_features.view(BATCH_SIZE, ENCODER_INPUT_SIZE, -1) # [BATCH, ENCODER_INPUT_SIZE, HIDDEN]
    attn_features = self.tanhfeatures(attn_features) #- [BATCH, ENCODER_INPUT_SIZE, HIDDEN]

    temp_v = self.v.unsqueeze(2) #- [BATCH, HIDDEN, 1]
    attn_weights = torch.bmm(attn_features, temp_v) # [BATCH, ENCODER_INPUT_SIZE, 1]
    attn_weights = torch.sum(attn_weights, dim=2) #- [BATCH, ENCODER_INPUT_SIZE]

    coverage += attn_weights # [BATCH, ENCODER_INPUT_SIZE]

    context_vector = attn_weights.view(BATCH_SIZE, ENCODER_INPUT_SIZE, 1) * encoder_outputs # [BATCH, ENCODER_INPUT_SIZE, HIDDEN]

    context_vector = torch.sum(context_vector, dim=1) # [BATCH, HIDDEN]

    return context_vector, attn_weights, coverage

In [107]:
class AttnDecoderRNN(nn.Module):
  def __init__(self, output_vocab_size, hidden_size, embedding_size):
    super(AttnDecoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.output_vocab_size = output_vocab_size
    self.embedding_size = embedding_size
    self.calcAttn = AttnCalc(hidden_size).to(device)

    self.embedding = nn.Embedding(self.output_vocab_size, embedding_size)

    self.preOut = nn.Linear(self.hidden_size * 2, self.hidden_size)
    self.out = nn.Linear(self.hidden_size, self.output_vocab_size)
    self.newIn = nn.Linear(self.hidden_size + embedding_size, self.hidden_size)

    self.dropout = nn.Dropout(0.1)
    self.gru = nn.GRU(self.hidden_size, self.hidden_size)

    self.tanhout = nn.Tanh()

    self.decoder_times = [0, 0, 0, 0, 0]

  def forward(self, encoder_outputs, input, hidden, coverage, context_vector=None):
    # --------------------------------
    start = time.time()
    embedded = self.embedding(input).view(1, BATCH_SIZE, -1) # [1, BATCH, EMBEDDING]
    embedded = self.dropout(embedded) # [1, BATCH, EMBEDDING]
    embedded = embedded.squeeze(0) # [BATCH, EMBEDDING] NOTE: da chiedere
    self.decoder_times[0] += time.time() - start
    # --------------------------------


    # --------------------------------
    start = time.time()

    # hidden = [1, BATCH, HIDDEN]
    # encoder_outputs = [BATCH, ENCODER_INPUT_SIZE, HIDDEN]
    # coverage = [BATCH, ENCODER_INPUT_SIZE]

    if context_vector is None:
      context_vector, _, _ = self.calcAttn(hidden, encoder_outputs, coverage)
    #context_vector = [BATCH, HIDDEN]

    self.decoder_times[1] += time.time() - start
    # --------------------------------

    # --------------------------------
    start = time.time()
    
    # input -> [BATCH, EMBEDDING + HIDDEN]
    new_input = self.newIn(torch.cat((embedded, context_vector), 1)) #- [BATCH, HIDDEN]

    output, hidden = self.gru(new_input.view(1, BATCH_SIZE, -1), hidden) # [1, BATCH, HIDDEN]
    output = output.squeeze(0) #- [BATCH, HIDDEN]

    self.decoder_times[2] += time.time() - start
    # --------------------------------

    # --------------------------------
    start = time.time()

    context_vector, attn_weights, coverage = self.calcAttn(hidden, encoder_outputs, coverage)
    #- coverage -> [BATCH, ENCODER_INPUT_SIZE]
    #- context_vector -> [BATCH, HIDDEN]
    #- attn_weights -> [BATCH, ENCODER_INPUT_SIZE]

    output = torch.cat((output, context_vector), 1) # [BATCH, HIDDEN * 2]
    output = self.preOut(output) # [BATCH, HIDDEN]
    output = self.tanhout(output) #- [BATCH, HIDDEN] NOTE: da chiedere

    self.decoder_times[3] += time.time() - start
    # --------------------------------

    # output = self.dropout2(output) 0.5

    # --------------------------------
    start = time.time()

    output = self.out(output) # [BATCH, OUTPUT_VOCAB_SIZE]

    output = F.log_softmax(output, dim=1) #- [BATCH, OUTPUT_VOCAB_SIZE]

    self.decoder_times[4] += time.time() - start
    # --------------------------------

    return output, hidden, context_vector, attn_weights, coverage

  def initHidden(self):
    return torch.zeros(1, 1, self.hidden_size, device=device)

# TRAINING

In [108]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
  encoder_hidden = encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length = input_tensor.size(2) # == PHRASE_SIZE
  target_length = target_tensor.size(1)

  loss = 0

  encoder_outputs = torch.zeros(ENCODER_INPUT_SIZE, BATCH_SIZE, encoder.hidden_size, device=device)

  for i in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[:,:,i], encoder_hidden) # [1, BATCH, HIDDEN]
    encoder_outputs[i] = encoder_output[0]

  encoder_outputs = encoder_outputs.permute(1, 0, 2) # [BATCH, ENCODER_INPUT_SIZE, HIDDEN]

  decoder_input = torch.tensor([type_vocab.getID(START_TOKEN) for _ in range(BATCH_SIZE)], device=device)

  decoder_hidden = encoder_hidden
  coverage = torch.zeros(BATCH_SIZE, ENCODER_INPUT_SIZE, device=device)
  context_vector = None

  start = time.time()

  for di in range(target_length):
    start_cycle = time.time()

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    # use_teacher_forcing = True

    if use_teacher_forcing:
      # Teacher forcing: Feed the target as the next input
      decoder_output, decoder_hidden, context_vector, attn_weights, coverage = decoder(encoder_outputs, decoder_input, decoder_hidden, coverage, context_vector)
      decoder_input = target_tensor[:, di]  # Teacher forcing
        
    else:
      # Without teacher forcing: use its own predictions as the next input
      decoder_output, decoder_hidden, context_vector, attn_weights, coverage = decoder(encoder_outputs, decoder_input, decoder_hidden, coverage, context_vector)
      topv, topi = decoder_output.topk(1)
      decoder_input = topi.squeeze()

    loss += criterion(decoder_output, target_tensor[:,di])

    print(f"iter time {di}: {asMsecs(time.time() - start_cycle)}")


  # for i in range(len(decoder.decoder_times)):
  #   print(f"avg {i} = {asMsecs(decoder.decoder_times[i] / len(train_pairs))}")
  #   print(f"tot {i} = {asMsecs(decoder.decoder_times[i])}")
  #   print("------------------")

  print(f"train decoder time: {asMsecs(time.time() - start)}")

  raise Exception("stop") # solo per debug
  
  start = time.time()

  loss = loss / target_length
  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  print(f"train backward time: {asMsecs(time.time() - start)}")
  start = time.time()


  return loss.item()

In [109]:
def trainEpoch(encoder, decoder, inputs, print_times=10, plot_times=10000, learning_rate=5e-5):
  start = time.time()
  plot_losses = []
  print_loss_total = 0  # Reset every print_every
  plot_loss_total = 0  # Reset every plot_every
  epoch_len = len(inputs)
  plot_every = max(int(epoch_len / plot_times), 1)
  print_every = max(int(epoch_len / print_times), 1)

  encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
  criterion = nn.NLLLoss()

  for iter in range(1, epoch_len+1):
    # ogni elemento di inputs è una tupla (input, target)
    # ogni valore input è un tensore di dimensione [3, batch, encoder_input_size], deve 3 rappresenta (tipo, valore, posizione)
    # ogni valore target è un tensore di dimensione [batch, decoder_output_size]

    training_pair = inputs[iter-1]
    input_tensor = training_pair[0]
    target_tensor = training_pair[1]

    start_time = time.time()

    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    print(f"iter {iter} time: {timeSince(start_time, 1)}")

    print_loss_total += loss
    plot_loss_total += loss

    if iter % print_every == 0:
      print_loss_avg = print_loss_total / print_every
      print_loss_total = 0
      print(f"{timeSince(start, iter / epoch_len+1)} ({iter} {iter / (epoch_len+1) * 100:.2f}%) {print_loss_avg:.4f}")

    if iter % print_every == 0:
      plot_loss_avg = plot_loss_total / plot_every
      plot_losses.append(plot_loss_avg)
      plot_loss_total = 0

  showPlot(plot_losses)
  return getPlot(plot_losses)


# EVALUATION

In [110]:
teacher_forcing_ratio = 0.5

def evaluate(input_tensor, target_tensor, encoder, decoder, criterion):
  encoder_hidden = encoder.initHidden()

  input_length = input_tensor.size(2) # == PHRASE_SIZE
  target_length = target_tensor.size(1)

  loss = 0

  encoder_outputs = torch.zeros(ENCODER_INPUT_SIZE, BATCH_SIZE, encoder.hidden_size, device=device)

  for i in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[:,:,i], encoder_hidden) # [1, BATCH, HIDDEN]
    encoder_outputs[i] = encoder_output[0]

  encoder_outputs = encoder_outputs.permute(1, 0, 2) # [BATCH, ENCODER_INPUT_SIZE, HIDDEN]

  decoder_input = torch.tensor([[type_vocab.getID(START_TOKEN)] for _ in range(BATCH_SIZE)], device=device)
  decoder_input = decoder_input.view(1, BATCH_SIZE)

  decoder_hidden = encoder_hidden
  coverage = torch.zeros(BATCH_SIZE, ENCODER_INPUT_SIZE, device=device)
  context_vector = None

  decoder_outputs = []

  for di in range(target_length):
    decoder_output, decoder_hidden, context_vector, attn_weights, coverage = decoder(encoder_outputs, decoder_input, decoder_hidden, coverage, context_vector)
    topv, topi = decoder_output.topk(1)
    decoder_input = topi

    loss += criterion(decoder_output, target_tensor[:,di])
    decoder_outputs.append(decoder_output)

  loss = loss / target_length

  return loss.item(), decoder_outputs

In [111]:
def evaluateEpoch(encoder, decoder, inputs):
  start = time.time()
  plot_losses = []
  loss_total = 0  # Reset every print_every
  epoch_len = len(inputs)

  criterion = nn.NLLLoss()

  sample_start = None
  sample_end = None

  for iter in range(1, epoch_len + 1):
    training_pair = inputs[iter-1]
    input_tensor = training_pair[0]
    target_tensor = training_pair[1]

    loss, decoder_outputs = evaluate(input_tensor, target_tensor, encoder, decoder, criterion)
    loss_total += loss

    if iter == 0:
      sample_start = [decoder_outputs, target_tensor]
    if iter == epoch_len - 1:
      sample_end = [decoder_outputs, target_tensor]
  
  avg_loss = loss_total / epoch_len
  return avg_loss, sample_start, sample_end


# TESTING

In [112]:

encoder = EncoderRNN(type_vocab, value_vocab, 256, 128).to(device)
decoder = AttnDecoderRNN(len(token_vocab), 256, 128).to(device)

# decoder = torch.load("saved_models/decoder_24.08_14.55-20000-iters.pt", map_location=device)
# encoder = torch.load("saved_models/encoder_24.08_14.55-20000-iters.pt", map_location=device)

done_epochs = 0

In [113]:
from datetime import datetime
from math import ceil
import os

EPOCHS = 1
FLAT = 3

PLOT_TIMES = 1000
PRINT_TIMES = 5
BATCH_PRINT_SIZE = 5
SAVE_MODEL_EVERY = 1
SAVE_PLOT_EVERY = 1

start_time = str(datetime.now().strftime("%d.%m_%H.%M"))
output_file = f"{base_path}/output/out-{start_time}.txt"

# with open(output_file, 'w', encoding='utf-8') as outfile: pass

prec_loss = 0

def saveModel(encoder, decoder, epoch):
  torch.save(encoder, f"{base_path}/models/encoder_{start_time}-ep_{epoch}-iters.pt")
  torch.save(decoder, f"{base_path}/models/decoder_{start_time}-ep_{epoch}-iters.pt")

def savePlot(plot, epoch):
  plot.savefig(f"{base_path}/plots/plot_{start_time}-ep_{epoch}-iters.png")

def saveOutput(output, target, epoch):
  with open(output_file, 'a', encoding='utf-8') as outfile:
    for i in range(min(BATCH_PRINT_SIZE, len(output))):
      outfile.write("----------------------\n")
      predict = ""
      target = ""
      for word in output[i]:
        predict += str(word) + " "
      outfile.write(predict + "\n")
      # outfile.write("-.... ↑|predict|↑ ....... ↓|target|↓ ....-\n")
      # for word in pairs[0][1][i]:
      #   target += token_vocab.getWord(word.item()) + " "
      # outfile.write(target + "\n")
    

for epoch in range(1, EPOCHS+1):
  print(f"----========= EPOCH {epoch}/{EPOCHS}=========----")
  epoch_start = time.time()
  
  random.shuffle(pairs)
  plot = trainEpoch(encoder, decoder, train_pairs, print_times=PRINT_TIMES, plot_times=PLOT_TIMES)
  print(f"------------------- Trained -------------------")
  curr_loss, sample_start, sample_end = evaluateEpoch(encoder, decoder, test_pairs)

  temp_loss = calc_avg_loss(prec_loss, curr_loss)
  if(prec_loss < temp_loss):
    FLAT -= 1
  else:
    FLAT = 3
  
  if(FLAT == 0):
    break
  
  prec_loss = temp_loss

  print(f"------------------- Finished epoch -------------------")
  print(f"time: {int((time.time() - epoch_start)/60)}min")
  print(f"loss: {curr_loss}, avg loss: {temp_loss}, flat: {FLAT}")

  # saveOutput(sample_start[0], sample_start[1], epoch)
  # saveOutput(sample_end[0], sample_end[1], epoch)

  # if epoch % SAVE_PLOT_EVERY == 0:
  #   savePlot(plot, epoch, epoch)

  # if epoch % SAVE_MODEL_EVERY == 0:
  #   saveModel(encoder, decoder, epoch, epoch)


  

iter time 0: 0s 58ms
iter time 1: 0s 1ms
iter time 2: 0s 1ms
iter time 3: 0s 1ms
iter time 4: 0s 0ms
iter time 5: 0s 1ms
iter time 6: 0s 1ms
iter time 7: 0s 0ms
iter time 8: 0s 3ms
iter time 9: 0s 1ms
iter time 10: 0s 1ms
iter time 11: 0s 0ms
iter time 12: 0s 1ms
iter time 13: 0s 0ms
iter time 14: 0s 1ms
iter time 15: 0s 1ms
iter time 16: 0s 1ms
iter time 17: 0s 1ms
iter time 18: 0s 1ms
iter time 19: 0s 0ms
iter time 20: 0s 1ms
iter time 21: 0s 1ms
iter time 22: 0s 2ms
iter time 23: 0s 1ms
iter time 24: 0s 4ms
iter time 25: 0s 2ms
iter time 26: 0s 2ms
iter time 27: 0s 1ms
iter time 28: 0s 1ms
iter time 29: 0s 16ms
iter time 30: 0s 46ms
iter time 31: 0s 32ms
iter time 32: 0s 44ms
iter time 33: 0s 43ms
iter time 34: 0s 30ms
train decoder time: 0s 332ms


Exception: stop

In [None]:
# encoder = torch.load("saved_models/encoder_100000-iters.pt")
# decoder = torch.load("saved_models/decoder_100000-iters.pt")

# test_input = pairs[0][0]

# outputs, _ = evaluate(encoder, decoder, test_input)
# for i in range(len(outputs)):
#   print("----------------------")
#   predict = ""
#   target = ""
#   for word in outputs[i]:
#     predict += str(word) + " "
#   print(predict)
#   print("-.... ↑|predict|↑ ....... ↓|target|↓ ....-")
#   for word in pairs[0][1][i]:
#     target += token_vocab.getWord(word.item()) + " "
#   print(target)
