# MED: Morphological Encoder-Decoder

In [1]:
import numpy as np
import time
import os.path

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence#, masked_cross_entropy
from masked_cross_entropy import *

from med_dataset import MEDDataset, med_collate_fn
from torch.utils.data import DataLoader

# True when PyTorch reports a usable CUDA device.
USE_CUDA = torch.cuda.is_available()

# Vocabulary indices reserved for the special symbols (padding, start, end, unknown).
PAD_TOKEN, START_TOKEN, END_TOKEN, UNK_TOKEN = range(4)

## The Encoder

In [2]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded symbol sequences.

    The outputs of the two directions are added together, so every time step
    ends up represented by ``hidden_size`` features.
    """

    def __init__(self, input_size, embed_size, hidden_size, n_layers=1, dropout=0.1):
        """Build the embedding table and the bidirectional GRU.

        Args:
            input_size: number of rows in the embedding table.
            embed_size: dimensionality of each embedding vector.
            hidden_size: GRU hidden size per direction.
            n_layers: number of stacked GRU layers.
            dropout: dropout value handed to ``nn.GRU``.
        """
        super(EncoderRNN, self).__init__()

        # Hyper-parameters are kept on the instance for later reference
        self.input_size, self.embed_size = input_size, embed_size
        self.hidden_size, self.n_layers = hidden_size, n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=PAD_TOKEN)
        self.gru = nn.GRU(embed_size, hidden_size, n_layers,
                          dropout=self.dropout, bidirectional=True)

    def forward(self, input_seqs, input_lengths, hidden=None):
        """Encode a whole padded batch in a single call.

        Args:
            input_seqs: padded index tensor; packing uses the default
                ``batch_first=False``, i.e. seq_len x batch.
            input_lengths: per-sequence lengths passed to the packing helper.
            hidden: optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is the sum of the forward
            and backward GRU outputs and ``hidden`` is the state returned by
            the GRU.
        """
        rnn_utils = torch.nn.utils.rnn
        packed_in = rnn_utils.pack_padded_sequence(self.embedding(input_seqs), input_lengths)
        packed_out, hidden = self.gru(packed_in, hidden)
        # Unpack back into a padded tensor; the returned lengths are not needed
        padded, _ = rnn_utils.pad_packed_sequence(packed_out)
        # First half of the feature axis is one direction, second half the other
        half = self.hidden_size
        summed = padded[:, :, :half] + padded[:, :, half:]
        return summed, hidden

## The Decoder

### The Attention Module

In [3]:
class Attn(nn.Module):
    """Attention weights over encoder outputs for one decoder step.

    Supported scoring methods:

    * ``'dot'``     -- ``hidden . encoder_output``
    * ``'general'`` -- ``hidden . W(encoder_output)``
    * ``'concat'``  -- ``v . W([hidden; encoder_output])``

    All energies for a batch are computed with a handful of tensor operations
    instead of one Python-level call per (batch, position) pair.
    """

    METHODS = ('dot', 'general', 'concat')

    def __init__(self, method, hidden_size):
        """Create the layers required by ``method``.

        Args:
            method: one of ``'dot'``, ``'general'`` or ``'concat'``.
            hidden_size: feature size of decoder state and encoder outputs.

        Raises:
            ValueError: if ``method`` is not a supported scoring method.
        """
        super(Attn, self).__init__()

        if method not in self.METHODS:
            raise ValueError("unknown attention method: {!r}".format(method))

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(1, n) is uninitialised memory and may contain
            # NaN/inf, so draw the starting values explicitly.
            self.v = nn.Parameter(
                torch.FloatTensor(1, hidden_size).normal_(0, hidden_size ** -0.5))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape B x 1 x S.

        Args:
            hidden: decoder state, 1 x B x N (as produced by a single GRU
                step); a B x N tensor is accepted as well.
            encoder_outputs: encoder outputs, S x B x N.

        Returns:
            Tensor of shape B x 1 x S whose last axis sums to 1.
        """
        keys = encoder_outputs.transpose(0, 1)  # S x B x N -> B x S x N

        if hidden.dim() == 3:
            query = hidden.transpose(0, 1)      # 1 x B x N -> B x 1 x N
        else:
            query = hidden.unsqueeze(1)         # B x N -> B x 1 x N

        attn_energies = self.score(query, keys)  # B x S

        # Normalize over source positions, then reshape to B x 1 x S
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_output):
        """Compute attention energies, reducing the trailing feature axis.

        ``hidden`` must be broadcastable against ``encoder_output``; e.g.
        B x 1 x N against B x S x N gives B x S, and a single pair of
        1 x N tensors gives a one-element result.
        """
        if self.method == 'dot':
            return (hidden * encoder_output).sum(-1)

        if self.method == 'general':
            return (hidden * self.attn(encoder_output)).sum(-1)

        if self.method == 'concat':
            # NOTE(review): the concat score in Luong et al. applies tanh
            # before projecting with v; this implementation never did and is
            # left unchanged so existing checkpoints behave the same.
            joined = torch.cat((hidden.expand_as(encoder_output), encoder_output), -1)
            return (self.attn(joined) * self.v).sum(-1)

        raise ValueError("unknown attention method: {!r}".format(self.method))

### Bahdanau et al. model

In [4]:
class BahdanauAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with 'concat' attention (Bahdanau et al. style).

    The attention context is computed from the previous hidden state,
    concatenated with the embedded input symbol and fed to the GRU; the
    prediction is made from the GRU output concatenated with that context.
    """

    def __init__(self, embed_size, hidden_size, output_size, n_layers=1, dropout_p=0.1,
                 max_length=None):
        """Build the decoder layers.

        Args:
            embed_size: dimensionality of the output-symbol embeddings.
            hidden_size: GRU hidden size (must match the encoder output size).
            output_size: number of output symbols.
            n_layers: number of stacked GRU layers.
            dropout_p: dropout probability for embeddings and the GRU.
            max_length: optional value stored on the instance; it is not used
                by this class. (It used to be read from an undefined name,
                which made construction fail with ``NameError``.)
        """
        super(BahdanauAttnDecoderRNN, self).__init__()

        # Define parameters
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Define layers
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=PAD_TOKEN)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn('concat', hidden_size)
        # The GRU consumes [embedded symbol ; context], hence the summed input size
        self.gru = nn.GRU(embed_size + hidden_size, hidden_size, n_layers, dropout=dropout_p)
        # The classifier consumes [GRU output ; context]
        self.out = nn.Linear(hidden_size * 2, output_size)

    def forward(self, word_input, last_hidden, encoder_outputs):
        """Run one decoding step for a whole batch.

        Args:
            word_input: previous output symbols, one index per batch element.
            last_hidden: previous GRU hidden state, n_layers x B x N.
            encoder_outputs: encoder outputs, S x B x N.

        Returns:
            ``(output, hidden, attn_weights)``: log-probabilities (B x
            output_size), the new hidden state, and the attention weights
            (B x 1 x S).
        """
        # Get the embedding of the current input word (last output word)
        batch_size = word_input.size(0)
        word_embedded = self.embedding(word_input).view(1, batch_size, -1)  # S=1 x B x E
        word_embedded = self.dropout(word_embedded)

        # Attend with the top layer's previous state; slicing (instead of
        # indexing) keeps the leading layer axis, i.e. 1 x B x N
        attn_weights = self.attn(last_hidden[-1:], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B x 1 x N
        context = context.transpose(0, 1)  # 1 x B x N

        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((word_embedded, context), 2)
        output, hidden = self.gru(rnn_input, last_hidden)

        # Final output layer over [GRU output ; context], both B x N
        output = output.squeeze(0)
        context = context.squeeze(0)
        output = F.log_softmax(self.out(torch.cat((output, context), 1)), dim=1)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

### Luong et al. model

In [5]:
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with Luong-style attention.

    The GRU is advanced first; its output then attends over the encoder
    outputs, and the attended context is combined with the GRU output to
    predict the next symbol.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        """Build the decoder layers.

        Args:
            attn_model: scoring method handed to ``Attn``; with ``'none'`` no
                attention module is created (``forward`` calls ``self.attn``
                unconditionally, so it cannot run in that configuration).
            hidden_size: GRU hidden size, also used as the embedding size.
            output_size: number of output symbols.
            n_layers: number of stacked GRU layers.
            dropout: dropout value for the embeddings and the GRU.
        """
        super(LuongAttnDecoderRNN, self).__init__()

        # Configuration, stored for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=PAD_TOKEN)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        # Attention module (skipped for 'none')
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input_seq: previous output symbols, one index per batch element.
            last_hidden: previous GRU hidden state.
            encoder_outputs: encoder outputs, indexed as seq x batch x hidden.

        Returns:
            ``(output, hidden, attn_weights)``: unnormalised scores over the
            output symbols, the new GRU hidden state and the attention
            weights.
        """
        # Embed the previous symbol and shape it as a length-1 sequence: 1 x B x N
        step_input = self.embedding_dropout(self.embedding(input_seq))
        step_input = step_input.view(1, input_seq.size(0), self.hidden_size)

        # Advance the GRU by one step
        rnn_output, hidden = self.gru(step_input, last_hidden)

        # Attend over the encoder outputs with the fresh GRU output and take
        # the weighted average of the encoder outputs as context (B x 1 x N)
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))

        # Attentional vector from [GRU output ; context], both reduced to
        # B x N (Luong eq. 5)
        features = torch.cat((rnn_output.squeeze(0), context.squeeze(1)), 1)
        attentional = F.tanh(self.concat(features))

        # Scores for the next token (Luong eq. 6, without softmax)
        return self.out(attentional), hidden, attn_weights

## Training

In [6]:
def train_step(src_batch, src_lens, trg_batch, trg_lens, encoder, decoder, 
               encoder_optimizer, decoder_optimizer, criterion):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        src_batch: padded source batch passed to the encoder.
        src_lens: source lengths passed to the encoder.
        trg_batch: padded target batch, indexed as seq x batch.
        trg_lens: target lengths; their maximum bounds the decoding loop.
        encoder: encoder module.
        decoder: decoder module exposing ``n_layers`` and ``output_size``.
        encoder_optimizer: optimizer for the encoder parameters.
        decoder_optimizer: optimizer for the decoder parameters.
        criterion: unused; the loss is always ``masked_cross_entropy``. Kept
            so existing callers keep working.

    Returns:
        The batch loss as a Python number.
    """
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Run words through encoder
    encoder_outputs, encoder_hidden = encoder(src_batch, src_lens, None)

    # Size everything from the batch actually received: the last batch of an
    # epoch can be smaller than the configured batch size.
    batch_size = trg_batch.size(1)

    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([START_TOKEN] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

    max_trg_len = max(trg_lens)
    all_decoder_outputs = Variable(torch.zeros(max_trg_len, batch_size, decoder.output_size))

    # Move new Variables to CUDA
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()

    # Run through decoder one time step at a time
    for t in range(max_trg_len):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        all_decoder_outputs[t] = decoder_output
        decoder_input = trg_batch[t] # Teacher forcing: next input is current target

    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        trg_batch.transpose(0, 1).contiguous(), # -> batch x seq
        trg_lens
    )
    loss.backward()

    # Clip gradient norms (threshold comes from the module-level ``clip``)
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Newer PyTorch releases expose .item() on scalar losses and reject
    # indexing them; older releases only support .data[0].
    return loss.item() if hasattr(loss, 'item') else loss.data[0]

In [7]:
def save_checkpoint(encoder, decoder, checkpoint_dir):
    """Save the encoder and decoder state dicts as a timestamped pair.

    Files are written as ``enc-<stamp>.pth`` and ``dec-<stamp>.pth`` inside
    ``checkpoint_dir``, which is created if it does not exist yet.

    Args:
        encoder: module whose ``state_dict()`` is saved to the ``enc-`` file.
        decoder: module whose ``state_dict()`` is saved to the ``dec-`` file.
        checkpoint_dir: directory that receives both files.
    """
    # One timestamp for both files, so a pair can always be matched by name
    # even when the clock ticks over between the two saves.
    stamp = time.strftime("%d%m%y-%H%M%S")
    os.makedirs(checkpoint_dir, exist_ok=True)

    enc_filename = os.path.join(checkpoint_dir, "enc-{}.pth".format(stamp))
    dec_filename = os.path.join(checkpoint_dir, "dec-{}.pth".format(stamp))
    torch.save(encoder.state_dict(), enc_filename)
    torch.save(decoder.state_dict(), dec_filename)
    print("Model saved.")

def train(dataset, batch_size, n_epochs, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
          checkpoint_dir=None, save_every=500):
    """Train the encoder/decoder pair for ``n_epochs`` passes over ``dataset``.

    Batches are drawn in shuffled order. The loss is printed every 100
    batches. When ``checkpoint_dir`` is set, a checkpoint is written every
    ``save_every`` batches (counted from the start of each epoch) and once
    more after the final epoch.
    """
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=4,
                        collate_fn=med_collate_fn)

    for epoch in range(1, n_epochs + 1):
        print("Epoch {}/{}".format(epoch, n_epochs))
        for batch_idx, (src, src_lens, trg, trg_lens) in enumerate(loader):
            loss = train_step(src, src_lens, trg, trg_lens,
                              encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

            # Periodic progress report
            if batch_idx % 100 == 0:
                print("batch: {}, loss: {}".format(batch_idx, loss))

            # Periodic checkpoint
            if checkpoint_dir and batch_idx % save_every == 0:
                save_checkpoint(encoder, decoder, checkpoint_dir)

    # Final checkpoint once all epochs are done
    if checkpoint_dir:
        save_checkpoint(encoder, decoder, checkpoint_dir)

## Configuring and Initializing Models

In [8]:
# --- Model architecture ---
attn_model = 'dot'              # attention scoring method handed to the decoder
embed_size = 300                # encoder embedding size
hidden_size = 100               # GRU hidden size for encoder and decoder
n_layers = 1
dropout = 0.1

# --- Data and checkpointing ---
batch_size = 20
checkpoint_dir = "checkpoints"

# --- Training / optimization ---
n_epochs = 20
learning_rate = 0.0001
decoder_learning_ratio = 5.0    # decoder learning rate = learning_rate * this ratio
clip = 50.0                     # gradient-norm threshold read by train_step
teacher_forcing_ratio = 0.5     # not referenced by the code visible in this file

In [9]:
# Load the training data
dataset = MEDDataset("data/german-task2-train")

# Build encoder and decoder; the vocabularies fix the embedding/output sizes
encoder = EncoderRNN(
    input_size=len(dataset.in_vocab[0]),
    embed_size=embed_size,
    hidden_size=hidden_size,
    n_layers=n_layers,
    dropout=dropout,
)
decoder = LuongAttnDecoderRNN(
    attn_model=attn_model,
    hidden_size=hidden_size,
    output_size=len(dataset.out_vocab[0]),
    n_layers=n_layers,
    dropout=dropout,
)

# One Adam optimizer per model; the decoder uses a scaled learning rate
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
criterion = nn.CrossEntropyLoss()

# Put both models on the GPU when one is available
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

In [11]:
# Uncomment to resume from a previously saved checkpoint pair:
#encoder.load_state_dict(torch.load("checkpoints/enc-150518-224957.pth"))
#decoder.load_state_dict(torch.load("checkpoints/dec-150518-224957.pth"))

# Start training with the configuration defined above
train(dataset=dataset,
      batch_size=batch_size,
      n_epochs=n_epochs,
      encoder=encoder,
      decoder=decoder,
      encoder_optimizer=encoder_optimizer,
      decoder_optimizer=decoder_optimizer,
      criterion=criterion,
      checkpoint_dir=checkpoint_dir)

Epoch 1/20


  log_probs_flat = functional.log_softmax(logits_flat)
  seq_range = torch.range(0, max_len - 1).long()


batch: 0, loss: 2.3484816551208496
Model saved.
batch: 100, loss: 1.8132071495056152
batch: 200, loss: 0.9923098683357239
batch: 300, loss: 0.6644088625907898
batch: 400, loss: 0.32585662603378296
batch: 500, loss: 0.304109662771225
Model saved.
batch: 600, loss: 0.28833991289138794
batch: 700, loss: 0.2485450953245163
batch: 800, loss: 0.18855859339237213
batch: 900, loss: 0.22373977303504944
batch: 1000, loss: 0.1716632843017578
Model saved.
batch: 1100, loss: 0.32588452100753784
batch: 1200, loss: 0.23767510056495667
Epoch 2/20
batch: 0, loss: 0.1861034333705902
Model saved.
batch: 100, loss: 0.17188267409801483
batch: 200, loss: 0.24478620290756226
batch: 300, loss: 0.3629983961582184
batch: 400, loss: 0.1534886360168457
batch: 500, loss: 0.2529307007789612
Model saved.
batch: 600, loss: 0.21300119161605835
batch: 700, loss: 0.15426452457904816
batch: 800, loss: 0.21426984667778015
batch: 900, loss: 0.2350471466779709
batch: 1000, loss: 0.14383719861507416
Model saved.
batch: 1100,

Process Process-55:
Process Process-54:
Process Process-56:
Process Process-53:
Traceback (most recent call last):
  File "/home/tome/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/tome/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/tome/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
    r = index_queue.get()
  File "/home/tome/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/tome/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/tome/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter

KeyboardInterrupt: 

## Translating

In [13]:
def translate(input_seq, input_len, encoder, decoder, max_length=40):
    """Greedily decode a single input sequence.

    Args:
        input_seq: index tensor for one source sequence, as produced by the
            data loader with ``batch_size=1``.
        input_len: source length(s) passed on to the encoder.
        encoder: encoder module.
        decoder: decoder module exposing ``n_layers``.
        max_length: maximum number of decoding steps.

    Returns:
        ``(decoded, attentions)``: the decoded string (symbols looked up in
        the module-level ``dataset.out_vocab[0]``) and a
        ``steps x source_length`` tensor holding the attention weights of
        every decoding step that was run.
    """
    input_seq = Variable(input_seq, volatile=True)

    if USE_CUDA:
        input_seq = input_seq.cuda()

    # Switch to evaluation mode to disable dropout, remembering the previous
    # modes so a model that was already in eval mode is left that way.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.train(False)
    decoder.train(False)

    decoded_chars = []
    steps = 0
    try:
        # Run through encoder
        encoder_outputs, encoder_hidden = encoder(input_seq, input_len, None)

        # Create starting vectors for decoder
        decoder_input = Variable(torch.LongTensor([START_TOKEN]), volatile=True) # SOS
        decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

        if USE_CUDA:
            decoder_input = decoder_input.cuda()

        # One attention row per decoding step, one column per source position
        decoder_attentions = torch.zeros(max(max_length, 0), encoder_outputs.size(0))

        # Run through decoder
        for t in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Attention weights are B x 1 x S with B == 1 here
            decoder_attentions[t] = decoder_attention.data.view(-1).cpu()
            steps = t + 1

            # Choose top word from output
            _, token_idx = decoder_output.data.topk(1)
            tok = int(token_idx[0][0])
            if tok == END_TOKEN:
                break
            decoded_chars.append(dataset.out_vocab[0][tok])

            # Next input is chosen word
            decoder_input = Variable(torch.LongTensor([tok]), volatile=True)
            if USE_CUDA:
                decoder_input = decoder_input.cuda()
    finally:
        # Restore the modes even if decoding raised
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return "".join(decoded_chars), decoder_attentions[:steps]

In [14]:
def translate_dataset(file_name, encoder, decoder):
    """Decode every entry of ``file_name`` and write the results to disk.

    The file is loaded as a ``MEDDataset`` with ``train=False`` and decoded
    one sequence at a time in file order. The decoded strings are written,
    newline-separated, to ``file_name + "-results"``.
    """
    test_set = MEDDataset(file_name, train=False)
    loader = DataLoader(dataset=test_set,
                        batch_size=1,
                        shuffle=False,
                        num_workers=4,
                        collate_fn=med_collate_fn)

    # translate() returns (decoded string, attentions); only the string is kept
    decoded_words = [
        translate(input_seq, input_len, encoder, decoder)[0]
        for input_seq, input_len, _, _ in loader
    ]

    with open(file_name + "-results", 'w') as outfile:
        outfile.write("\n".join(decoded_words))

In [15]:
# Decode the test split with the trained models; results go to "<file>-results"
translate_dataset(file_name="data/german-task2-test", encoder=encoder, decoder=decoder)

