In [2]:
import re
import os
import argparse
import shutil
import numpy as np

import torch
import torch.nn.functional as F
from torch.autograd import Variable

from data_utils import *

In [3]:
# Shape sanity check for torch.nn.LSTM: 3 stacked unidirectional layers,
# 9 input features, 5 hidden units, batch-major input.
rnn = torch.nn.LSTM(9, 5, num_layers=3, bidirectional=False, batch_first=True)

# With batch_first=True the input is (batch=128, seq_len=3, features=9).
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# other cells may rely on this notebook-global.
input = Variable(torch.randn(128, 3, 9))
# The initial states are laid out as (num_layers, batch, hidden).
h0 = Variable(torch.randn(3, 128, 5))
c0 = Variable(torch.randn(3, 128, 5))
output, hn = rnn(input, (h0, c0))
hh, cc = hn
# Parenthesized single-argument print is valid on both Python 2 and Python 3
# and produces the same text as the original print statements.
print(hh.size())
print(cc.size())
print(hh[-1].size())
print(cc[-1].size())
print(output.size())

torch.Size([3, 128, 5])
torch.Size([3, 128, 5])
torch.Size([128, 5])
torch.Size([128, 5])
torch.Size([128, 3, 5])


In [8]:
class AttentionBahdanau(torch.nn.Module):
    """Parameter-free dot-product attention over a sequence of encoder states.

    NOTE(review): despite the class name, the score computed in ``forward`` is
    a plain dot product between the decoder state and each encoder state; no
    learned (additive) scoring layer is involved.
    """

    def __init__(self):
        super(AttentionBahdanau, self).__init__()

    def forward(self, last_dehy, enhy):
        """Attend over ``enhy`` using ``last_dehy`` as the query.

        Args:
            last_dehy: decoder state, shape (batch, hidden).
            enhy: encoder states, shape (batch, src_len, hidden).

        Returns:
            h_attn: tanh of the attention-weighted sum of encoder states,
                shape (batch, hidden).
            attn: attention weights, shape (batch, src_len); each row sums
                to 1.
        """
        # (batch, src_len, hidden) x (batch, hidden, 1) -> (batch, src_len)
        scores = torch.bmm(enhy, last_dehy.unsqueeze(2)).squeeze(2)
        # Normalise over source positions. The dimension is spelled out
        # because relying on softmax's implicit dimension is deprecated.
        attn = F.softmax(scores, dim=1)
        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, hidden)
        h_attn = torch.bmm(attn.unsqueeze(1), enhy).squeeze(1)
        h_attn = torch.tanh(h_attn)

        return h_attn, attn

# Smoke test of AttentionBahdanau on the GPU.
# rd1 plays the decoder state (batch=128, hidden=100);
# rd2 plays the encoder states (batch=128, src_len=16, hidden=100).
rd1 = Variable(torch.FloatTensor(torch.rand([128, 100]))).cuda()
rd2 = Variable(torch.FloatTensor(torch.rand([128, 16, 100]))).cuda()
model = AttentionBahdanau().cuda()
# Parenthesized single-argument print works on both Python 2 and Python 3.
print(model)
print(rd1.size())
print(rd2.size())
# The module returns (h_attn, attn): hh is the attended context (128 x 100)
# and cc holds the attention weights (128 x 16), not an LSTM cell state.
hh, cc = model(rd1, rd2)
print(hh)
print(cc)

AttentionBahdanau (
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
 0.4004  0.6618  0.6373  ...   0.4453  0.5995  0.3678
 0.6174  0.6409  0.5982  ...   0.5769  0.4256  0.5914
 0.4224  0.4784  0.3986  ...   0.4579  0.5785  0.3653
          ...             ⋱             ...          
 0.5179  0.2139  0.4488  ...   0.5980  0.4463  0.3356
 0.3713  0.3683  0.5257  ...   0.4118  0.3571  0.5281
 0.4696  0.3809  0.3432  ...   0.3952  0.4139  0.6197
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0056  0.6323  0.0123  ...   0.0014  0.0013  0.0554
 0.0110  0.0023  0.0174  ...   0.1912  0.0134  0.1656
 0.0120  0.0605  0.0083  ...   0.1307  0.0048  0.2860
          ...             ⋱             ...          
 0.0001  0.0016  0.0399  ...   0.0000  0.4289  0.1417
 0.2288  0.0130  0.0445  ...   0.0103  0.2388  0.0874
 0.1726  0.0176  0.0039  ...   0.0044  0.0164  0.0126
[torch.cuda.FloatTensor of size 128x16 (GPU 0)]



In [30]:
class LSTMAttentionBahdanau(torch.nn.Module):
    """Single-layer LSTM whose every step is conditioned on an attention
    context computed over the encoder states.

    At each time step the gates are produced by one linear map applied to the
    concatenation ``[x_t, h_{t-1}, attention(h_{t-1}, encoder states)]``.

    Args:
        input_size: feature size of each input step.
        hidden_size: size of the hidden and cell states; the encoder states
            passed to ``forward`` must have this feature size too, because
            they are dot-multiplied with the hidden state.
        num_layers: stored as ``n_layer`` but not used; only one layer is
            implemented.
        batch_first: if True, ``input_`` and the returned output are
            (batch, seq_len, feature); otherwise (seq_len, batch, feature).
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers=1,
        batch_first=True
    ):
        super(LSTMAttentionBahdanau, self).__init__()
        # parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layer = num_layers
        self.batch_first = batch_first

        # One projection yields all four gates from [x_t, h_{t-1}, context].
        self.lstm_input_vec = torch.nn.Linear(
            self.input_size + 2 * self.hidden_size,
            4 * self.hidden_size
        )

        self.attn_layer = AttentionBahdanau()

        # The original constructor moved its sub-modules to the GPU
        # unconditionally, which raises on CPU-only hosts. Keep that placement
        # when CUDA exists and stay on the CPU otherwise.
        if torch.cuda.is_available():
            self.cuda()

    def _step(self, step_input, hidden_, encoder_bf):
        """Run one attention-augmented LSTM step and return ``(hy, cy)``."""
        hx, cx = hidden_
        # The attention weights are not needed by the recurrence.
        h_attn, _ = self.attn_layer(hx, encoder_bf)

        gates = self.lstm_input_vec(torch.cat((step_input, hx, h_attn), 1))
        # Chunk order is part of the learned parameter layout; do not reorder.
        ingate, cellgate, forgetgate, outgate = gates.chunk(4, 1)

        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        outgate = torch.sigmoid(outgate)
        cellgate = torch.tanh(cellgate)

        cy = forgetgate * cx + ingate * cellgate
        hy = outgate * torch.tanh(cy)

        return hy, cy

    def forward(self, input_, hidden_, encoder_hy):
        """Unroll the cell over the whole input sequence.

        Args:
            input_: (batch, seq_len, input_size) if ``batch_first`` else
                (seq_len, batch, input_size).
            hidden_: tuple ``(h0, c0)``, each (batch, hidden_size).
            encoder_hy: encoder states; always transposed on dims 0 and 1
                before attention, so it is expected as
                (src_len, batch, hidden_size) regardless of ``batch_first``.

        Returns:
            output_: hidden state of every step, laid out like ``input_``.
            hidden_: tuple ``(h_n, c_n)`` after the last step.
        """
        if self.batch_first:
            input_ = input_.transpose(0, 1)

        # Loop-invariant: batch-major view of the encoder states.
        encoder_bf = encoder_hy.transpose(0, 1)

        step_outputs = []
        for k in range(input_.size(0)):
            hidden_ = self._step(input_[k], hidden_, encoder_bf)
            step_outputs.append(hidden_[0])

        # (seq_len, batch, hidden_size)
        output_ = torch.stack(step_outputs, 0)

        if self.batch_first:
            output_ = output_.transpose(0, 1)

        return output_, hidden_
    
# Smoke test of LSTMAttentionBahdanau on the GPU.
# rd1: decoder inputs (batch=128, seq_len=16, input_size=100).
rd1 = Variable(torch.FloatTensor(torch.rand([128, 16, 100]))).cuda()
# Initial hidden and cell state, each (batch=128, hidden=50).
hd2 = Variable(torch.FloatTensor(torch.rand([128, 50]))).cuda()
cd2 = Variable(torch.FloatTensor(torch.rand([128, 50]))).cuda()
hid2 = (hd2, cd2)
# Encoder states are passed time-major: (src_len=16, batch=128, hidden=50).
ctx = Variable(torch.rand([16, 128, 50])).cuda()
model = LSTMAttentionBahdanau(
    input_size=100,
    hidden_size=50,
    num_layers=1,
    batch_first=True
).cuda()
# Parenthesized single-argument print works on both Python 2 and Python 3.
print(model)
aa, bb = model(rd1, hid2, ctx)
print(aa.size())
print(bb[0].size())
print(bb[1].size())

LSTMAttentionBahdanau (
  (lstm_input_vec): Linear (200 -> 200)
  (attn_layer): AttentionBahdanau (
  )
)
torch.Size([128, 16, 50])
torch.Size([128, 50])
torch.Size([128, 50])


In [73]:
class seq2seqAttention(torch.nn.Module):
    """LSTM encoder / attention-LSTM decoder sequence-to-sequence model.

    Source and target tokens share a single embedding table. The encoder is a
    (possibly bidirectional) ``torch.nn.LSTM``; the decoder is an
    ``LSTMAttention`` module defined outside this class.

    Args:
        src_emb_dim: embedding size; also the encoder input size.
        trg_emb_dim: decoder input size. Target tokens go through the same
            embedding table, so this presumably has to equal ``src_emb_dim``
            (TODO confirm against ``LSTMAttention``).
        src_hidden_dim: total encoder hidden size; halved per direction when
            ``src_bidirect`` is True.
        trg_hidden_dim: decoder hidden size.
        src_vocab_size: number of rows in the shared embedding table.
        trg_vocab_size: size of the output projection.
        src_pad_token: index used as ``padding_idx`` of the shared embedding.
        trg_pad_token: stored only; the shared table has one padding index.
        src_nlayer: number of encoder layers.
        trg_nlayer: stored only; it is not forwarded to the decoder.
        batch_first: whether tensors are (batch, seq, feature).
        src_bidirect: use a bidirectional encoder.
        batch_size: stored for backward compatibility; ``forward`` derives the
            batch size from its input instead.
        dropout: dropout passed to the encoder LSTM.
        attn_method: forwarded to ``LSTMAttention``.
    """

    def __init__(
        self,
        src_emb_dim=100,
        trg_emb_dim=100,
        src_hidden_dim=50,
        trg_hidden_dim=50,
        src_vocab_size=999,
        trg_vocab_size=999,
        src_pad_token=0,
        trg_pad_token=0,
        src_nlayer=1,
        trg_nlayer=1,
        batch_first=True,
        src_bidirect=True,
        batch_size=128,
        dropout=0.0,
        attn_method='dot'
    ):
        super(seq2seqAttention, self).__init__()
        # parameters
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_pad_token = src_pad_token
        self.trg_pad_token = trg_pad_token
        self.src_nlayer = src_nlayer
        self.trg_nlayer = trg_nlayer
        self.batch_first = batch_first
        self.src_bidirect = src_bidirect
        self.batch_size = batch_size
        self.dropout = dropout
        self.attn_method = attn_method

        self.src_num_directions = 1
        if self.src_bidirect:
            # Split the hidden budget across the two directions so the
            # concatenated encoder state keeps (about) the requested size.
            self.src_hidden_dim = src_hidden_dim // 2
            self.src_num_directions = 2

        # The original code called .cuda() on every sub-module
        # unconditionally, which raises on CPU-only hosts. Keep the same
        # placement when CUDA is present; otherwise leave modules on the CPU.
        use_cuda = torch.cuda.is_available()

        def _place(module):
            return module.cuda() if use_cuda else module

        # source embedding and target embedding
        # the same for summarization.
        self.embedding = _place(torch.nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            # Previously hard-coded to 0, silently ignoring src_pad_token.
            padding_idx=self.src_pad_token
        ))
        # NOTE(review): this re-initialises every row, including the padding
        # row that Embedding zeroed. Confirm whether that is intended.
        torch.nn.init.uniform(self.embedding.weight, -1.0, 1.0)
        # encoder
        self.encoder = _place(torch.nn.LSTM(
            input_size=self.src_emb_dim,
            hidden_size=self.src_hidden_dim,
            num_layers=self.src_nlayer,
            batch_first=self.batch_first,
            dropout=self.dropout,
            bidirectional=self.src_bidirect
        ))
        # decoder
        self.decoder = _place(LSTMAttention(
            input_size=self.trg_emb_dim,
            hidden_size=self.trg_hidden_dim,
            batch_first=self.batch_first,
            attn_method=self.attn_method
        ))
        # encoder to decoder
        self.encoder2decoder = _place(torch.nn.Linear(
            self.src_hidden_dim * self.src_num_directions,
            self.trg_hidden_dim
        ))
        torch.nn.init.constant(self.encoder2decoder.bias, 0.0)
        # decoder to vocab
        self.decoder2vocab = _place(torch.nn.Linear(
            self.trg_hidden_dim,
            self.trg_vocab_size
        ))
        torch.nn.init.constant(self.decoder2vocab.bias, 0.0)

    def forward(self, input_src, input_trg):
        """Encode ``input_src`` and decode ``input_trg`` (teacher forcing).

        Args:
            input_src: source token ids, (batch, src_len) when
                ``batch_first`` is True.
            input_trg: target token ids, (batch, trg_len) when
                ``batch_first`` is True.

        Returns:
            Unnormalised vocabulary scores with the decoder output's first two
            dimensions and ``trg_vocab_size`` as the last dimension.
        """
        src_emb = self.embedding(input_src)
        trg_emb = self.embedding(input_trg)

        # Use the size of the batch actually received. The original built the
        # initial states from the constructor's batch_size, so any batch of a
        # different size (e.g. the last partial one) failed in the encoder.
        batch_size = input_src.size(0) if self.batch_first else input_src.size(1)

        state_shape = (
            self.encoder.num_layers * self.src_num_directions,
            batch_size,
            self.src_hidden_dim
        )
        # Allocate on the same device and dtype as the embeddings instead of
        # forcing CUDA.
        h0_encoder = Variable(
            src_emb.data.new(*state_shape).zero_(), requires_grad=False
        )
        c0_encoder = Variable(
            src_emb.data.new(*state_shape).zero_(), requires_grad=False
        )

        src_h, (src_h_t, src_c_t) = self.encoder(
            src_emb,
            (h0_encoder, c0_encoder)
        )

        if self.src_bidirect:
            # Join the last two entries of the final states (the two
            # directions of the top encoder layer).
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
            c_t = torch.cat((src_c_t[-1], src_c_t[-2]), 1)
        else:
            h_t = src_h_t[-1]
            c_t = src_c_t[-1]

        decoder_h0 = torch.tanh(self.encoder2decoder(h_t))
        # NOTE(review): the cell state is passed through unprojected, so its
        # size (src_hidden_dim * num_directions) presumably has to equal
        # trg_hidden_dim. Verify against LSTMAttention.
        decoder_c0 = c_t

        # NOTE(review): transposed unconditionally; with batch_first=True
        # this hands the decoder (src_len, batch, hidden). Confirm the layout
        # LSTMAttention expects when batch_first is False.
        encoder_hy = src_h.transpose(0, 1)

        trg_h, (_, _) = self.decoder(
            trg_emb,
            (decoder_h0, decoder_c0),
            encoder_hy
        )

        # Flatten the first two dims so one Linear scores every position.
        trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1),
            trg_h.size(2)
        )

        decoder_output = self.decoder2vocab(trg_h_reshape)
        decoder_output = decoder_output.view(
            trg_h.size(0),
            trg_h.size(1),
            decoder_output.size(1)
        )

        return decoder_output

    def decode(self, logits):
        """Turn ``forward`` scores into per-position word probabilities.

        Args:
            logits: 3-D tensor whose last dimension is ``trg_vocab_size``.

        Returns:
            Tensor of the same shape; the last dimension sums to 1.
        """
        logits_reshape = logits.view(-1, self.trg_vocab_size)
        # Normalise over the vocabulary; the dimension is explicit because
        # softmax's implicit dimension choice is deprecated.
        word_probs = F.softmax(logits_reshape, dim=1)
        word_probs = word_probs.view(
            logits.size(0), logits.size(1), logits.size(2)
        )

        return word_probs

# Smoke test of the full seq2seq model on the GPU: a 2-layer bidirectional
# encoder (25 hidden units per direction) and 'general' attention.
model = seq2seqAttention(
    src_emb_dim=100,
    trg_emb_dim=100,
    src_hidden_dim=50,
    trg_hidden_dim=50,
    src_vocab_size=999,
    trg_vocab_size=999,
    src_pad_token=0,
    trg_pad_token=0,
    src_nlayer=2,
    trg_nlayer=1,
    batch_first=True,
    src_bidirect=True,
    batch_size=128,
    dropout=0.0,
    attn_method='general'
).cuda()

# Parenthesized single-argument print works on both Python 2 and Python 3.
print(model)

# Constant dummy batches: 128 source sentences of 16 tokens and 128 target
# sentences of 18 tokens.
sen_in = Variable(torch.LongTensor(128, 16).fill_(10))
sen_out = Variable(torch.LongTensor(128, 18).fill_(9))
out = model(sen_in.cuda(), sen_out.cuda())
# Last expression of the cell: the notebook displays the word probabilities.
model.decode(out)

seq2seqAttention (
  (embedding): Embedding(999, 100, padding_idx=0)
  (encoder): LSTM(100, 25, num_layers=2, batch_first=True, bidirectional=True)
  (decoder): LSTMAttention (
    (lstm_input_vec): Linear (100 -> 200)
    (lstm_hidden_vec): Linear (50 -> 200)
    (attn_layer): Attention (
      (attn_in): Linear (50 -> 50)
      (attn_out): Linear (100 -> 50)
    )
  )
  (encoder2decoder): Linear (50 -> 50)
  (decoder2vocab): Linear (50 -> 999)
)


Variable containing:
( 0 ,.,.) = 
1.00000e-03 *
  0.9695  1.1119  0.9962  ...   0.9795  1.0103  1.0923
  0.9778  1.1319  0.9720  ...   0.9968  1.0123  1.1000
  0.9820  1.1432  0.9588  ...   1.0053  1.0133  1.1026
           ...             ⋱             ...          
  0.9875  1.1601  0.9429  ...   1.0148  1.0160  1.1027
  0.9875  1.1601  0.9429  ...   1.0148  1.0160  1.1027
  0.9875  1.1602  0.9429  ...   1.0148  1.0160  1.1027

( 1 ,.,.) = 
1.00000e-03 *
  0.9695  1.1119  0.9962  ...   0.9795  1.0103  1.0923
  0.9778  1.1319  0.9720  ...   0.9968  1.0123  1.1000
  0.9820  1.1432  0.9588  ...   1.0053  1.0133  1.1026
           ...             ⋱             ...          
  0.9875  1.1601  0.9429  ...   1.0148  1.0160  1.1027
  0.9875  1.1601  0.9429  ...   1.0148  1.0160  1.1027
  0.9875  1.1602  0.9429  ...   1.0148  1.0160  1.1027

( 2 ,.,.) = 
1.00000e-03 *
  0.9695  1.1119  0.9962  ...   0.9795  1.0103  1.0923
  0.9778  1.1319  0.9720  ...   0.9968  1.0123  1.1000
  0.9820  1.1432