In [1]:
import re
import os
import argparse
import shutil
import numpy as np

import torch
import torch.nn.functional as F
from torch.autograd import Variable

from data_utils import *

In [32]:
class AttentionBahdanau(torch.nn.Module):
    """Parameter-free attention over a batch of encoder states.

    Despite the class name, the score used here is the plain dot product
    between the decoder state and every encoder state; no learned weights
    are involved. The scores are normalised with a softmax and used to
    average the encoder states, and the average is squashed with tanh.
    """

    def __init__(self):
        super(AttentionBahdanau, self).__init__()

        # Activation modules only; neither of them owns parameters.
        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()

    def forward(self, last_dehy, enhy):
        """Attend over ``enhy`` using ``last_dehy`` as the query.

        The batched matrix products below treat ``last_dehy`` as a
        (batch, hidden) matrix and ``enhy`` as a (batch, src_len, hidden)
        tensor.

        Returns:
            A pair ``(h_attn, attn)``: the tanh of the attention-weighted
            sum of encoder states, (batch, hidden), and the attention
            weights themselves, (batch, src_len).
        """
        # (batch, src_len, hidden) x (batch, hidden, 1) -> (batch, src_len)
        query = last_dehy.unsqueeze(2)
        scores = torch.bmm(enhy, query).squeeze(2)

        # The input is 2-D, so the softmax runs over the source positions.
        attn = self.softmax_(scores)

        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, hidden)
        weights = attn.unsqueeze(1)
        context = torch.bmm(weights, enhy).squeeze(1)

        return self.tanh_(context), attn

# Smoke test for AttentionBahdanau on the GPU: one decoder state per batch
# element attends over 16 encoder states of the same width.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 100)).cuda()
rd2 = Variable(torch.rand(128, 16, 100)).cuda()
model = AttentionBahdanau().cuda()
print(model)
print(rd1.size())
print(rd2.size())
hh, cc = model(rd1, rd2)
print(hh)
print(cc)

AttentionBahdanau (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
 0.5396  0.6086  0.5796  ...   0.3576  0.6223  0.6567
 0.5873  0.5491  0.4179  ...   0.4173  0.4066  0.5737
 0.2417  0.3823  0.5520  ...   0.5830  0.6006  0.5076
          ...             ⋱             ...          
 0.5049  0.6594  0.5067  ...   0.4272  0.4249  0.4203
 0.3635  0.4875  0.3560  ...   0.4256  0.3953  0.5253
 0.4503  0.2717  0.3971  ...   0.3192  0.5800  0.5259
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.3153  0.0078  0.0300  ...   0.0033  0.1887  0.0077
 0.2826  0.0004  0.0251  ...   0.0910  0.0671  0.1729
 0.1233  0.0360  0.0060  ...   0.0014  0.1746  0.0039
          ...             ⋱             ...          
 0.2122  0.0175  0.0050  ...   0.0494  0.0191  0.0156
 0.0481  0.1652  0.0090  ...   0.0024  0.0776  0.1026
 0.0036  0.0835  0.0589  ...   0.0077  0.0259  0.0058
[torch.cuda.FloatTensor of size 128x

In [33]:
class AttentionLuong(torch.nn.Module):
    """Attention layer with three selectable scoring functions.

    ``attn_method`` (case-insensitive) selects how a decoder state is
    scored against each encoder state:

    * ``'luong_concat'``  - two Linear layers with a tanh in between,
      applied to the concatenation of encoder state and decoder state.
    * ``'luong_general'`` - dot product between the decoder state and a
      learned linear projection of the encoder states.
    * anything else (e.g. ``'luong_dot'``) - plain dot product.

    Args:
        attn_method: name of the scoring function, see above.
        hidden_size: width of both decoder and encoder states.
        bias: whether the Linear layers carry a bias term.
    """

    def __init__(
        self,
        attn_method='luong_dot',
        hidden_size=100,
        bias=False
    ):
        super(AttentionLuong, self).__init__()
        self.method = attn_method.lower()
        self.hidden_size = hidden_size
        self.bias = bias

        self.softmax_ = torch.nn.Softmax()
        self.tanh_ = torch.nn.Tanh()

        if self.method == 'luong_concat':
            # Kept as a two-element Sequential so parameter names
            # (attn_in.0.*, attn_in.1.*) stay the same; the tanh between
            # the two layers is applied in forward().
            self.attn_in = torch.nn.Sequential(
                torch.nn.Linear(
                    self.hidden_size*2,
                    self.hidden_size,
                    bias=self.bias
                ),
                torch.nn.Linear(self.hidden_size, 1, bias=self.bias)
            )
        elif self.method == 'luong_general':
            self.attn_in = torch.nn.Linear(
                self.hidden_size,
                self.hidden_size,
                bias=self.bias
            )

        self.attn_out = torch.nn.Linear(
            self.hidden_size*2,
            self.hidden_size,
            bias=self.bias
        )

        # The layer used to call .cuda() on every sub-module, which raises
        # on machines without a GPU. Moving the whole module only when CUDA
        # is present keeps the GPU placement and lets it run on CPU too.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, dehy, enhy):
        """Attend over ``enhy`` using ``dehy`` as the query.

        The tensor operations below treat ``dehy`` as (batch, hidden) and
        ``enhy`` as (batch, src_len, hidden).

        Returns:
            A pair ``(h_attn, attn)``: ``tanh(attn_out([context; dehy]))``
            of shape (batch, hidden) and the attention weights of shape
            (batch, src_len).
        """
        enhy_new = enhy

        if self.method == 'luong_concat':
            # Pair every encoder state with a copy of the decoder state.
            dehy_rep = dehy.unsqueeze(1).repeat(1, enhy.size(1), 1)
            cat_hy = torch.cat((enhy, dehy_rep), 2)
            # Without the tanh the two Linear layers collapse into one
            # linear map: the decoder-state term is then identical for all
            # source positions and cancels in the softmax, so the weights
            # would not depend on the query at all.
            # NOTE: weights trained with the former purely linear scorer
            # will produce different attention after this change.
            project, score = self.attn_in[0], self.attn_in[1]
            attn = score(self.tanh_(project(cat_hy))).squeeze(2)
        else:
            if self.method == 'luong_general':
                # The projected states are also the ones averaged into the
                # context vector below (behaviour kept as it was).
                enhy_new = self.attn_in(enhy)

            attn = torch.bmm(enhy_new, dehy.unsqueeze(2)).squeeze(2)

        # 2-D input, so the softmax normalises over the source positions.
        attn = self.softmax_(attn)

        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, hidden)
        attn_enhy = torch.bmm(attn.unsqueeze(1), enhy_new).squeeze(1)

        h_attn = self.attn_out(torch.cat((attn_enhy, dehy), 1))
        h_attn = self.tanh_(h_attn)

        return h_attn, attn

# Smoke test for AttentionLuong (default dot-product scoring) on the GPU.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 100)).cuda()
rd2 = Variable(torch.rand(128, 16, 100)).cuda()
model = AttentionLuong(hidden_size=100).cuda()
print(model)
print(rd1.size())
print(rd2.size())
hh, cc = model(rd1, rd2)
print(hh)
print(cc)

AttentionLuong (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (attn_out): Linear (200 -> 100)
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
-0.2801 -0.1555  0.2193  ...   0.0339  0.1737  0.3361
-0.3209  0.0235  0.3452  ...  -0.1899 -0.0698  0.3147
-0.1668 -0.1842  0.3620  ...   0.1577 -0.1013  0.0995
          ...             ⋱             ...          
-0.1189 -0.1584  0.3930  ...   0.0569  0.1496  0.2075
-0.3671 -0.2738  0.3652  ...  -0.1218  0.2377  0.2204
-0.4504 -0.3231  0.4279  ...   0.0941  0.0712  0.4057
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0074  0.0024  0.2132  ...   0.0302  0.0023  0.2947
 0.1575  0.2700  0.1064  ...   0.0180  0.1746  0.0701
 0.0199  0.0035  0.1332  ...   0.0035  0.0104  0.0005
          ...             ⋱             ...          
 0.0140  0.0506  0.0553  ...   0.0109  0.0731  0.1343
 0.1408  0.0004  0.0957  ...   0.0031  0.0050  0.0082
 0.0219  0.0002  0.0007  ...   0.2088  0.0058  0.0074
[torc

In [35]:
class LSTMDecoder(torch.nn.Module):
    """Single-layer LSTM decoder unrolled step by step, with optional attention.

    ``attn_method`` (case-insensitive) selects the per-step recurrence:

    * ``'vanilla'``  - plain LSTM cell, the encoder states are not used.
    * ``'bahdanau'`` - an ``AttentionBahdanau`` context computed from the
      previous hidden state is concatenated to the cell input.
    * anything else  - the value is passed on to ``AttentionLuong``; the
      cell runs first and attention is applied to its new hidden state,
      which then replaces the hidden state carried to the next step.

    Args:
        input_size: width of each input vector.
        hidden_size: width of the hidden and cell states.
        num_layers: stored as ``n_layer`` but not used by this class.
        attn_method: recurrence type, see above.
        batch_first: if True, inputs and outputs are (batch, seq, feature);
            otherwise (seq, batch, feature).
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers=1,
        attn_method='bahdanau',
        batch_first=True
    ):
        super(LSTMDecoder, self).__init__()
        # parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layer = num_layers
        self.batch_first = batch_first
        self.attn_method = attn_method.lower()

        self.softmax_ = torch.nn.Softmax()
        self.tanh_ = torch.nn.Tanh()
        self.sigmoid_ = torch.nn.Sigmoid()

        # One Linear layer produces all four gates at once. Only the
        # Bahdanau variant feeds the attention context into the gates.
        gate_in_size = self.input_size + self.hidden_size
        if self.attn_method == 'bahdanau':
            gate_in_size += self.hidden_size
        self.lstm_input_vec = torch.nn.Linear(
            gate_in_size,
            self.hidden_size*4
        )

        if self.attn_method == 'bahdanau':
            self.attn_layer = AttentionBahdanau()
        elif self.attn_method != 'vanilla':
            self.attn_layer = AttentionLuong(
                attn_method=self.attn_method,
                hidden_size=self.hidden_size
            )

        # Previously every sub-module was pushed to the GPU unconditionally,
        # which raises without CUDA. Keep the GPU placement when a GPU is
        # present and stay on the CPU otherwise.
        if torch.cuda.is_available():
            self.cuda()

    def _lstm_cell(self, gate_input, cx):
        """Apply the gate layer to ``gate_input`` and update the cell state."""
        gates = self.lstm_input_vec(gate_input)
        # Chunk order is (input, cell candidate, forget, output); it must
        # not change or existing weights would be read as the wrong gates.
        ingate, cellgate, forgetgate, outgate = gates.chunk(4, 1)

        ingate = self.sigmoid_(ingate)
        forgetgate = self.sigmoid_(forgetgate)
        outgate = self.sigmoid_(outgate)
        cellgate = self.tanh_(cellgate)

        cy = forgetgate*cx + ingate*cellgate
        hy = outgate*self.tanh_(cy)

        return hy, cy

    def _step(self, step_input, hidden_, memory):
        """Advance the decoder by one time step; returns the new (h, c)."""
        hx, cx = hidden_

        if self.attn_method == 'vanilla':
            return self._lstm_cell(torch.cat((step_input, hx), 1), cx)

        if self.attn_method == 'bahdanau':
            # Attention on the previous hidden state; the context becomes
            # part of the gate input.
            h_attn, _ = self.attn_layer(hx, memory)
            return self._lstm_cell(
                torch.cat((step_input, hx, h_attn), 1), cx
            )

        # Luong variants: run the cell first, then attend with the new
        # hidden state. The attentional vector is what gets emitted and
        # carried forward as the hidden state.
        hy, cy = self._lstm_cell(torch.cat((step_input, hx), 1), cx)
        h_attn, _ = self.attn_layer(hy, memory)
        return h_attn, cy

    def forward(self, input_, hidden_, encoder_hy):
        """Run the decoder over a whole input sequence.

        Args:
            input_: input sequence, laid out according to ``batch_first``.
            hidden_: tuple ``(h0, c0)`` of (batch, hidden_size) states.
            encoder_hy: encoder states with the time axis first; they are
                transposed to batch-major before being passed to the
                attention layer. Not touched when ``attn_method`` is
                ``'vanilla'``.

        Returns:
            ``(output_, hidden_)``: the stacked per-step hidden states in
            the same layout as ``input_``, and the final ``(h, c)`` tuple.
        """
        if self.batch_first:
            input_ = input_.transpose(0, 1)

        # The batch-major view of the encoder states is the same for every
        # step, so it is built once instead of inside the loop.
        memory = None
        if self.attn_method != 'vanilla':
            memory = encoder_hy.transpose(0, 1)

        output_ = []
        for k in range(input_.size(0)):
            hidden_ = self._step(input_[k], hidden_, memory)
            output_.append(hidden_[0])

        # (seq, batch, hidden)
        output_ = torch.stack(output_, 0)

        if self.batch_first:
            output_ = output_.transpose(0, 1)

        return output_, hidden_
    
# Smoke test for LSTMDecoder without attention on the GPU:
# 128 sequences of 16 steps, 100-d inputs, 50-d hidden state.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 16, 100)).cuda()
hd2 = Variable(torch.rand(128, 50)).cuda()
cd2 = Variable(torch.rand(128, 50)).cuda()
hid2 = (hd2, cd2)
# Encoder states are passed time-major: (src_len, batch, hidden).
ctx = Variable(torch.rand(16, 128, 50)).cuda()
model = LSTMDecoder(
    input_size=100,
    hidden_size=50,
    num_layers=1,
    batch_first=True,
    attn_method='vanilla'
).cuda()
print(model)
aa, bb = model(rd1, hid2, ctx)
print(aa.size())
print(bb[0].size())
print(bb[1].size())

LSTMDecoder (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (sigmoid_): Sigmoid ()
  (lstm_input_vec): Linear (150 -> 200)
)
torch.Size([128, 16, 50])
torch.Size([128, 50])
torch.Size([128, 50])


In [41]:
class seq2seqAttention(torch.nn.Module):
    """LSTM encoder / ``LSTMDecoder`` sequence-to-sequence model.

    Source and target tokens share one embedding table. The encoder is a
    ``torch.nn.LSTM``; its final hidden state is projected through
    ``encoder2decoder`` + tanh to initialise the decoder, and the decoder
    outputs are projected to vocabulary logits by ``decoder2vocab``.

    Args:
        src_emb_dim: embedding width (also the encoder input width).
        trg_emb_dim: decoder input width. The shared embedding produces
            ``src_emb_dim``-wide vectors, so the two presumably have to
            be equal - TODO confirm.
        src_hidden_dim: encoder output width; halved per direction when
            ``src_bidirect`` is True.
        trg_hidden_dim: decoder hidden width.
        src_vocab_size: number of rows in the shared embedding.
        trg_vocab_size: number of output logits.
        src_pad_token: index of the padding row of the embedding.
        trg_pad_token: accepted for interface compatibility; unused.
        src_nlayer: number of encoder layers.
        trg_nlayer: stored, but the decoder is always single-layer.
        batch_first: layout of the token tensors, (batch, seq) if True.
        src_bidirect: use a bidirectional encoder.
        batch_size: stored only; ``forward`` uses the actual batch size.
        dropout: dropout passed to the encoder LSTM.
        attn_method: forwarded to ``LSTMDecoder``.
    """

    def __init__(
        self,
        src_emb_dim=100,
        trg_emb_dim=100,
        src_hidden_dim=50,
        trg_hidden_dim=50,
        src_vocab_size=999,
        trg_vocab_size=999,
        src_pad_token=0,
        trg_pad_token=0,
        src_nlayer=1,
        trg_nlayer=1,
        batch_first=True,
        src_bidirect=True,
        batch_size=128,
        dropout=0.0,
        attn_method='vanilla'
    ):
        super(seq2seqAttention, self).__init__()
        # parameters
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_nlayer = src_nlayer
        self.trg_nlayer = trg_nlayer
        self.batch_first = batch_first
        self.src_bidirect = src_bidirect
        self.batch_size = batch_size
        self.dropout = dropout
        self.attn_method = attn_method

        self.src_num_directions = 1
        if self.src_bidirect:
            # Each direction gets half the width so that the concatenated
            # encoder output is src_hidden_dim wide again.
            self.src_hidden_dim = src_hidden_dim // 2
            self.src_num_directions = 2

        # source embedding and target embedding
        # the same for summarization.
        self.embedding = torch.nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            # was hard-coded to 0, silently ignoring src_pad_token
            padding_idx=src_pad_token
        )
        # In-place init on .data is available in every PyTorch release,
        # unlike the deprecated torch.nn.init.uniform / constant.
        # NOTE(review): this also overwrites the padding row with random
        # values, exactly as the former init call did.
        self.embedding.weight.data.uniform_(-1.0, 1.0)
        # encoder
        self.encoder = torch.nn.LSTM(
            input_size=self.src_emb_dim,
            hidden_size=self.src_hidden_dim,
            num_layers=self.src_nlayer,
            batch_first=self.batch_first,
            dropout=self.dropout,
            bidirectional=self.src_bidirect
        )
        # decoder
        self.decoder = LSTMDecoder(
            input_size=self.trg_emb_dim,
            hidden_size=self.trg_hidden_dim,
            batch_first=self.batch_first,
            attn_method=self.attn_method
        )
        # encoder to decoder
        self.encoder2decoder = torch.nn.Linear(
            self.src_hidden_dim*self.src_num_directions,
            self.trg_hidden_dim
        )
        self.encoder2decoder.bias.data.fill_(0.0)
        # decoder to vocab
        self.decoder2vocab = torch.nn.Linear(
            self.trg_hidden_dim,
            self.trg_vocab_size
        )
        self.decoder2vocab.bias.data.fill_(0.0)

        # The constructor used to call .cuda() on each sub-module, which
        # raises without a GPU. Keep the GPU placement when CUDA exists.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input_src, input_trg):
        """Encode ``input_src`` and decode ``input_trg`` (teacher forcing).

        Both arguments are integer token tensors laid out according to
        ``batch_first``.

        Returns:
            Unnormalised vocabulary logits with the same leading two
            dimensions as the decoder output and ``trg_vocab_size`` as the
            last one.
        """
        src_emb = self.embedding(input_src)
        trg_emb = self.embedding(input_trg)

        # Use the real batch size of this call; the stored self.batch_size
        # broke any batch of a different size (e.g. a short last batch).
        if self.batch_first:
            batch_size = input_src.size(0)
        else:
            batch_size = input_src.size(1)

        # Zero initial states created on the same device / with the same
        # type as the embeddings instead of being forced onto CUDA.
        state_shape = (
            self.encoder.num_layers*self.src_num_directions,
            batch_size,
            self.src_hidden_dim
        )
        h0_encoder = Variable(
            src_emb.data.new(*state_shape).zero_(),
            requires_grad=False
        )
        c0_encoder = Variable(
            src_emb.data.new(*state_shape).zero_(),
            requires_grad=False
        )

        src_h, (src_h_t, src_c_t) = self.encoder(
            src_emb,
            (h0_encoder, c0_encoder)
        )

        if self.src_bidirect:
            # Last two entries are the two directions of the top layer.
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
            c_t = torch.cat((src_c_t[-1], src_c_t[-2]), 1)
        else:
            h_t = src_h_t[-1]
            c_t = src_c_t[-1]

        decoder_h0 = self.encoder2decoder(h_t)
        decoder_h0 = F.tanh(decoder_h0)
        # NOTE(review): the cell state is handed over without a projection,
        # so the encoder output width presumably has to equal
        # trg_hidden_dim - confirm before using different sizes.
        decoder_c0 = c_t

        # The decoder expects time-major encoder states. Only a batch-major
        # encoder output needs the transpose; doing it unconditionally
        # handed batch-major memory to the decoder when batch_first=False.
        if self.batch_first:
            encoder_hy = src_h.transpose(0, 1)
        else:
            encoder_hy = src_h

        trg_h, (_, _) = self.decoder(
            trg_emb,
            (decoder_h0, decoder_c0),
            encoder_hy
        )

        # Flatten the two leading dims for the Linear layer, then restore.
        trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1),
            trg_h.size(2)
        )

        decoder_output = self.decoder2vocab(trg_h_reshape)
        decoder_output = decoder_output.view(
            trg_h.size(0),
            trg_h.size(1),
            decoder_output.size(1)
        )

        return decoder_output

    def decode(self, logits):
        """Turn the logits returned by ``forward`` into probabilities.

        The softmax is taken over the vocabulary (last) dimension; the
        result has the same shape as ``logits``.
        """
        logits_reshape = logits.view(-1, self.trg_vocab_size)
        # 2-D input, so the softmax normalises each row (one per token).
        word_probs = F.softmax(logits_reshape)
        word_probs = word_probs.view(
            logits.size(0), logits.size(1), logits.size(2)
        )

        return word_probs

# Smoke test for the full model on the GPU: 2-layer bidirectional encoder,
# attention-free decoder, constant token ids as dummy input.
model = seq2seqAttention(
    src_emb_dim=100,
    trg_emb_dim=100,
    src_hidden_dim=50,
    trg_hidden_dim=50,
    src_vocab_size=999,
    trg_vocab_size=999,
    src_pad_token=0,
    trg_pad_token=0,
    src_nlayer=2,
    trg_nlayer=1,
    batch_first=True,
    src_bidirect=True,
    batch_size=128,
    dropout=0.0,
    attn_method='vanilla'
).cuda()

# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)

sen_in = Variable(torch.LongTensor(128, 16).fill_(10))
sen_out = Variable(torch.LongTensor(128, 18).fill_(9))
out = model(sen_in.cuda(), sen_out.cuda())
# Last expression of the cell: displays the per-token word probabilities.
model.decode(out)

seq2seqAttention (
  (embedding): Embedding(999, 100, padding_idx=0)
  (encoder): LSTM(100, 25, num_layers=2, batch_first=True, bidirectional=True)
  (decoder): LSTMDecoder (
    (softmax_): Softmax ()
    (tanh_): Tanh ()
    (sigmoid_): Sigmoid ()
    (lstm_input_vec): Linear (150 -> 200)
  )
  (encoder2decoder): Linear (50 -> 50)
  (decoder2vocab): Linear (50 -> 999)
)


Variable containing:
( 0 ,.,.) = 
1.00000e-03 *
  0.9973  1.0295  1.0475  ...   0.9865  1.0785  0.9349
  0.9830  1.0463  1.0307  ...   0.9754  1.0833  0.9563
  0.9719  1.0602  1.0218  ...   0.9699  1.0833  0.9657
           ...             ⋱             ...          
  0.9546  1.0872  1.0063  ...   0.9624  1.0809  0.9713
  0.9546  1.0873  1.0062  ...   0.9624  1.0809  0.9713
  0.9546  1.0873  1.0062  ...   0.9624  1.0809  0.9713

( 1 ,.,.) = 
1.00000e-03 *
  0.9973  1.0295  1.0475  ...   0.9865  1.0785  0.9349
  0.9830  1.0463  1.0307  ...   0.9754  1.0833  0.9563
  0.9719  1.0602  1.0218  ...   0.9699  1.0833  0.9657
           ...             ⋱             ...          
  0.9546  1.0872  1.0063  ...   0.9624  1.0809  0.9713
  0.9546  1.0873  1.0062  ...   0.9624  1.0809  0.9713
  0.9546  1.0873  1.0062  ...   0.9624  1.0809  0.9713

( 2 ,.,.) = 
1.00000e-03 *
  0.9973  1.0295  1.0475  ...   0.9865  1.0785  0.9349
  0.9830  1.0463  1.0307  ...   0.9754  1.0833  0.9563
  0.9719  1.0602

In [3]:
# Shape check for torch.nn.LSTM with batch_first=True: the output is
# batch-major, while the returned states stay (num_layers, batch, hidden).
rnn = torch.nn.LSTM(9, 5, num_layers=3, bidirectional=False, batch_first=True)

# NOTE: `input` shadows the Python builtin; the name is kept because other
# cells may refer to it.
input = Variable(torch.randn(128, 3, 9))
h0 = Variable(torch.randn(3, 128, 5))
c0 = Variable(torch.randn(3, 128, 5))
output, hn = rnn(input, (h0, c0))
hh, cc = hn
# print(...) with a single argument behaves the same on Python 2 and 3.
print(hh.size())
print(cc.size())
print(hh[-1].size())
print(cc[-1].size())
print(output.size())

torch.Size([3, 128, 5])
torch.Size([3, 128, 5])
torch.Size([128, 5])
torch.Size([128, 5])
torch.Size([128, 3, 5])
