In [2]:
import re
import os
import argparse
import shutil
import numpy as np

import torch
import torch.nn.functional as F
from torch.autograd import Variable

from data_utils import *

In [3]:
class AttentionBahdanau(torch.nn.Module):
    """Parameter-free attention over a batch of encoder hidden states.

    The alignment score of each source position is the dot product between
    the decoder state and the encoder state at that position. Scores are
    normalised with a softmax and used to build a weighted sum of the
    encoder states, which is finally squashed with tanh.
    """

    def __init__(self):
        super(AttentionBahdanau, self).__init__()

        # Neither activation owns parameters; they are kept as sub-modules so
        # they appear in the printed model summary.
        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()

    def forward(self, last_dehy, enhy):
        """Attend over ``enhy`` using ``last_dehy`` as the query.

        Args:
            last_dehy: decoder hidden state; the batched matmul below needs
                it to be 2-D, (batch, hidden).
            enhy: encoder hidden states, 3-D (batch, src_len, hidden).

        Returns:
            A pair ``(h_attn, attn)``: the tanh-squashed context vector of
            shape (batch, hidden) and the attention weights of shape
            (batch, src_len).
        """
        # (batch, src_len, hidden) x (batch, hidden, 1) -> (batch, src_len)
        scores = torch.bmm(enhy, last_dehy.unsqueeze(2)).squeeze(2)
        attn = self.softmax_(scores)

        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, hidden)
        context = torch.bmm(attn.unsqueeze(1), enhy).squeeze(1)

        return self.tanh_(context), attn

# Smoke test: push random tensors through AttentionBahdanau on the GPU.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 100)).cuda()      # decoder state: (batch, hidden)
rd2 = Variable(torch.rand(128, 16, 100)).cuda()  # encoder states: (batch, src_len, hidden)
model = AttentionBahdanau().cuda()
print(model)
print(rd1.size())
print(rd2.size())
hh, cc = model(rd1, rd2)  # hh: context vector, cc: attention weights
print(hh)
print(cc)

AttentionBahdanau (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
 0.3733  0.3695  0.4731  ...   0.4381  0.4635  0.4432
 0.5780  0.4864  0.4882  ...   0.5374  0.3941  0.4720
 0.3845  0.5741  0.3348  ...   0.5652  0.4516  0.6332
          ...             ⋱             ...          
 0.6412  0.3669  0.5900  ...   0.5816  0.6887  0.4371
 0.3454  0.5851  0.5945  ...   0.5130  0.5433  0.5959
 0.4553  0.5663  0.6107  ...   0.6211  0.5329  0.5089
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0722  0.0440  0.0019  ...   0.0197  0.1515  0.0038
 0.1378  0.0390  0.1063  ...   0.2416  0.0713  0.0255
 0.0283  0.0121  0.0463  ...   0.0022  0.0164  0.0019
          ...             ⋱             ...          
 0.1872  0.0138  0.0084  ...   0.0043  0.0227  0.0002
 0.0409  0.0240  0.0231  ...   0.5090  0.0159  0.0031
 0.2112  0.0165  0.1647  ...   0.0032  0.2381  0.0456
[torch.cuda.FloatTensor of size 128x

In [4]:
class AttentionLuong(torch.nn.Module):
    """Luong-style global attention with selectable score function.

    Args:
        attn_method: score function name (case-insensitive).
            ``'luong_concat'`` scores with ``v^T tanh(W [h_s; h_t])``,
            ``'luong_general'`` projects the encoder states with a linear
            layer before the dot product, and any other value falls back to
            the plain dot product.
        hidden_size: width of both decoder and encoder hidden states.
        bias: whether the linear layers carry a bias term.
    """

    def __init__(
        self,
        attn_method='luong_dot',
        hidden_size=100,
        bias=False
    ):
        super(AttentionLuong, self).__init__()
        self.method = attn_method.lower()
        self.hidden_size = hidden_size
        self.bias = bias

        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()

        if self.method == 'luong_concat':
            # Layer 0 is W, layer 1 is v. The tanh between them is applied in
            # forward() rather than inserted here so that the Sequential
            # indices (and therefore state_dict keys) stay unchanged.
            self.attn_in = torch.nn.Sequential(
                torch.nn.Linear(
                    self.hidden_size*2,
                    self.hidden_size,
                    bias=self.bias
                ),
                torch.nn.Linear(self.hidden_size, 1, bias=self.bias)
            ).cuda()
        else:
            if self.method == 'luong_general':
                self.attn_in = torch.nn.Linear(
                    self.hidden_size,
                    self.hidden_size,
                    bias=self.bias
                ).cuda()

        # Combines [context; decoder state] into the attentional hidden state.
        self.attn_out = torch.nn.Linear(
            self.hidden_size*2,
            self.hidden_size,
            bias=self.bias
        ).cuda()

    def forward(self, dehy, enhy):
        """Compute the attentional hidden state for one decoding step.

        Args:
            dehy: decoder hidden state, (batch, hidden).
            enhy: encoder hidden states, (batch, src_len, hidden).

        Returns:
            A pair ``(h_attn, attn)``: the attentional hidden state of shape
            (batch, hidden) and the attention weights of shape
            (batch, src_len).
        """
        dehy_new = dehy.unsqueeze(2)
        enhy_new = enhy

        if self.method == 'luong_concat':
            # Tile the decoder state along the source axis and concatenate.
            dehy_rep = dehy.unsqueeze(1).repeat(1, enhy.size(1), 1)
            cat_hy = torch.cat((enhy, dehy_rep), 2)
            # Without the tanh the two linear layers collapse into one and
            # the decoder contribution is the same for every source position,
            # so it cancels in the softmax and the weights would ignore dehy.
            proj = self.tanh_(self.attn_in[0](cat_hy))
            attn = self.attn_in[1](proj).squeeze(2)
        else:
            if self.method == 'luong_general':
                enhy_new = self.attn_in(enhy)

            attn = torch.bmm(enhy_new, dehy_new).squeeze(2)

        attn = self.softmax_(attn)
        attn2 = attn.view(attn.size(0), 1, attn.size(1))

        # NOTE(review): for 'luong_general' the context is built from the
        # projected encoder states (enhy_new), not the raw ones; kept as is
        # because trained weights depend on it.
        attn_enhy = torch.bmm(attn2, enhy_new).squeeze(1)

        h_attn = self.attn_out(torch.cat((attn_enhy, dehy), 1))
        h_attn = self.tanh_(h_attn)

        return h_attn, attn

# Smoke test: default (dot-product) AttentionLuong on random GPU tensors.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 100)).cuda()      # decoder state: (batch, hidden)
rd2 = Variable(torch.rand(128, 16, 100)).cuda()  # encoder states: (batch, src_len, hidden)
model = AttentionLuong(hidden_size=100).cuda()
print(model)
print(rd1.size())
print(rd2.size())
hh, cc = model(rd1, rd2)  # hh: attentional hidden state, cc: attention weights
print(hh)
print(cc)

AttentionLuong (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (attn_out): Linear (200 -> 100)
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
-2.9063e-01 -3.6004e-01 -4.0996e-02  ...  -7.4396e-02 -1.0122e-02 -3.3847e-01
 2.1977e-02 -9.3524e-02 -2.6729e-02  ...   9.6436e-02  9.9872e-02 -8.2764e-02
-1.4010e-01 -3.0645e-01 -8.5560e-02  ...  -1.3069e-01  6.2832e-02 -1.6895e-01
                ...                   ⋱                   ...                
-2.4782e-01 -1.3179e-01  1.6869e-01  ...  -9.0429e-02 -2.9188e-01 -4.1394e-01
-1.2256e-01 -2.4674e-02  5.0530e-02  ...   1.6062e-01 -2.1412e-01 -2.5449e-01
-1.2608e-03 -1.1116e-01 -1.6010e-01  ...  -1.1290e-02 -2.4701e-02 -2.1226e-01
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0077  0.0139  0.0888  ...   0.0784  0.0123  0.0188
 0.0308  0.0108  0.1964  ...   0.0202  0.0120  0.0167
 0.0686  0.0098  0.1259  ...   0.0134  0.0525  0.0387
          ...             ⋱             ...          

In [16]:
class LSTMDecoder(torch.nn.Module):
    """Step-wise LSTM decoder with optional attention over encoder states.

    Args:
        input_size: width of each input (embedding) vector.
        hidden_size: width of the LSTM hidden and cell states.
        num_layers: stored as ``n_layer``; the decoder itself is a single
            ``LSTMCell``.
        attn_method: ``'vanilla'`` (no attention), ``'bahdanau'``, or any
            other string, which is handed to ``AttentionLuong``.
            Case-insensitive.
        batch_first: if true, inputs and outputs are (batch, seq, feature);
            otherwise (seq, batch, feature).
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers=1,
        attn_method='bahdanau',
        batch_first=True
    ):
        super(LSTMDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layer = num_layers
        self.batch_first = batch_first
        self.attn_method = attn_method.lower()

        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()
        self.sigmoid_ = torch.nn.Sigmoid().cuda()

        # With attention, the context vector is concatenated to every input,
        # so the cell sees input_size + hidden_size features.
        with_attention = self.attn_method != 'vanilla'
        cell_in = self.input_size
        if with_attention:
            cell_in = self.input_size + self.hidden_size
        self.lstm_ = torch.nn.LSTMCell(cell_in, self.hidden_size)

        if self.attn_method == 'bahdanau':
            self.attn_layer = AttentionBahdanau().cuda()
        elif with_attention:
            self.attn_layer = AttentionLuong(
                attn_method=self.attn_method,
                hidden_size=self.hidden_size
            ).cuda()

    def forward(self, input_, hidden_, encoder_hy):
        """Decode a whole target sequence.

        Args:
            input_: target-side inputs, laid out according to ``batch_first``.
            hidden_: initial ``(h, c)`` pair, each (batch, hidden_size).
            encoder_hy: encoder states as (src_len, batch, hidden_size); it is
                transposed to batch-major before being given to the attention
                layer. Not touched when ``attn_method`` is ``'vanilla'``.

        Returns:
            ``(output_, hidden_)``: the per-step hidden states, laid out
            according to ``batch_first``, and the final ``(h, c)`` pair.
        """
        steps = input_.transpose(0, 1) if self.batch_first else input_
        n_steps = steps.size(0)
        mode = self.attn_method

        if mode != 'vanilla' and mode != 'bahdanau':
            # Luong-style input feeding starts from an all-zero context.
            h_attn = Variable(
                torch.FloatTensor(torch.zeros(steps.size(1), self.hidden_size))
            ).cuda()

        collected = []
        for t in range(n_steps):
            if mode == 'vanilla':
                hidden_ = self.lstm_(steps[t], hidden_)
            elif mode == 'bahdanau':
                # Attend with the previous hidden state, then step the cell.
                h_attn, attn = self.attn_layer(
                    hidden_[0], encoder_hy.transpose(0, 1)
                )
                hidden_ = self.lstm_(torch.cat((steps[t], h_attn), 1), hidden_)
            else:
                # Step the cell with the previous context, then attend with
                # the new hidden state for use at the next step.
                hidden_ = self.lstm_(torch.cat((steps[t], h_attn), 1), hidden_)
                h_attn, attn = self.attn_layer(
                    hidden_[0], encoder_hy.transpose(0, 1)
                )
            collected.append(hidden_[0])

        n_batch, n_hidden = collected[0].size()
        output_ = torch.cat(collected, 0).view(n_steps, n_batch, n_hidden)

        if self.batch_first:
            output_ = output_.transpose(0, 1)

        return output_, hidden_
    
# Smoke test: LSTMDecoder with Luong "concat" attention on random GPU tensors.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 16, 100)).cuda()  # target inputs: (batch, trg_len, emb)
hd2 = Variable(torch.rand(128, 50)).cuda()       # initial hidden state
cd2 = Variable(torch.rand(128, 50)).cuda()       # initial cell state
hid2 = (hd2, cd2)
ctx = Variable(torch.rand([16, 128, 50])).cuda()  # encoder states: (src_len, batch, hidden)
model = LSTMDecoder(
    input_size=100,
    hidden_size=50,
    num_layers=1,
    batch_first=True,
    attn_method='Luong_concat'
).cuda()
print(model)
aa, bb = model(rd1, hid2, ctx)
print(aa.size())
# bb is the final (h, c) tuple, so it has no .size() of its own.
# print(bb[0].size())

LSTMDecoder (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (sigmoid_): Sigmoid ()
  (lstm_): LSTMCell(150, 50)
  (attn_layer): AttentionLuong (
    (softmax_): Softmax ()
    (tanh_): Tanh ()
    (attn_in): Sequential (
      (0): Linear (100 -> 50)
      (1): Linear (50 -> 1)
    )
    (attn_out): Linear (100 -> 50)
  )
)
torch.Size([128, 16, 50])


In [21]:
class GRUDecoder(torch.nn.Module):
    """Step-wise GRU decoder with optional attention over encoder states.

    Args:
        input_size: width of each input (embedding) vector.
        hidden_size: width of the GRU hidden state.
        num_layers: stored as ``n_layer``; the decoder itself is a single
            ``GRUCell``.
        attn_method: ``'vanilla'`` (no attention), ``'bahdanau'``, or any
            other string, which is handed to ``AttentionLuong``.
            Case-insensitive.
        batch_first: if true, inputs and outputs are (batch, seq, feature);
            otherwise (seq, batch, feature).
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers=1,
        attn_method='bahdanau',
        batch_first=True
    ):
        super(GRUDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layer = num_layers
        self.batch_first = batch_first
        self.attn_method = attn_method.lower()

        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()
        self.sigmoid_ = torch.nn.Sigmoid().cuda()

        # With attention, the context vector is concatenated to every input,
        # so the cell sees input_size + hidden_size features.
        with_attention = self.attn_method != 'vanilla'
        cell_in = self.input_size
        if with_attention:
            cell_in = self.input_size + self.hidden_size
        self.gru_ = torch.nn.GRUCell(cell_in, self.hidden_size)

        if self.attn_method == 'bahdanau':
            self.attn_layer = AttentionBahdanau().cuda()
        elif with_attention:
            self.attn_layer = AttentionLuong(
                attn_method=self.attn_method,
                hidden_size=self.hidden_size
            ).cuda()

    def forward(self, input_, hidden_, encoder_hy):
        """Decode a whole target sequence.

        Args:
            input_: target-side inputs, laid out according to ``batch_first``.
            hidden_: initial hidden state, (batch, hidden_size).
            encoder_hy: encoder states as (src_len, batch, hidden_size); it is
                transposed to batch-major before being given to the attention
                layer. Not touched when ``attn_method`` is ``'vanilla'``.

        Returns:
            ``(output_, hidden_)``: the per-step hidden states, laid out
            according to ``batch_first``, and the final hidden state.
        """
        steps = input_.transpose(0, 1) if self.batch_first else input_
        n_steps = steps.size(0)
        mode = self.attn_method

        if mode != 'vanilla' and mode != 'bahdanau':
            # Luong-style input feeding starts from an all-zero context.
            h_attn = Variable(
                torch.FloatTensor(torch.zeros(steps.size(1), self.hidden_size))
            ).cuda()

        collected = []
        for t in range(n_steps):
            if mode == 'vanilla':
                hidden_ = self.gru_(steps[t], hidden_)
            elif mode == 'bahdanau':
                # Attend with the previous hidden state, then step the cell.
                h_attn, attn = self.attn_layer(
                    hidden_, encoder_hy.transpose(0, 1)
                )
                hidden_ = self.gru_(torch.cat((steps[t], h_attn), 1), hidden_)
            else:
                # Step the cell with the previous context, then attend with
                # the new hidden state for use at the next step.
                hidden_ = self.gru_(torch.cat((steps[t], h_attn), 1), hidden_)
                h_attn, attn = self.attn_layer(
                    hidden_, encoder_hy.transpose(0, 1)
                )
            collected.append(hidden_)

        n_batch, n_hidden = collected[0].size()
        output_ = torch.cat(collected, 0).view(n_steps, n_batch, n_hidden)

        if self.batch_first:
            output_ = output_.transpose(0, 1)

        return output_, hidden_
    
# Smoke test: GRUDecoder on random GPU tensors. 'Luong' is not one of the
# named score functions, so AttentionLuong uses its dot-product fallback.
# print(...) with a single argument behaves the same on Python 2 and 3.
rd1 = Variable(torch.rand(128, 16, 100)).cuda()  # target inputs: (batch, trg_len, emb)
hd2 = Variable(torch.rand(128, 50)).cuda()       # initial hidden state
# cd2 / hid2 are not consumed by the GRU call below; they are kept so the
# notebook's global names stay the same as before.
cd2 = Variable(torch.rand(128, 50)).cuda()
hid2 = (hd2, cd2)
ctx = Variable(torch.rand([16, 128, 50])).cuda()  # encoder states: (src_len, batch, hidden)
model = GRUDecoder(
    input_size=100,
    hidden_size=50,
    num_layers=1,
    batch_first=True,
    attn_method='Luong'
).cuda()
print(model)
aa, bb = model(rd1, hd2, ctx)
print(aa.size())
# print(bb.size())

GRUDecoder (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (sigmoid_): Sigmoid ()
  (gru_): GRUCell(150, 50)
  (attn_layer): AttentionLuong (
    (softmax_): Softmax ()
    (tanh_): Tanh ()
    (attn_out): Linear (100 -> 50)
  )
)
torch.Size([128, 16, 50])


In [34]:
class Seq2Seq(torch.nn.Module):
    """Encoder-decoder model with a shared source/target embedding.

    The encoder is a (possibly bidirectional, multi-layer) ``torch.nn.LSTM``
    or ``torch.nn.GRU``; the decoder is the matching ``LSTMDecoder`` or
    ``GRUDecoder`` defined in this notebook.

    Args:
        src_emb_dim: embedding width used by the shared embedding table.
        trg_emb_dim: input width expected by the decoder; since the embedding
            is shared it has to equal ``src_emb_dim``.
        src_hidden_dim: total encoder hidden width (halved per direction when
            ``src_bidirect`` is true).
        trg_hidden_dim: decoder hidden width.
        src_vocab_size: number of rows of the shared embedding table.
        trg_vocab_size: width of the output projection.
        src_pad_token: index used as ``padding_idx`` of the shared embedding.
        trg_pad_token: stored for reference; the embedding is shared, so only
            ``src_pad_token`` is applied.
        src_nlayer: number of encoder layers.
        trg_nlayer: stored only; the decoders are single-cell.
        batch_first: layout of inputs and outputs.
        src_bidirect: whether the encoder is bidirectional.
        batch_size: stored for reference; ``forward`` sizes the initial
            states from the actual input.
        dropout: dropout between encoder layers.
        attn_method: forwarded to the decoder.
        network_: ``'lstm'`` selects the LSTM pair, anything else the GRU
            pair. Case-insensitive.

    Note:
        For ``network_='lstm'`` the encoder cell state is handed to the
        decoder unprojected, so the encoder's total hidden width has to match
        ``trg_hidden_dim``.
    """

    def __init__(
        self,
        src_emb_dim=100,
        trg_emb_dim=100,
        src_hidden_dim=50,
        trg_hidden_dim=50,
        src_vocab_size=999,
        trg_vocab_size=999,
        src_pad_token=0,
        trg_pad_token=0,
        src_nlayer=1,
        trg_nlayer=1,
        batch_first=True,
        src_bidirect=True,
        batch_size=128,
        dropout=0.0,
        attn_method='vanilla',
        network_='gru'
    ):
        super(Seq2Seq, self).__init__()
        # parameters
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_pad_token = src_pad_token
        self.trg_pad_token = trg_pad_token
        self.src_nlayer = src_nlayer
        self.trg_nlayer = trg_nlayer
        self.batch_first = batch_first
        self.src_bidirect = src_bidirect
        self.batch_size = batch_size
        self.dropout = dropout
        self.attn_method = attn_method
        self.network_ = network_.lower()

        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()
        self.sigmoid_ = torch.nn.Sigmoid().cuda()

        # A bidirectional encoder splits the requested width between the two
        # directions so the concatenated state keeps the requested size.
        self.src_num_directions = 1
        if self.src_bidirect:
            self.src_hidden_dim = src_hidden_dim // 2
            self.src_num_directions = 2

        # source embedding and target embedding
        # the same for summarization.
        self.embedding = torch.nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            padding_idx=src_pad_token  # was hard-coded to 0, ignoring the argument
        ).cuda()
        # NOTE(review): the uniform init also overwrites the padding row, so
        # the pad embedding is not a zero vector (its gradient still is).
        torch.nn.init.uniform(self.embedding.weight, -1.0, 1.0)
        # choose network
        if self.network_ == 'lstm':
            # encoder
            self.encoder = torch.nn.LSTM(
                input_size=self.src_emb_dim,
                hidden_size=self.src_hidden_dim,
                num_layers=self.src_nlayer,
                batch_first=self.batch_first,
                dropout=self.dropout,
                bidirectional=self.src_bidirect
            ).cuda()
            # decoder
            self.decoder = LSTMDecoder(
                input_size=self.trg_emb_dim,
                hidden_size=self.trg_hidden_dim,
                batch_first=self.batch_first,
                attn_method=self.attn_method
            ).cuda()
        else:
            # encoder
            self.encoder = torch.nn.GRU(
                input_size=self.src_emb_dim,
                hidden_size=self.src_hidden_dim,
                num_layers=self.src_nlayer,
                batch_first=self.batch_first,
                dropout=self.dropout,
                bidirectional=self.src_bidirect
            ).cuda()
            # decoder
            self.decoder = GRUDecoder(
                input_size=self.trg_emb_dim,
                hidden_size=self.trg_hidden_dim,
                batch_first=self.batch_first,
                attn_method=self.attn_method
            ).cuda()

        # encoder to decoder
        self.encoder2decoder = torch.nn.Linear(
            self.src_hidden_dim*self.src_num_directions,
            self.trg_hidden_dim
        ).cuda()
        torch.nn.init.constant(self.encoder2decoder.bias, 0.0)
        # decoder to vocab
        self.decoder2vocab = torch.nn.Linear(
            self.trg_hidden_dim,
            self.trg_vocab_size
        ).cuda()
        torch.nn.init.constant(self.decoder2vocab.bias, 0.0)

    def forward(self, input_src, input_trg):
        """Run teacher-forced encoding and decoding.

        Args:
            input_src: source token indices, laid out per ``batch_first``.
            input_trg: target token indices, laid out per ``batch_first``.

        Returns:
            Unnormalised vocabulary scores with the same two leading
            dimensions as the decoder output and ``trg_vocab_size`` as the
            last dimension.
        """
        src_emb = self.embedding(input_src)
        trg_emb = self.embedding(input_trg)

        # Size the initial states from the actual input so that batches whose
        # size differs from the constructor's batch_size (e.g. the last,
        # partial batch) work as well.
        batch_size = input_src.size(1)
        if self.batch_first:
            batch_size = input_src.size(0)

        h0_encoder = Variable(torch.zeros(
            self.encoder.num_layers*self.src_num_directions,
            batch_size,
            self.src_hidden_dim
        ), requires_grad=False).cuda()

        if self.network_ == 'lstm':
            c0_encoder = Variable(torch.zeros(
                self.encoder.num_layers*self.src_num_directions,
                batch_size,
                self.src_hidden_dim
            ), requires_grad=False).cuda()

            src_h, (src_h_t, src_c_t) = self.encoder(
                src_emb,
                (h0_encoder, c0_encoder)
            )

            # The last two entries are the two directions of the top layer.
            if self.src_bidirect:
                h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
                c_t = torch.cat((src_c_t[-1], src_c_t[-2]), 1)
            else:
                h_t = src_h_t[-1]
                c_t = src_c_t[-1]

            decoder_h0 = self.encoder2decoder(h_t)
            decoder_h0 = self.tanh_(decoder_h0)
            decoder_c0 = c_t
            decoder_state = (decoder_h0, decoder_c0)
        else:
            src_h, src_h_t = self.encoder(
                src_emb,
                h0_encoder
            )

            if self.src_bidirect:
                h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
            else:
                h_t = src_h_t[-1]

            decoder_h0 = self.encoder2decoder(h_t)
            decoder_state = self.tanh_(decoder_h0)

        # The decoders expect encoder states as (src_len, batch, hidden).
        # The encoder output only needs transposing when it is batch-major.
        if self.batch_first:
            encoder_hy = src_h.transpose(0, 1)
        else:
            encoder_hy = src_h

        trg_h, _ = self.decoder(
            trg_emb,
            decoder_state,
            encoder_hy
        )

        # Flatten the two leading dims, project to the vocabulary, restore.
        trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1),
            trg_h.size(2)
        )

        decoder_output = self.decoder2vocab(trg_h_reshape)
        decoder_output = decoder_output.view(
            trg_h.size(0),
            trg_h.size(1),
            decoder_output.size(1)
        )

        return decoder_output

    def decode(self, logits):
        """Turn vocabulary scores into probabilities.

        Args:
            logits: 3-D tensor whose last dimension is ``trg_vocab_size``.

        Returns:
            Tensor of the same shape with a softmax applied over the
            vocabulary dimension.
        """
        # Softmax is applied on a 2-D view so it normalises over the vocab.
        logits_reshape = logits.view(-1, self.trg_vocab_size)
        word_probs = F.softmax(logits_reshape)
        word_probs = word_probs.view(
            logits.size(0), logits.size(1), logits.size(2)
        )

        return word_probs

# Smoke test: two-layer bidirectional GRU encoder with a vanilla GRU decoder.
model = Seq2Seq(
    src_emb_dim=100,
    trg_emb_dim=100,
    src_hidden_dim=50,
    trg_hidden_dim=50,
    src_vocab_size=999,
    trg_vocab_size=999,
    src_pad_token=0,
    trg_pad_token=0,
    src_nlayer=2,
    trg_nlayer=1,
    batch_first=True,
    src_bidirect=True,
    batch_size=128,
    dropout=0.0,
    attn_method='vanilla',
    network_='gru'
).cuda()

# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)

# Constant token ids are enough to exercise the shapes end to end.
sen_in = Variable(torch.LongTensor(128, 16).fill_(10))   # (batch, src_len)
sen_out = Variable(torch.LongTensor(128, 18).fill_(9))   # (batch, trg_len)
out = model(sen_in.cuda(), sen_out.cuda())
model.decode(out)

Seq2Seq (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (sigmoid_): Sigmoid ()
  (embedding): Embedding(999, 100, padding_idx=0)
  (encoder): GRU(100, 25, num_layers=2, batch_first=True, bidirectional=True)
  (decoder): GRUDecoder (
    (softmax_): Softmax ()
    (tanh_): Tanh ()
    (sigmoid_): Sigmoid ()
    (gru_): GRUCell(100, 50)
  )
  (encoder2decoder): Linear (50 -> 50)
  (decoder2vocab): Linear (50 -> 999)
)


Variable containing:
( 0 ,.,.) = 
1.00000e-03 *
  1.1017  1.2107  1.0278  ...   0.9625  1.0906  1.0502
  1.1098  1.2375  1.0092  ...   0.9213  1.1027  1.0541
  1.0959  1.2471  0.9855  ...   0.9034  1.1090  1.0555
           ...             ⋱             ...          
  1.0524  1.2457  0.9323  ...   0.9058  1.1511  1.0484
  1.0524  1.2457  0.9322  ...   0.9060  1.1514  1.0483
  1.0524  1.2456  0.9321  ...   0.9062  1.1517  1.0483

( 1 ,.,.) = 
1.00000e-03 *
  1.1017  1.2107  1.0278  ...   0.9625  1.0906  1.0502
  1.1098  1.2375  1.0092  ...   0.9213  1.1027  1.0541
  1.0959  1.2471  0.9855  ...   0.9034  1.1090  1.0555
           ...             ⋱             ...          
  1.0524  1.2457  0.9323  ...   0.9058  1.1511  1.0484
  1.0524  1.2457  0.9322  ...   0.9060  1.1514  1.0483
  1.0524  1.2456  0.9321  ...   0.9062  1.1517  1.0483

( 2 ,.,.) = 
1.00000e-03 *
  1.1017  1.2107  1.0278  ...   0.9625  1.0906  1.0502
  1.1098  1.2375  1.0092  ...   0.9213  1.1027  1.0541
  1.0959  1.2471

In [3]:
# Shape probe for a 3-layer, unidirectional, batch-first LSTM on the CPU.
rnn = torch.nn.LSTM(9, 5, num_layers=3, bidirectional=False, batch_first=True)

# NOTE: `input` shadows the Python builtin; the name is kept because other
# cells may rely on it.
input = Variable(torch.randn(128, 3, 9))  # (batch, seq, feature)
h0 = Variable(torch.randn(3, 128, 5))     # (layers, batch, hidden)
c0 = Variable(torch.randn(3, 128, 5))
output, hn = rnn(input, (h0, c0))
hh, cc = hn
# print(...) with a single argument behaves the same on Python 2 and 3.
print(hh.size())
print(cc.size())
print(hh[-1].size())  # top layer's final hidden state
print(cc[-1].size())
print(output.size())

torch.Size([3, 128, 5])
torch.Size([3, 128, 5])
torch.Size([128, 5])
torch.Size([128, 5])
torch.Size([128, 3, 5])


In [58]:
# Check that `*` on two LongTensors is an element-wise product.
aa = torch.LongTensor([[1, 2], [2, 3]])
bb = torch.LongTensor([[2, 3], [4, 4]])
# print(...) with a single argument behaves the same on Python 2 and 3.
print(aa)
print(bb)
print(aa*bb)


 1  2
 2  3
[torch.LongTensor of size 2x2]


 2  3
 4  4
[torch.LongTensor of size 2x2]


  2   6
  8  12
[torch.LongTensor of size 2x2]

