In [2]:
import re
import os
import argparse
import shutil
import numpy as np

import torch
import torch.nn.functional as F
from torch.autograd import Variable

from data_utils import *

In [83]:
class AttentionBahdanau(torch.nn.Module):
    """Parameter-free dot-product attention followed by a tanh.

    Despite the name, the score used here is a plain dot product between
    the decoder state and every encoder state; there are no learned
    weights in this layer.

    forward(last_dehy, enhy):
        last_dehy -- 2-D decoder state, (batch, hidden).
        enhy      -- 3-D encoder states, (batch, src_len, hidden).

    Returns a pair ``(h_attn, attn)``:
        h_attn -- tanh of the attention-weighted sum of ``enhy``,
                  (batch, hidden).
        attn   -- normalised attention weights, (batch, src_len).
    """

    def __init__(self):
        super(AttentionBahdanau, self).__init__()

        # Activations are kept as submodules so they show up in repr(model).
        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()

    def forward(self, last_dehy, enhy):
        # (batch, hidden) -> (batch, hidden, 1): right-hand operand for bmm.
        query = last_dehy.unsqueeze(2)

        # One raw score per encoder position, then normalise them.
        scores = torch.bmm(enhy, query).squeeze(2)
        attn = self.softmax_(scores)

        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, hidden)
        context = torch.bmm(attn.unsqueeze(1), enhy).squeeze(1)

        return self.tanh_(context), attn

# Smoke test for AttentionBahdanau on random GPU tensors:
# rd1 plays the decoder state (128 x 100), rd2 the encoder states (128 x 16 x 100).
rd1 = Variable(torch.FloatTensor(torch.rand([128, 100]))).cuda()
rd2 = Variable(torch.FloatTensor(torch.rand([128, 16, 100]))).cuda()
model = AttentionBahdanau().cuda()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)
print(rd1.size())
print(rd2.size())
hh, cc = model(rd1, rd2)
print(hh)
print(cc)

AttentionBahdanau (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
 0.5565  0.5496  0.5041  ...   0.4646  0.2192  0.5810
 0.0982  0.6457  0.6537  ...   0.7329  0.3234  0.3716
 0.4448  0.6032  0.3324  ...   0.6051  0.4891  0.5156
          ...             ⋱             ...          
 0.4982  0.4867  0.6388  ...   0.5807  0.4730  0.5565
 0.3614  0.6302  0.4527  ...   0.5910  0.6475  0.4872
 0.4764  0.6593  0.3353  ...   0.5504  0.4830  0.4948
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0172  0.0176  0.0928  ...   0.0111  0.0369  0.1967
 0.0391  0.0008  0.0032  ...   0.0023  0.0017  0.0156
 0.0173  0.0009  0.0059  ...   0.0974  0.0048  0.0005
          ...             ⋱             ...          
 0.0095  0.0022  0.0172  ...   0.0623  0.0137  0.0332
 0.0265  0.0606  0.0013  ...   0.0066  0.0106  0.0642
 0.0145  0.0124  0.0085  ...   0.0061  0.0271  0.3162
[torch.cuda.FloatTensor of size 128x

In [84]:
class AttentionLuong(torch.nn.Module):
    """Luong-style attention with a selectable score function.

    ``attn_method`` is lower-cased and picks how encoder positions are
    scored against the decoder state:

    * ``'luong_concat'``  -- ``v^T tanh(W [enhy; dehy])`` using the two
      linear layers stored in ``attn_in``.
    * ``'luong_general'`` -- dot product between ``dehy`` and a linear
      projection (``attn_in``) of ``enhy``.
    * any other value     -- plain dot product between ``dehy`` and ``enhy``.

    Args:
        attn_method: name of the score function (see above).
        hidden_size: size of the decoder / encoder state vectors.
        bias: whether the linear layers of this module carry a bias term.

    forward(dehy, enhy):
        dehy -- decoder state, (batch, hidden_size).
        enhy -- encoder states, (batch, src_len, hidden_size).

    Returns a pair ``(h_attn, attn)``:
        h_attn -- ``tanh(attn_out([context; dehy]))``, (batch, hidden_size).
        attn   -- normalised attention weights, (batch, src_len).
    """

    def __init__(
        self,
        attn_method='luong_dot',
        hidden_size=100,
        bias=False
    ):
        super(AttentionLuong, self).__init__()
        self.method = attn_method.lower()
        self.hidden_size = hidden_size
        self.bias = bias

        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()

        if self.method == 'luong_concat':
            # The two projections of the concat scorer. forward() applies a
            # tanh between them; they stay in a Sequential so the parameter
            # names (attn_in.0.*, attn_in.1.*) are unchanged.
            self.attn_in = torch.nn.Sequential(
                torch.nn.Linear(
                    self.hidden_size*2,
                    self.hidden_size,
                    bias=self.bias
                ),
                torch.nn.Linear(self.hidden_size, 1, bias=self.bias)
            ).cuda()
        elif self.method == 'luong_general':
            self.attn_in = torch.nn.Linear(
                self.hidden_size,
                self.hidden_size,
                bias=self.bias
            ).cuda()

        # Mixes the context vector with the decoder state.
        self.attn_out = torch.nn.Linear(
            self.hidden_size*2,
            self.hidden_size,
            bias=self.bias
        ).cuda()

    def forward(self, dehy, enhy):
        dehy_new = dehy.unsqueeze(2)
        enhy_new = enhy

        if self.method == 'luong_concat':
            # Pair every encoder position with a copy of the decoder state.
            dehy_rep = dehy.unsqueeze(1).repeat(1, enhy.size(1), 1)
            cat_hy = torch.cat((enhy, dehy_rep), 2)
            # The tanh between the two projections is essential: without it
            # the scorer is a single linear map, the decoder's contribution
            # is the same constant for every position, and it cancels in the
            # softmax -- the weights would not depend on ``dehy`` at all.
            scorer_hidden = self.tanh_(self.attn_in[0](cat_hy))
            attn = self.attn_in[1](scorer_hidden).squeeze(2)
        else:
            if self.method == 'luong_general':
                enhy_new = self.attn_in(enhy)

            attn = torch.bmm(enhy_new, dehy_new).squeeze(2)

        attn = self.softmax_(attn)
        attn2 = attn.view(attn.size(0), 1, attn.size(1))

        # NOTE(review): for 'luong_general' the context is built from the
        # projected encoder states (enhy_new), not the raw ones. Kept as in
        # the original; confirm this is intended.
        attn_enhy = torch.bmm(attn2, enhy_new).squeeze(1)

        h_attn = self.attn_out(torch.cat((attn_enhy, dehy), 1))
        h_attn = self.tanh_(h_attn)

        return h_attn, attn

# Smoke test for AttentionLuong (default dot scoring) on random GPU tensors:
# rd1 plays the decoder state (128 x 100), rd2 the encoder states (128 x 16 x 100).
rd1 = Variable(torch.FloatTensor(torch.rand([128, 100]))).cuda()
rd2 = Variable(torch.FloatTensor(torch.rand([128, 16, 100]))).cuda()
model = AttentionLuong(hidden_size=100).cuda()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)
print(rd1.size())
print(rd2.size())
hh, cc = model(rd1, rd2)
print(hh)
print(cc)

AttentionLuong (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (attn_out): Linear (200 -> 100)
)
torch.Size([128, 100])
torch.Size([128, 16, 100])
Variable containing:
-5.2184e-01  6.2113e-02 -1.0974e-01  ...  -5.3795e-01 -1.5195e-01 -1.7976e-01
-5.2754e-01  2.3644e-02  2.4752e-01  ...  -3.2898e-01 -1.1490e-01 -4.1162e-01
-5.3335e-01 -8.4141e-02 -1.3482e-02  ...  -3.2721e-01 -1.1471e-01 -3.2949e-01
                ...                   ⋱                   ...                
-3.7479e-01  7.8702e-02  1.1285e-01  ...  -4.5370e-01 -3.5352e-02 -3.3908e-01
-4.2267e-01  2.5574e-01  2.4523e-02  ...  -4.6854e-01  1.4449e-01 -2.4817e-01
-4.9821e-01 -8.5791e-03  2.9705e-01  ...  -3.7256e-01 -1.1203e-01 -1.0894e-01
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0191  0.0194  0.2658  ...   0.0536  0.2368  0.0033
 0.0019  0.0076  0.0833  ...   0.0100  0.0030  0.1161
 0.0005  0.0340  0.0017  ...   0.0085  0.0432  0.1197
          ...             ⋱             ...          

In [85]:
class GRUDecoder(torch.nn.Module):
    """Step-by-step GRU decoder with optional attention over encoder states.

    ``attn_method`` is lower-cased and selects the decoding scheme:

    * ``'vanilla'``  -- a plain GRU cell, no attention.
    * ``'bahdanau'`` -- attention is computed from the previous hidden
      state and concatenated to the input *before* the GRU step.
    * any other value -- handed to ``AttentionLuong``; attention is computed
      from the new hidden state *after* the GRU step and fed into the next
      step's input (the first step uses zeros).

    Args:
        input_size: size of each input vector.
        hidden_size: size of the GRU hidden state.
        num_layers: stored as ``n_layer``; not otherwise used by this class.
        attn_method: decoding scheme, see above.
        batch_first: if True, ``input_`` is (batch, seq, input_size) and the
            returned output is (batch, seq, hidden_size).

    forward(input_, hidden_, encoder_hy):
        hidden_    -- initial hidden state, (batch, hidden_size).
        encoder_hy -- encoder states; transposed on dims 0/1 before being
                      passed to the attention layer, i.e. expected as
                      (src_len, batch, hidden_size). Not touched in
                      'vanilla' mode.

    Returns ``(output_, hidden_)``: the hidden state of every step stacked
    along the time axis, and the last hidden state.
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers=1,
        attn_method='bahdanau',
        batch_first=True
    ):
        super(GRUDecoder, self).__init__()
        # parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layer = num_layers
        self.batch_first = batch_first
        self.attn_method = attn_method.lower()

        self.softmax_ = torch.nn.Softmax().cuda()
        self.tanh_ = torch.nn.Tanh().cuda()
        self.sigmoid_ = torch.nn.Sigmoid().cuda()

        # With attention the cell also receives a hidden_size-wide context.
        if self.attn_method == 'vanilla':
            cell_input_size = self.input_size
        else:
            cell_input_size = self.input_size + self.hidden_size
        self.gru_ = torch.nn.GRUCell(cell_input_size, self.hidden_size)

        if self.attn_method == 'bahdanau':
            self.attn_layer = AttentionBahdanau().cuda()
        elif self.attn_method != 'vanilla':
            self.attn_layer = AttentionLuong(
                attn_method=self.attn_method,
                hidden_size=self.hidden_size
            ).cuda()

    def forward(self, input_, hidden_, encoder_hy):
        # Work time-major internally: (seq, batch, input_size).
        if self.batch_first:
            input_ = input_.transpose(0, 1)

        len_seq = input_.size(0)
        steps = []

        if self.attn_method == 'vanilla':
            for t in range(len_seq):
                hidden_ = self.gru_(input_[t], hidden_)
                steps.append(hidden_)

        elif self.attn_method == 'bahdanau':
            memory = encoder_hy.transpose(0, 1)
            for t in range(len_seq):
                # Attend with the previous state, then take the GRU step.
                context, _ = self.attn_layer(hidden_, memory)
                hidden_ = self.gru_(torch.cat((input_[t], context), 1), hidden_)
                steps.append(hidden_)

        else:
            memory = encoder_hy.transpose(0, 1)
            # The first step has no attention result yet: feed zeros.
            context = Variable(
                torch.FloatTensor(torch.zeros(input_.size(1), self.hidden_size))
            ).cuda()
            for t in range(len_seq):
                # Take the GRU step, then attend with the new state; the
                # result is consumed by the next step.
                hidden_ = self.gru_(torch.cat((input_[t], context), 1), hidden_)
                context, _ = self.attn_layer(hidden_, memory)
                # NOTE(review): the GRU state, not the attention output, is
                # what gets emitted here -- confirm this is intended.
                steps.append(hidden_)

        # Stack the per-step (batch, hidden) states into (seq, batch, hidden).
        batch_size, hidden_size = steps[0].size()
        output_ = torch.cat(steps, 0).view(len_seq, batch_size, hidden_size)

        if self.batch_first:
            output_ = output_.transpose(0, 1)

        return output_, hidden_
    
# Smoke test for GRUDecoder with Luong (dot) attention on random GPU tensors.
# rd1: decoder inputs (batch=128, seq=16, emb=100); hd2: initial hidden state;
# ctx: encoder states laid out time-major (src_len=16, batch=128, hidden=50).
rd1 = Variable(torch.FloatTensor(torch.rand([128, 16, 100]))).cuda()
hd2 = Variable(torch.FloatTensor(torch.rand([128, 50]))).cuda()
cd2 = Variable(torch.FloatTensor(torch.rand([128, 50]))).cuda()
hid2 = (hd2, cd2)
ctx = Variable(torch.rand([16, 128, 50])).cuda()
model = GRUDecoder(
    input_size=100, 
    hidden_size=50,
    num_layers=1,
    batch_first=True,
    attn_method='Luong'
).cuda()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)
aa, bb = model(rd1, hd2, ctx)
print(aa.size())
#print(bb.size())

GRUDecoder (
  (softmax_): Softmax ()
  (tanh_): Tanh ()
  (sigmoid_): Sigmoid ()
  (gru_): GRUCell(150, 50)
  (attn_layer): AttentionLuong (
    (softmax_): Softmax ()
    (tanh_): Tanh ()
    (attn_out): Linear (100 -> 50)
  )
)
torch.Size([128, 16, 50])


In [86]:
class seq2seqAttention(torch.nn.Module):
    """LSTM encoder + GRU decoder sequence-to-sequence model.

    Source and target tokens share one embedding table. The encoder is a
    (optionally bidirectional) LSTM; its final hidden state is projected by
    ``encoder2decoder`` and squashed with tanh to initialise the
    ``GRUDecoder``. Decoder states are mapped to vocabulary logits by
    ``decoder2vocab``.

    Args:
        src_emb_dim: embedding size (also the encoder input size).
        trg_emb_dim: decoder input size; the shared embedding produces
            ``src_emb_dim``-wide vectors, so the two are expected to match.
        src_hidden_dim: encoder output size. When ``src_bidirect`` is True
            it is halved per direction so the concatenated output keeps
            this width.
        trg_hidden_dim: decoder hidden size.
        src_vocab_size: number of rows of the shared embedding.
        trg_vocab_size: number of output classes.
        src_pad_token, trg_pad_token, trg_nlayer: accepted but not used by
            this class (``padding_idx`` is fixed to 0).
        src_nlayer: number of encoder layers.
        batch_first: layout of ``input_src`` / ``input_trg`` and of the output.
        src_bidirect: use a bidirectional encoder.
        batch_size: stored on the instance; ``forward`` sizes its state
            from the actual input instead.
        dropout: dropout between encoder layers.
        attn_method: forwarded to ``GRUDecoder``.
    """

    def __init__(
        self,
        src_emb_dim=100,
        trg_emb_dim=100,
        src_hidden_dim=50,
        trg_hidden_dim=50,
        src_vocab_size=999,
        trg_vocab_size=999,
        src_pad_token=0,
        trg_pad_token=0,
        src_nlayer=1,
        trg_nlayer=1,
        batch_first=True,
        src_bidirect=True,
        batch_size=128,
        dropout=0.0,
        attn_method='vanilla'
    ):
        super(seq2seqAttention, self).__init__()
        # parameters
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_nlayer = src_nlayer
        self.trg_nlayer = trg_nlayer
        self.batch_first = batch_first
        self.src_bidirect = src_bidirect
        self.batch_size = batch_size
        self.dropout = dropout
        self.attn_method=attn_method
        
        # Split the encoder width across both directions so the
        # concatenated encoder output is still src_hidden_dim wide.
        self.src_num_directions = 1
        if self.src_bidirect:
            self.src_hidden_dim = src_hidden_dim // 2
            self.src_num_directions = 2
        
        # source embedding and target embedding
        # the same for summarization.
        self.embedding = torch.nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            padding_idx=0
        ).cuda()
        # NOTE(review): this initialises the whole weight matrix, presumably
        # including the padding_idx row -- confirm that is acceptable.
        torch.nn.init.uniform(self.embedding.weight, -1.0, 1.0)
        # encoder
        self.encoder = torch.nn.LSTM(
            input_size=self.src_emb_dim,
            hidden_size=self.src_hidden_dim,
            num_layers=self.src_nlayer,
            batch_first=self.batch_first,
            dropout=self.dropout,
            bidirectional=self.src_bidirect
        ).cuda()
        # decoder
        self.decoder = GRUDecoder(
            input_size=self.trg_emb_dim,
            hidden_size=self.trg_hidden_dim,
            batch_first=self.batch_first,
            attn_method=self.attn_method
        ).cuda()
        # encoder to decoder
        self.encoder2decoder = torch.nn.Linear(
            self.src_hidden_dim*self.src_num_directions,
            self.trg_hidden_dim
        ).cuda()
        torch.nn.init.constant(self.encoder2decoder.bias, 0.0)
        # decoder to vocab
        self.decoder2vocab = torch.nn.Linear(
            self.trg_hidden_dim,
            self.trg_vocab_size
        ).cuda()
        torch.nn.init.constant(self.decoder2vocab.bias, 0.0)
        
    def forward(self, input_src, input_trg):
        """Return vocabulary logits for every target position.

        ``input_src`` and ``input_trg`` are integer token-id tensors laid
        out (batch, seq) when ``batch_first`` is True and (seq, batch)
        otherwise. The result has the same leading two dimensions as
        ``input_trg`` plus a final dimension of ``trg_vocab_size``.
        """
        src_emb = self.embedding(input_src)
        trg_emb = self.embedding(input_trg)
        
        # Size the initial state from the batch actually received rather
        # than the constructor's batch_size, so batches of any size work.
        batch_size = input_src.size(1)
        if self.batch_first:
            batch_size = input_src.size(0)

        state_shape = (
            self.encoder.num_layers*self.src_num_directions,
            batch_size,
            self.src_hidden_dim
        )
        # Allocate the zero state with the same tensor type (and device) as
        # the embeddings it is fed alongside.
        h0_encoder = Variable(
            src_emb.data.new(*state_shape).zero_(),
            requires_grad=False
        )
        c0_encoder = Variable(
            src_emb.data.new(*state_shape).zero_(),
            requires_grad=False
        )

        # The cell state is not needed: the decoder is a GRU.
        src_h, (src_h_t, _) = self.encoder(
            src_emb, 
            (h0_encoder, c0_encoder)
        )

        # Last layer's final hidden state; for a bidirectional encoder the
        # last two entries are that layer's two directions.
        if self.src_bidirect:
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
        else:
            h_t = src_h_t[-1]
                        
        decoder_h0 = self.encoder2decoder(h_t)
        decoder_h0 = F.tanh(decoder_h0)
        
        # The decoder expects encoder states time-major (src_len, batch, hid).
        # Only a batch-first encoder output needs transposing; a time-major
        # output is already in the right layout.
        if self.batch_first:
            encoder_hy = src_h.transpose(0,1)
        else:
            encoder_hy = src_h
        
        trg_h, _ = self.decoder(
            trg_emb,
            decoder_h0,
            encoder_hy
        )
        
        # Flatten the two leading dims, project to the vocabulary, restore.
        trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1),
            trg_h.size(2)
        )
                
        decoder_output = self.decoder2vocab(trg_h_reshape)
        decoder_output = decoder_output.view(
            trg_h.size(0),
            trg_h.size(1),
            decoder_output.size(1)
        )
        
        return decoder_output
    
    def decode(self, logits):
        """Turn 3-D logits into per-position word probabilities (softmax
        over the last, vocabulary-sized dimension); the shape is preserved."""
        logits_reshape = logits.view(-1, self.trg_vocab_size)
        word_probs = F.softmax(logits_reshape)
        word_probs = word_probs.view(
            logits.size(0), logits.size(1), logits.size(2)
        )

        return word_probs

# Smoke test: build the full model (2-layer bidirectional encoder, vanilla
# decoder) and push a constant batch of token ids through it on the GPU.
model = seq2seqAttention(
    src_emb_dim=100,
    trg_emb_dim=100,
    src_hidden_dim=50,
    trg_hidden_dim=50,
    src_vocab_size=999,
    trg_vocab_size=999,
    src_pad_token=0,
    trg_pad_token=0,
    src_nlayer=2,
    trg_nlayer=1,
    batch_first=True,
    src_bidirect=True,
    batch_size=128,
    dropout=0.0,
    attn_method='vanilla'
).cuda()

# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)

# 128 source sentences of 16 tokens and 128 target sentences of 18 tokens.
sen_in = Variable(torch.LongTensor(128, 16).fill_(10))
sen_out = Variable(torch.LongTensor(128, 18).fill_(9))
out = model(sen_in.cuda(), sen_out.cuda())
# Last expression of the cell: the notebook displays the word probabilities.
model.decode(out)

seq2seqAttention (
  (embedding): Embedding(999, 100, padding_idx=0)
  (encoder): LSTM(100, 25, num_layers=2, batch_first=True, bidirectional=True)
  (decoder): GRUDecoder (
    (softmax_): Softmax ()
    (tanh_): Tanh ()
    (sigmoid_): Sigmoid ()
    (gru_): GRUCell(100, 50)
  )
  (encoder2decoder): Linear (50 -> 50)
  (decoder2vocab): Linear (50 -> 999)
)


Variable containing:
( 0 ,.,.) = 
1.00000e-03 *
  1.0488  0.9827  1.0681  ...   1.0334  1.0123  1.1476
  1.0045  0.9636  1.1588  ...   1.0279  1.0112  1.2143
  0.9697  0.9519  1.2012  ...   1.0221  1.0151  1.2419
           ...             ⋱             ...          
  0.9219  0.9552  1.2040  ...   0.9968  1.0376  1.2889
  0.9221  0.9553  1.2035  ...   0.9967  1.0376  1.2894
  0.9223  0.9554  1.2031  ...   0.9965  1.0376  1.2898

( 1 ,.,.) = 
1.00000e-03 *
  1.0488  0.9827  1.0681  ...   1.0334  1.0123  1.1476
  1.0045  0.9636  1.1588  ...   1.0279  1.0112  1.2143
  0.9697  0.9519  1.2012  ...   1.0221  1.0151  1.2419
           ...             ⋱             ...          
  0.9219  0.9552  1.2040  ...   0.9968  1.0376  1.2889
  0.9221  0.9553  1.2035  ...   0.9967  1.0376  1.2894
  0.9223  0.9554  1.2031  ...   0.9965  1.0376  1.2898

( 2 ,.,.) = 
1.00000e-03 *
  1.0488  0.9827  1.0681  ...   1.0334  1.0123  1.1476
  1.0045  0.9636  1.1588  ...   1.0279  1.0112  1.2143
  0.9697  0.9519

In [3]:
# Shape check for a 3-layer, unidirectional, batch-first LSTM: the output is
# batch-first, while the returned states stay (num_layers, batch, hidden).
rnn = torch.nn.LSTM(9, 5, num_layers=3, bidirectional=False, batch_first=True)

# NOTE: `input` shadows the builtin of the same name; the name is kept in
# case other cells rely on it.
input = Variable(torch.randn(128, 3, 9))
h0 = Variable(torch.randn(3, 128, 5))
c0 = Variable(torch.randn(3, 128, 5))
output, hn = rnn(input, (h0, c0))
hh, cc = hn
# print(...) with a single argument behaves the same on Python 2 and 3.
print(hh.size())
print(cc.size())
print(hh[-1].size())
print(cc[-1].size())
print(output.size())

torch.Size([3, 128, 5])
torch.Size([3, 128, 5])
torch.Size([128, 5])
torch.Size([128, 5])
torch.Size([128, 3, 5])


In [58]:
# `*` on two tensors of equal shape multiplies element by element
# (it is not a matrix product).
aa = torch.LongTensor([[1, 2], [2, 3]]) 
bb = torch.LongTensor([[2, 3], [4, 4]])
# print(...) with a single argument behaves the same on Python 2 and 3.
print(aa)
print(bb)
print(aa*bb)


 1  2
 2  3
[torch.LongTensor of size 2x2]


 2  3
 4  4
[torch.LongTensor of size 2x2]


  2   6
  8  12
[torch.LongTensor of size 2x2]

