In [1]:
import re
import os
import argparse
import shutil
import numpy as np

import torch
from torch.autograd import Variable

from data_utils import *

In [2]:
rnn = torch.nn.LSTM(9, 5, num_layers=3, bidirectional=False, batch_first=True)

input = Variable(torch.randn(128, 3, 9))
h0 = Variable(torch.randn(3, 128, 5))
c0 = Variable(torch.randn(3, 128, 5))
output, hn = rnn(input, (h0, c0))
hh, cc = hn
print hh.size()
print cc.size()
print hh[-1].size()
print cc[-1].size()
print output.size()

torch.Size([3, 128, 5])
torch.Size([3, 128, 5])
torch.Size([128, 5])
torch.Size([128, 5])
torch.Size([128, 3, 5])


In [3]:
class SoftDotAttention(torch.nn.Module):
    
    def __init__(self, hidden_size):
        super(SoftDotAttention, self).__init__()
        
        self.linear_in = torch.nn.Linear(
            hidden_size,
            hidden_size,
            bias=False
        ).cuda()
        self.linear_out = torch.nn.Linear(
            hidden_size*2,
            hidden_size,
            bias=False
        ).cuda()
        
    def forward(self, input_, encoder_hy):
        
        target = self.linear_in(input_).unsqueeze(2)
        attn = torch.bmm(encoder_hy, target).squeeze(2)
        attn = torch.nn.Softmax()(attn)
        attn2 = attn.view(attn.size(0), 1, attn.size(1))
        
        weighted_context = torch.bmm(attn2, encoder_hy).squeeze(1)
        
        h_attn = torch.cat((weighted_context, input_), 1)
        h_attn = self.linear_out(h_attn)
        h_attn = torch.nn.Tanh()(h_attn)
        
        return h_attn, attn

rd1 = Variable(torch.rand([128, 100])).cuda()
rd2 = Variable(torch.rand([128, 16, 100])).cuda()
model = SoftDotAttention(hidden_size=100).cuda()
print model
hh, cc = model(rd1, rd2)
print hh
print cc

SoftDotAttention (
  (linear_in): Linear (100 -> 100)
  (linear_out): Linear (200 -> 100)
)
Variable containing:
 0.0472  0.1069  0.4855  ...  -0.0105  0.2764 -0.2282
 0.2576 -0.2164  0.2461  ...   0.0927  0.3746 -0.0385
 0.2283 -0.0510  0.1408  ...   0.0626  0.4829 -0.0880
          ...             ⋱             ...          
 0.0657 -0.0529  0.3123  ...  -0.0555  0.4422 -0.1950
 0.2297 -0.1922  0.1218  ...   0.1817  0.2635 -0.2318
 0.0733  0.0217  0.1559  ...  -0.1520  0.4014 -0.1698
[torch.cuda.FloatTensor of size 128x100 (GPU 0)]

Variable containing:
 0.0263  0.0275  0.0647  ...   0.0797  0.0287  0.0307
 0.3163  0.0128  0.0127  ...   0.0084  0.0196  0.0097
 0.0356  0.0361  0.0644  ...   0.0191  0.2002  0.0407
          ...             ⋱             ...          
 0.0826  0.1428  0.0311  ...   0.0354  0.0274  0.0386
 0.0310  0.0753  0.0214  ...   0.1289  0.1041  0.1391
 0.0825  0.1653  0.1374  ...   0.0517  0.0853  0.0218
[torch.cuda.FloatTensor of size 128x16 (GPU 0)]



In [4]:
class LSTMAttentionDot(torch.nn.Module):
    
    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers=1,
        batch_first=False
    ):
        super(LSTMAttentionDot, self).__init__()
        # parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layer = num_layers
        self.batch_first = batch_first
        
        self.lstm_input_w = torch.nn.Linear(
            self.input_size,
            self.hidden_size*4
        ).cuda()
        self.lstm_hidden_w = torch.nn.Linear(
            self.hidden_size,
            self.hidden_size*4
        ).cuda()
        
        self.attn_layer = SoftDotAttention(self.hidden_size).cuda()
        
    def forward(self, input_, hidden_, encoder_hy):
        # user defined lstm with attention
        def attn_lstm(input_, hidden_, encoder_hy):
            hx, cx = hidden_
            gates = self.lstm_input_w(input_) + self.lstm_hidden_w(hx)
            ingate, cellgate, forgetgate, outgate = gates.chunk(4,1)
            
            ingate = torch.nn.Sigmoid()(ingate)
            forgetgate = torch.nn.Sigmoid()(forgetgate)
            outgate = torch.nn.Sigmoid()(outgate)
            cellgate = torch.nn.Tanh()(cellgate)
            
            cy = forgetgate*cx + ingate*cellgate
            hy = outgate*torch.nn.Tanh()(cy)
            
            h_attn, attn = self.attn_layer(hy, encoder_hy.transpose(0, 1))
            
            return h_attn, cy
        
        if self.batch_first:
            input_ = input_.transpose(0,1)
            
        output_ = []
        for k in range(input_.size(0)):
            hidden_ = attn_lstm(input_[k], hidden_, encoder_hy)
            output_.append(hidden_[0])
        
        len_seq = input_.size()[0]
        batch_size, hidden_size = output_[0].size()
        output_ = torch.cat(output_, 0).view(
            len_seq, 
            batch_size, 
            hidden_size
        )
        
        if self.batch_first:
            output_ = output_.transpose(0,1)
            
        return output_, hidden_
    
rd1 = Variable(torch.rand([128, 16, 100])).cuda()
hd2 = Variable(torch.rand([128, 50])).cuda()
cd2 = Variable(torch.rand([128, 50])).cuda()
hid2 = (hd2, cd2)
ctx = Variable(torch.rand([16, 128, 50])).cuda()
model = LSTMAttentionDot(
    input_size=100, 
    hidden_size=50, 
    num_layers=1, 
    batch_first=True
).cuda()
print model
aa, bb = model(rd1, hid2, ctx)
print aa.size()

LSTMAttentionDot (
  (lstm_input_w): Linear (100 -> 200)
  (lstm_hidden_w): Linear (50 -> 200)
  (attn_layer): SoftDotAttention (
    (linear_in): Linear (50 -> 50)
    (linear_out): Linear (100 -> 50)
  )
)
torch.Size([128, 16, 50])


In [26]:
class seq2seqAttention(torch.nn.Module):
    
    def __init__(
        self,
        src_emb_dim=100,
        trg_emb_dim=100,
        src_hidden_dim=50,
        trg_hidden_dim=50,
        src_vocab_size=999,
        trg_vocab_size=999,
        src_pad_token=0,
        trg_pad_token=0,
        src_nlayer=2,
        trg_nlayer=1,
        batch_first=True,
        src_bidirect=True,
        batch_size=128,
        dropout=0.0
    ):
        super(seq2seqAttention, self).__init__()
        # parameters
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_nlayer = src_nlayer
        self.trg_nlayer = trg_nlayer
        self.batch_first = batch_first
        self.src_bidirect = src_bidirect
        self.batch_size = batch_size
        self.dropout = dropout
        
        self.src_num_directions = 1
        if self.src_bidirect:
            self.src_hidden_dim = src_hidden_dim // 2
            self.src_num_directions = 2
        
        
        # source embedding
        self.src_embedding = torch.nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            padding_idx=0
        ).cuda()
        torch.nn.init.normal(
            self.src_embedding.weight, 
            mean=0.0, 
            std=0.02
        )
        # targe embedding
        self.trg_embedding = torch.nn.Embedding(
            self.trg_vocab_size,
            self.trg_emb_dim,
            padding_idx=0
        ).cuda()
        torch.nn.init.normal(
            self.trg_embedding.weight,
            mean=0.0,
            std=0.02
        )
        # encoder
        self.encoder = torch.nn.LSTM(
            input_size=self.src_emb_dim,
            hidden_size=self.src_hidden_dim,
            num_layers=self.src_nlayer,
            batch_first=self.batch_first,
            dropout=self.dropout,
            bidirectional=self.src_bidirect
        ).cuda()
        # decoder
        self.decoder = LSTMAttentionDot(
            input_size=self.trg_emb_dim,
            hidden_size=self.trg_hidden_dim,
            batch_first=self.batch_first
        ).cuda()
        
        # encoder to decoder
        self.encoder2decoder = torch.nn.Linear(
            self.src_hidden_dim*self.src_num_directions,
            self.trg_hidden_dim
        ).cuda()
        torch.nn.init.constant(self.encoder2decoder.bias, 0.0)
        # decoder to vocab
        self.decoder2vocab = torch.nn.Linear(
            self.trg_hidden_dim,
            self.trg_vocab_size
        ).cuda()
        torch.nn.init.constant(self.decoder2vocab.bias, 0.0)
        
    def forward(self, input_src, input_trg):
        
        src_emb = self.src_embedding(input_src)
        trg_emb = self.trg_embedding(input_trg)
        
        print src_emb.size()
        print trg_emb.size()
        
        batch_size = input_src.size(1)
        if self.batch_first:
            batch_size = input_src.size(0)
            
        print batch_size
        
        h0_encoder = Variable(torch.zeros(
            self.encoder.num_layers*self.src_num_directions,
            self.batch_size,
            self.src_hidden_dim
        ), requires_grad=False).cuda()
        
        c0_encoder = Variable(torch.zeros(
            self.encoder.num_layers*self.src_num_directions,
            self.batch_size,
            self.src_hidden_dim
        ), requires_grad=False).cuda()
        
        print h0_encoder.size()
        print c0_encoder.size()
        
        src_h, (src_h_t, src_c_t) = self.encoder(
            src_emb, 
            (h0_encoder, c0_encoder)
        )
        
        print 'src_h'
        print src_h.size()
        print src_h_t.size()
        print src_c_t.size()
        
        if self.src_bidirect:
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
            c_t = torch.cat((src_c_t[-1], src_c_t[-2]), 1)
        else:
            h_t = src_h_t[-1]
            c_t = src_c_t[-1]
            
        print 'h_t'
        print h_t.size()
        print c_t.size()
            
        decoder_h0 = self.encoder2decoder(h_t)
        decoder_h0 = torch.nn.Tanh()(decoder_h0)
        decoder_c0 = c_t
        
        print 'h0'
        print decoder_h0.size()
        print decoder_c0.size()
        
        encoder_hy = src_h.transpose(0,1)
        
        trg_h, (_, _) = self.decoder(
            trg_emb,
            (decoder_h0, decoder_c0),
            encoder_hy
        )
        
        print 'trg_h'
        print trg_h.size()
        
        trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1),
            trg_h.size(2)
        )
        
        print 'reshape'
        print trg_h_reshape.size()
        
        decoder_output = self.decoder2vocab(trg_h_reshape)
        decoder_output = decoder_output.view(
            trg_h.size(0),
            trg_h.size(1),
            decoder_output.size(1)
        )
        
        return decoder_output

model = seq2seqAttention(
    src_emb_dim=100,
    trg_emb_dim=100,
    src_hidden_dim=50,
    trg_hidden_dim=50,
    src_vocab_size=999,
    trg_vocab_size=999,
    src_pad_token=0,
    trg_pad_token=0,
    src_nlayer=2,
    trg_nlayer=1,
    batch_first=True,
    src_bidirect=True,
    batch_size=128,
    dropout=0.0
).cuda()

print model

sen_in = Variable(torch.LongTensor(128, 16).fill_(10))
sen_out = Variable(torch.LongTensor(128, 16).fill_(9))
model(sen_in.cuda(), sen_out.cuda())

seq2seqAttention (
  (src_embedding): Embedding(999, 100, padding_idx=0)
  (trg_embedding): Embedding(999, 100, padding_idx=0)
  (encoder): LSTM(100, 25, num_layers=2, batch_first=True, bidirectional=True)
  (decoder): LSTMAttentionDot (
    (lstm_input_w): Linear (100 -> 200)
    (lstm_hidden_w): Linear (50 -> 200)
    (attn_layer): SoftDotAttention (
      (linear_in): Linear (50 -> 50)
      (linear_out): Linear (100 -> 50)
    )
  )
  (encoder2decoder): Linear (50 -> 50)
  (decoder2vocab): Linear (50 -> 999)
)
torch.Size([128, 16, 100])
torch.Size([128, 16, 100])
128
torch.Size([4, 128, 25])
torch.Size([4, 128, 25])
src_h
torch.Size([128, 16, 50])
torch.Size([4, 128, 25])
torch.Size([4, 128, 25])
h_t
torch.Size([128, 50])
torch.Size([128, 50])
h0
torch.Size([128, 50])
torch.Size([128, 50])
trg_h
torch.Size([128, 16, 50])
reshape
torch.Size([2048, 50])


Variable containing:
( 0 ,.,.) = 
 -1.1094e-02  2.2234e-02  7.0412e-03  ...  -1.3893e-02 -2.1753e-02  4.0296e-03
 -3.7722e-03  1.8043e-02  6.6307e-03  ...  -1.9105e-02 -2.4647e-02  5.9177e-03
  1.0599e-04  1.5275e-02  6.8597e-03  ...  -2.2098e-02 -2.6381e-02  7.0041e-03
                 ...                   ⋱                   ...                
  4.6942e-03  1.1150e-02  7.8494e-03  ...  -2.6003e-02 -2.9039e-02  8.7229e-03
  4.6966e-03  1.1146e-02  7.8501e-03  ...  -2.6006e-02 -2.9041e-02  8.7249e-03
  4.6980e-03  1.1143e-02  7.8504e-03  ...  -2.6008e-02 -2.9042e-02  8.7261e-03

( 1 ,.,.) = 
 -1.1094e-02  2.2234e-02  7.0412e-03  ...  -1.3893e-02 -2.1753e-02  4.0296e-03
 -3.7722e-03  1.8043e-02  6.6307e-03  ...  -1.9105e-02 -2.4647e-02  5.9177e-03
  1.0599e-04  1.5275e-02  6.8597e-03  ...  -2.2098e-02 -2.6381e-02  7.0041e-03
                 ...                   ⋱                   ...                
  4.6942e-03  1.1150e-02  7.8494e-03  ...  -2.6003e-02 -2.9039e-02  8.7229e-03
  4.

In [34]:
rd1 = Variable(torch.rand([2, 3, 4])).cuda()
print rd1
rd2 = rd1.view(6, 4)
print rd2
print rd2.view(2, 3, 4)

Variable containing:
(0 ,.,.) = 
  0.8480  0.6187  0.9019  0.9255
  0.0469  0.8340  0.1802  0.4042
  0.5947  0.9032  0.0727  0.3357

(1 ,.,.) = 
  0.1368  0.6639  0.0902  0.1951
  0.9294  0.4551  0.9758  0.7000
  0.1181  0.3522  0.6663  0.9516
[torch.cuda.FloatTensor of size 2x3x4 (GPU 0)]

Variable containing:
 0.8480  0.6187  0.9019  0.9255
 0.0469  0.8340  0.1802  0.4042
 0.5947  0.9032  0.0727  0.3357
 0.1368  0.6639  0.0902  0.1951
 0.9294  0.4551  0.9758  0.7000
 0.1181  0.3522  0.6663  0.9516
[torch.cuda.FloatTensor of size 6x4 (GPU 0)]

Variable containing:
(0 ,.,.) = 
  0.8480  0.6187  0.9019  0.9255
  0.0469  0.8340  0.1802  0.4042
  0.5947  0.9032  0.0727  0.3357

(1 ,.,.) = 
  0.1368  0.6639  0.0902  0.1951
  0.9294  0.4551  0.9758  0.7000
  0.1181  0.3522  0.6663  0.9516
[torch.cuda.FloatTensor of size 2x3x4 (GPU 0)]

