In [2]:
"""
加法注意の実装

NEURAL MACHINE TRANSLATION
BY JOINTLY LEARNING TO ALIGN AND TRANSLATE

[Dzmitry Bahdanau, sec: Kyunghyun Cho, last: Yoshua Bengio, ICLR 2015, arXiv, 2014/09]

https://bastings.github.io/annotated_encoder_decoder/
"""

%matplotlib inline
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from IPython.core.debugger import set_trace

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [3]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.generator = generator
        
    def forward(self, src, trg, src_mask, trg_mask, src_lengths, trg_lengths):
        """
        maskされたソースとtarget 列　を取り込み、処理する
        """
        encoder_hidden, encoder_final = self.encode(src, src_mask, src_lengths)
        return self.decode(encoder_hidden, encoder_final, src_mask, trg, trg_mask)
    
    def encode(self, src, src_mask, src_lengths):
        return self.encoder(self.src_embed(src), src_mask, src_lengths)
    
    def decode(self, encoder_hidden, encoder_final, src_mask, trg, trg_mask, decoder_hidden=None):
        return self.decoder(self.trg_embed(trg), encoder_hidden,
                            encoder_final, src_mask, trg_mask, hidden=decoder_hidden)
    

In [4]:
class Generator(nn.Module):
    """
    linear model と softmax を定義する
    fainal output の次元はtarget vocabulary のsize
    """
    def __init__(self, hidden_size, vocab_size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(hidden_size, vocab_size, bias=False)
        
    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

In [5]:
hidden_size=2
gru = nn.GRU(input_size=3, hidden_size=hidden_size, num_layers=1,bias=False)
input = torch.ones(4,1,3)
h0=torch.ones(1,1,hidden_size)
print('input',input,'\n')
print('h0', h0, '\n')
print('output',gru(input, h0)[0])
print('hidden state',gru(input, h0)[1])
print(gru.weight_hh_l0)
print(gru.weight_ih_l0)

input tensor([[[1., 1., 1.]],

        [[1., 1., 1.]],

        [[1., 1., 1.]],

        [[1., 1., 1.]]]) 

h0 tensor([[[1., 1.]]]) 

output tensor([[[0.8479, 0.4703]],

        [[0.7671, 0.4122]],

        [[0.7190, 0.3987]],

        [[0.6894, 0.3945]]], grad_fn=<CatBackward>)
hidden state tensor([[[0.6894, 0.3945]]], grad_fn=<ViewBackward>)
Parameter containing:
tensor([[-0.3301,  0.1802],
        [-0.3258, -0.0829],
        [-0.2872,  0.4691],
        [-0.5582, -0.3260],
        [-0.1997, -0.4252],
        [ 0.0667, -0.6984]], requires_grad=True)
Parameter containing:
tensor([[ 0.5406,  0.5869, -0.1657],
        [ 0.6496, -0.1549,  0.1427],
        [-0.3443,  0.4153,  0.6233],
        [-0.5188,  0.6146,  0.1323],
        [ 0.5224,  0.0958,  0.3410],
        [-0.0998,  0.5451,  0.1045]], requires_grad=True)


In [126]:
print(gru.weight_ih_l0[0])
print(h0)
print(gru.weight_ih_l0[0])

tensor([-0.2072,  0.7001, -0.0920], grad_fn=<SelectBackward>)
tensor([[[1.],
         [1.]]])
tensor([-0.2072,  0.7001, -0.0920], grad_fn=<SelectBackward>)


In [6]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dopout=dropout)
        
    def forward(self, x, mask, lengths):
        """
        embeddings x に bidirectional GRU を適用する
        input の mini-batch x は lengthでソートされる必要がある
        x は [batch, time, dim] をもつ
        
        pytorch では　pack_padded_sequence, pad_packed_sequence という
        ２つの関数がmask と padding を請け負ってくれる
        pack_padded_sequence -> padされた系列データの各バッチを時刻が早い順にpackしてくれる
        
        length : Tのこと時系列の長さ
        """
        packed = pack_padded_sequence(x, lengths, batch_first=True)
        output, final = self.rnn(packed)
        output, _ = pad_packed_sequence(output, batch_first=True)
        
        # 双方向のfinal hidden statesを結合させる
        
        fwd_final = final[0:final.size(0):2]
        bwd_final = final[1:final.size(0):2]
        final = torch.cat([fwd_final, bwd_final], dim=2) #[num_layers, batch, 2*dim]
        return output, final

In [7]:
# conditional GRU: 隠れ層の第一相をencoderの最終層に使うGRU のこと？

class Decoder(nn.Module):
    def __init__(self, emb_size, hidden_size, attention, num_layers=1, dropout=0.5,,
                bridge=True):
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.attention = attention
        self.dropout = dropout
        
        # nn.GRU (input_size, hidden_size, num_layers) input_size:入力の次元数, hidden_size: 隠れ層の次元数, numl_layers: スタック数
        self.rnn = nn.GRU(emb_size + 2*hidden_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        # encoderのfinal hidden層 を initialize する. パラメーターをdecoder用に変換している（引数に注目）
        self.bridge = nn.Linear(2*hidden_size, hidden_size, bias=True) if bridge else None
        
        self.dropout_layer = nn.Dropout(p=dropout)
        self.pre_output_layer = nn.Linear(hidden_size + 2*hidden_size + emb_size, hidden_size, bias=False)
        
    def forward_step(self, prev_embed, encoder_hidden, src_mask, proj_key, hidden):
        """ perfrom a single decoder step (1word) """
        
        # compute context vextor usinf attention mechanism
        query = hidden[-1].unsqueeze(1) # [#layers, B, D] -> [B, 1, D] # decoderの最終層(スタック方向)の隠れ層を取り出してqueryとする
        context, attn_probs = self.attention(
        query=query, proj_key=proj_key,
            value = encoder_hidden, mask=src_mask )
        
        # update rnn hidden state
        # p13 の s˜iの式 #入力系列 [batch_size, seq_length, feature_size] 今 seq_length=1
        rnn_input = torch.cat([prev_embed, context], dim=2)
        output, hidden = self.rnn(rnn_input, hidden)
        
        pre_output = torch.cat([prev_embed, context], dim=2)
        pre_output = self.dropout_layer(pre_output)
        pre_output = self.pre_output_layer(pre_output)
        
        return output, hidden, pre_output
    
    def forward(self, trg_embed, encoder_hidden, encoder_final, src_mask, trg_mask,
                hidden=None, max_len=None):
        
        # RNN の 時系列方向の最大長
        if max_len is None:
            max_len = trg_mask.size(-1)
        
        if hidden is None:
            hidden = self.init_hidden(encoder_final)
            
        proj_key = self.attention.key_layer(encoder_hidden) # encoder_hidden_size を decoder_hidden_sizeにしている
        
        decoder_states = []
        pre_output_vectors = []
        
        # decoder RNN を max_len まで 展開する
        for i in range(max_len):
            prev_embed = trg_embed[:, i].unsqueeze(1) # trg_embed[:, i, :] と同じ [A, 1, B] となる => seq_length=1
            output, hidden, pre_output = self.forward_step(
            prev_embed, encoder_hidden, src_mask, proj_key, hidden)
            decoder_states.append(output)
            pre_output_vectors.append(pre_output)
            
            decoder_states = torch.cat(decder_states, dim=1)
            pre_output_vectors = torch.cat(pre_output_vectors, dim=1)
            return decoder_states, hidden, pre_output_vectors 
            
        def init_hidden(self, encoder_final):
            if encoder_final is None:
                return None 
            
            return torch.tanh(self.bridge(encder_final))
        

SyntaxError: invalid syntax (<ipython-input-7-246db5b39685>, line 4)

In [14]:
class BahdanauAttention(nn.Module):
    """
    Implements Bahdnau(MLP) attention
    """
    def __init__(self, hidden_size, key_size=None, query_size=None):
        super(BahdanauAttention, self).__init__()
        # bi-directional encoder なので key_size = 2 * hidden_size
        key_size = 2 * hidden_size if key_size is None else key_size
        query_size = hidden_size if query_size None else query_size
        
        self.key_layer = nn.Linear(key_size, hidden_size, bias=False)
        self.query_layer = nn.Linear(query_size, hidden_size, bias=False)
        self.energy_layer = nn.Linear(hidden_size, 1, bias=False)
        
        # to store attention score
        self.alpha = None
    
    def forward(self, query=None, proj_key=None, value=None, mask=None):
        assert mask is not None, "mask is required"
        # we first project the query
        query = self.query_layer(query)
        
        # calculate socre
        scores = self.energy_layer(torch.tanh(query + proj_key)) # query: decoderの隠れ層si-1 proj_key: encoder_hiddenをdecoder_hidden_sizeにしたもの
        scores = scores.squeeze(2).unsqueeze(1)
        
        # Mask out invalid positions
        # The mask marks valid positions so we invert it using 'mask & 0'
        scores.data.masked_fill_(mask == 0, -float('inf'))
        
        alphas = F.softmax(scores, dim=-1)
        self.alphas = alphas
        context = torch.bmm(alphas, value) # 重みとエンコーダーの隠れ層の各要素の和＝内積
        
        return context, alphas
        

tensor([-1.2858,  0.4943,  1.2176, -0.8914,  0.7860,  1.0060,  0.8448, -0.1627,
         1.3187,  0.5707])
tensor([[-1.2858,  0.4943,  1.2176, -0.8914,  0.7860,  1.0060,  0.8448, -0.1627,
          1.3187,  0.5707]])


In [59]:
a = torch.randn((1,3,4))
b = torch.randn((1,3,4))
c = torch.cat([a,b],dim=2)
print(a)
print(b)
print(c)
d = torch.randn((10,10))
print(d.shape)
print(d[:,5].shape)
print(d.unsqueeze(1).shape)
print(a[:,1,:].shape)
print(a[:,1].shape)

tensor([[[-1.5065,  0.4956, -0.2470, -1.2075],
         [-0.7373,  1.1051,  0.8750,  1.1536],
         [ 1.8955,  1.4530,  0.6975,  1.6495]]])
tensor([[[ 0.1721,  0.5236, -0.7787,  1.1389],
         [-1.1370,  0.5588,  1.3614, -0.8475],
         [ 1.0006, -0.9063,  0.6109,  0.9280]]])
tensor([[[-1.5065,  0.4956, -0.2470, -1.2075,  0.1721,  0.5236, -0.7787,
           1.1389],
         [-0.7373,  1.1051,  0.8750,  1.1536, -1.1370,  0.5588,  1.3614,
          -0.8475],
         [ 1.8955,  1.4530,  0.6975,  1.6495,  1.0006, -0.9063,  0.6109,
           0.9280]]])
torch.Size([10, 10])
torch.Size([10])
torch.Size([10, 1, 10])
torch.Size([1, 4])
torch.Size([1, 4])


In [60]:
m = nn.Linear(20, 30)
input = torch.randn(128, 20)
output = m(input)
print(output.size())

torch.Size([128, 30])
