In [1]:
import numpy as np
# Weighted sum of T hidden-state vectors (each of size H) for a single sample.
T, H = 5, 4
hs = np.random.randn(T, H)
# One weight per time step; must contain exactly T entries.
a = np.array([0.8, 0.1, 0.03, 0.05, 0.02])
# Tile every weight across the hidden dimension so `ar` has the same shape
# as `hs`. Use T and H rather than literal 5 and 4 so the sizes stay in sync.
ar = a.reshape(T, 1).repeat(H, axis=1)
print(ar.shape)
# (5, 4)

# Scale each hidden-state row by its weight.
t = hs * ar
print(t.shape)

# Collapse the time axis: c is the weighted sum of the rows of hs.
c = np.sum(t, axis=0)
print(c.shape)
print(c)
print(hs)
print(ar)

(5, 4)
(5, 4)
(4,)
[ 0.70978412  1.19391912  1.24670639 -0.19180209]
[[ 0.89124077  1.37139915  1.50709162 -0.4266593 ]
 [ 0.12681235  0.88570584  0.64557336  0.34988996]
 [ 0.08535004  0.98555162 -0.18745466  0.74821566]
 [-0.33845729 -0.20311363 -0.66485504  0.97671252]
 [-0.07636842 -0.5590823   0.76710708  2.16271289]]
[[ 0.8   0.8   0.8   0.8 ]
 [ 0.1   0.1   0.1   0.1 ]
 [ 0.03  0.03  0.03  0.03]
 [ 0.05  0.05  0.05  0.05]
 [ 0.02  0.02  0.02  0.02]]


In [2]:
# Batched version of the weighted sum: N samples, T time steps, H hidden units.
N, T, H = 10, 5, 4
hs = np.random.randn(N, T, H)
a = np.random.randn(N, T)
# Add a trailing axis to the weights and copy it H times -> shape (N, T, H).
ar = np.repeat(a[:, :, np.newaxis], H, axis=2)
# Element-wise product; the explicit repeat above stands in for broadcasting.
t = ar * hs
print(t.shape)
# (10, 5, 4)
# Sum over the time axis to get one vector of size H per sample.
c = t.sum(axis=1)
print(c.shape)

(10, 5, 4)
(10, 4)


In [3]:
class WeightSum:
    """Weighted sum of hidden states along the time axis.

    forward computes ``c[n, :] = sum_t a[n, t] * hs[n, t, :]`` and backward
    propagates a gradient on ``c`` back to ``hs`` and ``a``.
    The layer has no trainable parameters.
    """

    def __init__(self):
        self.params, self.grads = [], []
        # (hs, ar) saved by forward() for use in backward().
        self.cache = None

    def forward(self, hs, a):
        """Return the weighted sum of ``hs`` over its second axis.

        Args:
            hs: array of shape (N, T, H).
            a: array of weights that can be reshaped to (N, T, 1).

        Returns:
            Array of shape (N, H).
        """
        N, T, H = hs.shape

        # Copy each weight across the hidden dimension so it matches hs.
        ar = a.reshape(N, T, 1).repeat(H, axis=2)
        t = hs * ar
        c = np.sum(t, axis=1)

        # Must be stored under `cache`: backward() unpacks self.cache.
        self.cache = (hs, ar)
        return c

    def backward(self, ds):
        """Backpropagate through the weighted sum.

        Args:
            ds: upstream gradient with respect to the forward output,
                reshapeable to (N, 1, H).

        Returns:
            Tuple ``(dhs, da)`` with shapes (N, T, H) and (N, T).
        """
        hs, ar = self.cache
        N, T, H = hs.shape  # shape is an attribute, not a method

        # Backward of sum over axis 1: copy the gradient to every time step.
        dt = ds.reshape(N, 1, H).repeat(T, axis=1)
        dar = dt * hs
        dhs = dt * ar
        # Backward of repeat over axis 2: sum the copies back together.
        da = np.sum(dar, axis=2)

        return dhs, da

In [4]:
from common.layers import Softmax

# Attention-weight demo: score every hs[n, t, :] against the query h[n, :].
N, T, H = 10, 5, 4
hs = np.random.randn(N, T, H)
h = np.random.randn(N, H)
# Insert a length-1 time axis so h broadcasts against hs in the product below.
hr = h[:, np.newaxis, :]

t = hr * hs
print(t.shape)

# Summing the element-wise product over H gives one dot-product score per
# (sample, time step).
s = t.sum(axis=2)
print(s.shape)
# Pass the scores through the project's Softmax layer.
softmax = Softmax()
a = softmax.forward(s)
print(a.shape)

print(a)

(10, 5, 4)
(10, 5)
(10, 5)
[[ 0.22013462  0.00633999  0.04054231  0.13614591  0.59683717]
 [ 0.55330064  0.05689891  0.1432446   0.18080201  0.06575386]
 [ 0.09105111  0.08228367  0.79487112  0.02920897  0.00258513]
 [ 0.16240749  0.01373793  0.12651259  0.0565714   0.64077059]
 [ 0.28194275  0.10418348  0.19157523  0.29051811  0.13178043]
 [ 0.01171863  0.02437001  0.92569082  0.02133752  0.01688301]
 [ 0.00736686  0.23042797  0.00564731  0.22677427  0.52978358]
 [ 0.22467884  0.36200984  0.34397246  0.02423655  0.0451023 ]
 [ 0.84557276  0.03761131  0.02869949  0.01187723  0.0762392 ]
 [ 0.97313236  0.01205704  0.01099548  0.00194155  0.00187357]]


In [5]:
class AttentionWeight:
    """Turns dot-product scores between ``hs`` and ``h`` into weights.

    The scores are passed through ``self.softmax``; the layer itself holds
    no trainable parameters.
    """

    def __init__(self):
        self.params = []
        self.grads = []
        self.softmax = Softmax()
        # (hs, tiled h) saved by forward() for use in backward().
        self.cache = None

    def forward(self, hs, h):
        """Score each ``hs[n, t, :]`` against ``h[n, :]`` and apply softmax.

        Args:
            hs: array of shape (N, T, H).
            h: array reshapeable to (N, 1, H).

        Returns:
            Whatever ``self.softmax.forward`` returns for the (N, T) scores.
        """
        batch, steps, hidden = hs.shape

        # Copy h along the time axis so it lines up with hs element-wise.
        h_tiled = np.repeat(h.reshape(batch, 1, hidden), steps, axis=1)
        scores = (hs * h_tiled).sum(axis=2)
        weights = self.softmax.forward(scores)

        self.cache = (hs, h_tiled)
        return weights

    def backward(self, da):
        """Propagate the gradient on the weights back to ``hs`` and ``h``.

        Args:
            da: upstream gradient handed to ``self.softmax.backward``.

        Returns:
            Tuple ``(dhs, dh)`` with shapes (N, T, H) and (N, H).
        """
        hs, h_tiled = self.cache
        batch, steps, hidden = hs.shape

        dscores = self.softmax.backward(da)
        # Backward of the sum over H: copy each score gradient H times.
        dprod = np.repeat(dscores.reshape(batch, steps, 1), hidden, axis=2)
        dhs = dprod * h_tiled
        # Backward of the repeat over T: add up the per-step gradients of h.
        dh = (dprod * hs).sum(axis=1)

        return dhs, dh

In [6]:
class Attention:
    """Chains AttentionWeight and WeightSum into one attention layer.

    forward computes weights from (hs, h) and then the weighted sum of hs;
    backward runs the two sub-layers in reverse and adds the two gradient
    contributions that reach hs.
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.attention_weight_layer = AttentionWeight()
        self.weight_sum_layer = WeightSum()
        # Weights from the most recent forward() call, kept for inspection.
        self.attention_weight = None

    def forward(self, hs, h):
        """Return the weighted sum of ``hs`` using weights derived from ``h``.

        Args:
            hs: passed to both sub-layers.
            h: passed to the attention-weight sub-layer.

        Returns:
            Output of the weight-sum sub-layer.
        """
        a = self.attention_weight_layer.forward(hs, h)
        out = self.weight_sum_layer.forward(hs, a)
        self.attention_weight = a
        return out

    def backward(self, dout):
        """Backpropagate ``dout`` through both sub-layers.

        Args:
            dout: upstream gradient with respect to forward()'s output.

        Returns:
            Tuple ``(dhs, dh)``: gradients with respect to ``hs`` and ``h``.
        """
        # The attribute is `weight_sum_layer`; `weight_sum.layer` does not exist.
        dhs0, da = self.weight_sum_layer.backward(dout)
        dhs1, dh = self.attention_weight_layer.backward(da)
        # hs feeds both sub-layers, so its gradient is the sum of both paths.
        dhs = dhs0 + dhs1
        return dhs, dh

In [7]:
class TimeAttention:
    """Applies a separate Attention layer at every decoder time step."""

    def __init__(self):
        self.params, self.grads = [], []
        # Both lists are rebuilt on every forward() call.
        self.layers = None
        self.attention_weights = None

    def forward(self, hs_enc, hs_dec):
        """Run attention over ``hs_enc`` once per time step of ``hs_dec``.

        Args:
            hs_enc: passed unchanged to every per-step Attention layer.
            hs_dec: array of shape (N, T, H); slice ``[:, t, :]`` is the
                second argument of step t.

        Returns:
            Array with the same shape and dtype as ``hs_dec``.
        """
        N, T, H = hs_dec.shape
        out = np.empty_like(hs_dec)
        self.layers = []
        self.attention_weights = []

        for t in range(T):
            layer = Attention()
            out[:, t, :] = layer.forward(hs_enc, hs_dec[:, t, :])
            self.layers.append(layer)
            # Attention stores its weights under the singular
            # `attention_weight`; the plural name does not exist on it.
            self.attention_weights.append(layer.attention_weight)

        return out

    def backward(self, dout):
        """Backpropagate through every per-step Attention layer.

        Args:
            dout: array of shape (N, T, H), gradient of forward()'s output.

        Returns:
            Tuple ``(dhs_enc, dhs_dec)``: the sum over time steps of the
            gradients for ``hs_enc``, and an array shaped like ``dout``.
        """
        N, T, H = dout.shape
        # hs_enc is shared by all steps, so its gradient accumulates.
        dhs_enc = 0
        dhs_dec = np.empty_like(dout)

        for t in range(T):
            layer = self.layers[t]
            dhs, dh = layer.backward(dout[:, t, :])
            dhs_enc += dhs
            dhs_dec[:, t, :] = dh

        return dhs_enc, dhs_dec
    

In [10]:
from seq2seq import Encoder
class AttentionEncoder(Encoder):
    """Encoder variant whose forward() returns the LSTM layer's output as is.

    Relies on ``self.embed`` and ``self.lstm``, which are presumably created
    by ``Encoder.__init__`` (not visible here; confirm against seq2seq).
    """

    def forward(self, xs):
        """Embed ``xs`` and return the result of ``self.lstm.forward``."""
        embedded = self.embed.forward(xs)
        return self.lstm.forward(embedded)

    def backward(self, dhs):
        """Push ``dhs`` back through the LSTM layer, then the embedding."""
        d_embedded = self.lstm.backward(dhs)
        return self.embed.backward(d_embedded)
    

In [14]:
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
from attention_seq2seq import AttentionSeq2seq
from seq2seq import Seq2seq

# Load the data
(x_train, t_train), (x_test, t_test) = sequence.load_data('date.txt')
char_to_id, id_to_char = sequence.get_vocab()

# Reverse the input sequences
x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]

# Hyperparameters
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 256
batch_size = 128
max_epoch = 10
max_grad = 5.0

model = AttentionSeq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)
acc_list = []
for epoch in range(max_epoch):
    # Train for exactly one epoch, then evaluate on the whole test set.
    trainer.fit(x_train, t_train, max_epoch=1, batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(len(x_test)):
        question, correct = x_test[[i]], t_test[[i]]
        # Only print the first 10 examples of each epoch.
        verbose = i < 10
        correct_num += eval_seq2seq(model, question, correct, id_to_char, verbose, is_reverse=True)

    # Compute and record accuracy once per epoch, after all test samples have
    # been scored; doing it inside the loop appended one entry per sample.
    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    # The format string needs a placeholder, otherwise only 'val acc' prints.
    print('val acc {:.3f}%'.format(acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 4.08
| epoch 1 |  iter 21 / 351 | time 6[s] | loss 3.09
| epoch 1 |  iter 41 / 351 | time 13[s] | loss 1.90
| epoch 1 |  iter 61 / 351 | time 18[s] | loss 1.72
| epoch 1 |  iter 81 / 351 | time 25[s] | loss 1.46
| epoch 1 |  iter 101 / 351 | time 31[s] | loss 1.19
| epoch 1 |  iter 121 / 351 | time 36[s] | loss 1.14
| epoch 1 |  iter 141 / 351 | time 42[s] | loss 1.09
| epoch 1 |  iter 161 / 351 | time 48[s] | loss 1.06
| epoch 1 |  iter 181 / 351 | time 54[s] | loss 1.04
| epoch 1 |  iter 201 / 351 | time 60[s] | loss 1.03
| epoch 1 |  iter 221 / 351 | time 65[s] | loss 1.02
| epoch 1 |  iter 241 / 351 | time 71[s] | loss 1.02
| epoch 1 |  iter 261 / 351 | time 77[s] | loss 1.01
| epoch 1 |  iter 281 / 351 | time 83[s] | loss 1.00
| epoch 1 |  iter 301 / 351 | time 88[s] | loss 1.00
| epoch 1 |  iter 321 / 351 | time 94[s] | loss 1.00
| epoch 1 |  iter 341 / 351 | time 100[s] | loss 1.00
Q 10/15/94                     
T 1994-10-15
[91m☒[

val acc
| epoch 6 |  iter 1 / 351 | time 0[s] | loss 0.00
| epoch 6 |  iter 21 / 351 | time 6[s] | loss 0.00
| epoch 6 |  iter 41 / 351 | time 11[s] | loss 0.00
| epoch 6 |  iter 61 / 351 | time 17[s] | loss 0.00
| epoch 6 |  iter 81 / 351 | time 23[s] | loss 0.00
| epoch 6 |  iter 101 / 351 | time 29[s] | loss 0.00
| epoch 6 |  iter 121 / 351 | time 34[s] | loss 0.00
| epoch 6 |  iter 141 / 351 | time 40[s] | loss 0.00
| epoch 6 |  iter 161 / 351 | time 46[s] | loss 0.00
| epoch 6 |  iter 181 / 351 | time 52[s] | loss 0.00
| epoch 6 |  iter 201 / 351 | time 57[s] | loss 0.00
| epoch 6 |  iter 221 / 351 | time 63[s] | loss 0.00
| epoch 6 |  iter 241 / 351 | time 69[s] | loss 0.00
| epoch 6 |  iter 261 / 351 | time 75[s] | loss 0.00
| epoch 6 |  iter 281 / 351 | time 80[s] | loss 0.00
| epoch 6 |  iter 301 / 351 | time 86[s] | loss 0.00
| epoch 6 |  iter 321 / 351 | time 92[s] | loss 0.00
| epoch 6 |  iter 341 / 351 | time 98[s] | loss 0.00
Q 10/15/94                     
T 1994-10-15


val acc
