<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [2]:
import numpy as np
from dataset import sequence

# Load the 'date.txt' dataset through the project's `sequence` module; the call
# returns two pairs, unpacked here as train and test splits.
# NOTE(review): x_* / t_* are presumably inputs and targets — confirm in dataset.sequence.
(x_train, t_train), (x_test, t_test) = sequence.load_data('date.txt')
# Vocabulary lookup tables returned by the same module.
# NOTE(review): presumably char -> id and id -> char mappings — confirm in dataset.sequence.
char_to_id, id_to_char = sequence.get_vocab()

In [3]:
# Time Key Layer = embedding layer + key layer
from common.time_layers import TimeEmbedding
from common.time_layers import TimeAffine


class TimeKeyLayer:
    """Key branch: a TimeEmbedding followed by a TimeAffine projection."""

    def __init__(self, V, D, d):
        """
        Args:
            V: Vocabulary size
            D: Embedding layer (word2vec) dimension
            d: Key dimension
        """
        # Parameters: both weights come from np.random.rand (embedding table
        # is sampled first, then the key projection); the bias starts at zero.
        embed_weight = np.random.rand(V, D)
        key_weight = np.random.rand(D, d)
        key_bias = np.zeros(d)

        # Sub-layers, stored in forward order.
        self.layers = [
            TimeEmbedding(embed_weight),
            TimeAffine(key_weight, key_bias),
        ]

        # Gather every sub-layer's parameters and gradients into flat lists.
        self.params = [param for sub in self.layers for param in sub.params]
        self.grads = [grad for sub in self.layers for grad in sub.grads]

    def forward(self, xs):
        """Feed xs through each sub-layer in order and return the last output."""
        out = xs
        for sub in self.layers:
            out = sub.forward(out)
        return out

    def backward(self, dout):
        """Feed dout through the sub-layers in reverse order and return the result."""
        grad = dout
        for sub in self.layers[::-1]:
            grad = sub.backward(grad)
        return grad

In [4]:
# Time Query layer
from common.time_layers import TimeAffine

class TimeQueryLayer:
    """Query branch: a single TimeAffine applied to one-hot encoded inputs."""

    def __init__(self, V, d):
        """
        Args:
            V: Vocabulary size
            d: Query dimension
        """
        self.V = V
        # Parameters: weight from np.random.rand, zero bias.
        query_weight = np.random.rand(V, d)
        query_bias = np.zeros(d)
        # The one wrapped layer (kept under the attribute name `layers`).
        affine = TimeAffine(query_weight, query_bias)
        self.layers = affine
        # Expose the wrapped layer's own parameter / gradient lists.
        self.params = affine.params
        self.grads = affine.grads

    def forward(self, xs):
        """Project xs; a 2-D xs is one-hot encoded (V classes) before the projection."""
        needs_one_hot = xs.ndim == 2
        if needs_one_hot:
            # Label encoding -> one-hot encoding via identity-matrix row lookup.
            one_hot_table = np.eye(self.V)
            xs = one_hot_table[xs]
        return self.layers.forward(xs)

    def backward(self, dout):
        """Hand dout to the wrapped layer's backward and return what it returns."""
        return self.layers.backward(dout)
    
    
class TimeValueLayer:
    """Value branch: a single TimeAffine applied to one-hot encoded inputs."""

    def __init__(self, V, d):
        """
        Args:
            V: Vocabulary size
            d: Value dimension
        """
        self.V = V
        # Parameters: weight from np.random.rand, zero bias.
        value_weight = np.random.rand(V, d)
        value_bias = np.zeros(d)
        # The one wrapped layer (kept under the attribute name `layers`).
        affine = TimeAffine(value_weight, value_bias)
        self.layers = affine
        # Expose the wrapped layer's own parameter / gradient lists.
        self.params = affine.params
        self.grads = affine.grads

    def forward(self, xs):
        """Project xs; a 2-D xs is one-hot encoded (V classes) before the projection."""
        needs_one_hot = xs.ndim == 2
        if needs_one_hot:
            # Label encoding -> one-hot encoding via identity-matrix row lookup.
            one_hot_table = np.eye(self.V)
            xs = one_hot_table[xs]
        return self.layers.forward(xs)

    def backward(self, dout):
        """Hand dout to the wrapped layer's backward and return what it returns."""
        return self.layers.backward(dout)

In [5]:
# Key, Query를 입력으로 받는 Compatibility Function 수행
from common.functions import softmax

class CompatibilityFunction:
    """Scaled dot-product compatibility between Queries and Keys.

    forward computes softmax(Q @ K^T / sqrt(d)) over the key axis (last axis);
    backward pushes the gradient of that score into the Query and Key branches.
    """

    def __init__(self, V, D, d):
        """
        Args:
            V: Vocabulary size
            D: Embedding layer (word2vec) dimension
            d: Query / Key dimension
        """
        self.d = d
        # Embedding layer + Key layer
        self.time_key_layer = TimeKeyLayer(V, D, d)
        # Query layer
        self.time_query_layer = TimeQueryLayer(V, d)
        # Collect parameters and gradients per branch (new lists, same arrays).
        self.key_params = list(self.time_key_layer.params)
        self.key_grads = list(self.time_key_layer.grads)
        self.query_params = list(self.time_query_layer.params)
        self.query_grads = list(self.time_query_layer.grads)

    @staticmethod
    def _softmax_last_axis(x):
        """Numerically stable softmax over the last axis of x."""
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp = np.exp(shifted)
        return exp / np.sum(exp, axis=-1, keepdims=True)

    def forward(self, xs):
        """Return the attention score for xs.

        Args:
            xs: input passed unchanged to both the Query and the Key branch.

        Returns:
            score: softmax over the last axis of Q @ K^T / sqrt(d), where Q and K
                are the rank-3 outputs of the Query and Key branches.
        """
        out_query = self.time_query_layer.forward(xs)
        out_key = self.time_key_layer.forward(xs)
        out_query_T = np.transpose(out_query, (0, 2, 1))
        out_key_T = np.transpose(out_key, (0, 2, 1))

        # Take the scaling dimension from the Key tensor itself instead of a
        # notebook-level global, so the layer works wherever it is used.
        scale = np.sqrt(out_key.shape[-1])
        out = np.matmul(out_query, out_key_T) / scale
        # Normalise explicitly over the key axis so forward always matches the
        # axis=2 reduction that backward applies.
        score = self._softmax_last_axis(out)

        # Cache everything backward needs.
        self.out_query, self.out_key = out_query, out_key
        self.out_query_T = out_query_T
        self.scale = scale
        self.out, self.score = out, score
        return score

    def backward(self, dscore):
        """Back-propagate dscore into the Key and Query branches.

        Args:
            dscore: gradient with respect to the score returned by forward.

        Returns:
            None. The gradients are handed to the branch layers' backward methods.
        """
        # Softmax backward (softmax was taken over the last axis).
        dx = self.score * dscore
        dx -= self.score * np.sum(dx, axis=2, keepdims=True)
        # Scaling backward: forward divided by a constant, so the gradient is
        # divided by that same constant.
        dx /= self.scale
        # Matrix-product backward for out = Q @ K^T:
        #   dQ = dx @ K      and      dK = dx^T @ Q   (both shaped like Q / K).
        dquery = np.matmul(dx, self.out_key)
        dkey = np.matmul(np.transpose(dx, (0, 2, 1)), self.out_query)

        # Key layer
        self.time_key_layer.backward(dkey)
        # Query layer
        self.time_query_layer.backward(dquery)
        return

In [6]:
# Weighted Sum 계층
class WeightedValue:
    """Weighted sum of Values: Z = score @ value_layer(xs)."""

    def __init__(self, V, d):
        """
        Args:
            V: Vocabulary size
            d: Value dimension
        """
        # Value layer
        self.time_value_layer = TimeValueLayer(V, d)
        # Collect parameters and gradients as flat lists, like the other layers
        # in this notebook (new lists that share the underlying arrays).
        self.params, self.grads = [], []
        self.params += self.time_value_layer.params
        self.grads += self.time_value_layer.grads

    def forward(self, xs, score):
        """Apply the Value projection to xs and mix the result with score.

        Args:
            xs: input sequence, passed to the Value layer.
            score: weights produced by the compatibility function; indexed as
                score[b, t1, t2] = weight of position t2 for output position t1.

        Returns:
            Z: for each batch b and position t1, sum over t2 of
                score[b, t1, t2] * value[b, t2, :].
        """
        # Linear projection of the Values first.
        out_value = self.time_value_layer.forward(xs)

        # The weighted sum over t2 is exactly a batched matrix product.
        Z = np.matmul(score, out_value)

        # Cache what backward needs. self.grads is deliberately left untouched
        # so it stays aligned one-to-one with self.params across calls.
        self.out_value_T = np.transpose(out_value, (0, 2, 1))
        self.out_value = out_value
        self.score = score

        return Z

    def backward(self, dZ):
        """Back-propagate dZ through the weighted sum.

        Args:
            dZ: gradient with respect to Z, same shape as the forward output.

        Returns:
            (dscore, dvalue): gradient with respect to score (to be fed to the
            compatibility function) and with respect to the projected Values.
            dvalue is also passed to the Value layer's backward so that layer's
            parameter gradients get computed.
        """
        # Z = score @ value  =>  dscore = dZ @ value^T
        dscore = np.matmul(dZ, self.out_value_T)
        # Z = score @ value  =>  dvalue = score^T @ dZ
        dvalue = np.matmul(np.transpose(self.score, (0, 2, 1)), dZ)
        # Push the gradient into the Value projection.
        self.time_value_layer.backward(dvalue)
        return dscore, dvalue

In [8]:
# Smoke test: one forward and one backward pass through the two attention
# pieces on a 10-sample slice, then print the resulting shapes.
V = len(char_to_id)  # vocabulary size
D = 512  # embedding dimension
d = 64  # dimension passed to both CompatibilityFunction and WeightedValue
xs = x_train[:10]  # first 10 training samples

# Compatibility Function forward pass
comp_func = CompatibilityFunction(V, D, d)
score = comp_func.forward(xs)

# Weighted Sum forward pass
weight_func = WeightedValue(V ,d)
Z = weight_func.forward(xs, score)

############################################################

# Weighted Sum backward pass
dZ = np.ones_like(Z)  # placeholder upstream gradient (all ones)
dscore, dvalue = weight_func.backward(dZ)

# Compatibility Function backward pass
_ = comp_func.backward(dscore)


print('score shape:', score.shape)
print('dscore shape:', dscore.shape)
print()
# NOTE(review): the label says 'value' but this prints the shape of the input batch xs.
print('value shape:', xs.shape)
print('dvalue shape:', dvalue.shape)
print()
print('Z shape:', Z.shape)
print('dZ shape(현재는 임의로 생성):', dZ.shape)

score shape: (10, 29, 29)
dscore shape: (10, 29, 29)

value shape: (10, 29)
dvalue shape: (10, 29, 64)

Z shape: (10, 29, 64)
dZ shape(현재는 임의로 생성): (10, 29, 64)


In [None]:
# 계층 클래스를 만들 때, init 파라미터로 Weight, Bias를 넣도록 수정하자.. 그래야 param, grads 기록하기가 편할듯!? 밑시딥 책처럼..