# スクラッチによるSimpleRNNの実装

In [11]:
def grad_check(forward, backvalue):

#params forward: forward function
#params backward: gradient function

    epsilon = 0.0001
    gradient_checker = (forward(x + epsilon) -forward(x - epsilon)) / (2 * epsilon)
    diff = np.abs(backvalue - gradient_checker)

    print(diff)

In [12]:
import numpy as np

class MultiplyGate:
    def __init__(self):
        
        
    def forward(self,W, x):
        return np.dot(W, x)

    def backward(self, W, x, dz):
        dW = np.asarray(np.dot(dz, x.T))
        dx = np.dot(np.transpose(W), dz)
        return dW, dx

class AddGate:
    def forward(self, x1, x2):
        return x1 + x2

    def backward(self, x1, x2, dz):
        dx1 = dz * np.ones_like(x1)
        dx2 = dz * np.ones_like(x2)
        return dx1, dx2

class AdaGate:
    def forward(self,x1, x2):
        return x1 * x2
    
    def backward(self, x1, x2, dz):
        dx1 = dz * x2
        dx2 = dz * x1
        return dx1, dx2

In [13]:
class Softmax:
    def predict(self, x):
        exp_scores = np.exp(x)
        return exp_scores / np.sum(exp_scores,axis=0)

    def loss(self, x, y):
        probs = self.predict(x)
        return -np.log(probs[y])

    def diff(self, x, y):
        probs = self.predict(x)
        probs[y] -= 1.0
        return probs

In [14]:
class Tanh:
    def forward(self, x):
        return np.tanh(x)

    def backward(self, x, top_diff):
        output = self.forward(x)
        return (1.0 - np.square(output)) * top_diff

In [18]:
mulGate = MultiplyGate()
addGate = AddGate()
activation = Tanh()

class L:
    def forward(self, x, prev_s, U, W, V):
        self.mulu = mulGate.forward(U, x)
        self.mulw = mulGate.forward(W, prev_s)
        self.add = addGate.forward(self.mulw, self.mulu)
        self.s = activation.forward(self.add)  
        return self.add, self.s
    
    def backward(self, y, x, A, S, V, W, pred, num):
        
        diffs = pred - y
        # dV
        dV = np.dot(diffs , S[-1].T)
        # dSt
        pre_dSt = np.dot(V.T, diffs)
        dU = 0
        dW = 0
        start = x.shape[1]-1
        for i in range(start,start-num,-1):
            #dAi
            tanh = Tanh()
            #print((1.0-np.square(A[i][5,6]))*pre_dSt[5,6]) 直接計算と、tanh backwardでは小数点以下８桁目あたりからずれることがある
            dA = tanh.backward(A[i], pre_dSt)
            #dU
            dU += np.dot(dA, x[:,i,:])
            #dW
            dW += np.dot(dA, S[i-1].T)
            # dSi-1
            pre_dSt = np.dot(W.T, dA)      
            
        return dV, dU, dW        

In [27]:
from datetime import datetime
import sys

class RNN:
    def __init__(self, word_dim, hidden_dim=100, truncate=4, batchsize=32, optimizer = 'sgd'):
        self.optimizer = optimizer
        self.batchsize = batchsize
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.truncate = truncate
        self.U = np.random.uniform(-np.sqrt(1. / word_dim), np.sqrt(1. / word_dim), (hidden_dim, word_dim))
        self.W = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (hidden_dim, hidden_dim))
        self.V = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (2, hidden_dim))
        if self.optimizer == 'adam':
#             self.U = np.random.randn(hidden_dim, word_dim)
#             self.W = np.random.randn(hidden_dim, hidden_dim)
#             self.V = np.random.randn(2, hidden_dim)
            self.m_U = np.zeros_like(self.U)
            self.v_U = np.zeros_like(self.U)
            self.m_W = np.zeros_like(self.W)
            self.v_W = np.zeros_like(self.W)
            self.m_V = np.zeros_like(self.V)
            self.v_V = np.zeros_like(self.V)
            self.beta1 = 0.9
            self.beta2 = 0.999
            self.adam_lr = 0.001
#         else:
#             None
        self.optimizer = optimizer

    def forward_propagation(self, x):   
        T = x.shape[1]
        self.all_A = []
        self.all_S = []
        prev_s = np.zeros((hidden_dim, x.shape[0]))
        A = np.zeros((hidden_dim, x.shape[0]))
        S = np.zeros((hidden_dim, x.shape[0]))
        epsilon = 0.0001
        dU_c = 0
        dW_c = 0
        dV_c = 0
        for t in range(T):
            layer = L()
            A, S = layer.forward(x[:,t,:].T, prev_s, self.U, self.W, self.V)
            
#             if (T-self.truncate)<=t:
#                 #grad_caheck
#                  #dU
#                 dU_c += (layer.forward(x[:,t,:].T, prev_s, self.U+epsilon, self.W, self.V) - \
#                         layer.forward(x[:,t,:].T, prev_s, self.U-epsilon, self.W, self.V)) / (2 * epsilon)
#                 #dW
#                 dW_c += (layer.forward(x[:,t,:].T, prev_s, self.U, self.W+epsilon, self.V) - \
#                         layer.forward(x[:,t,:].T, prev_s, self.U, self.W-epsilon, self.V)) / (2 * epsilon)
            
            prev_s = S
            self.all_A.append(A)
            self.all_S.append(S)     
#         #dV
#         dV_c = (layer.forward(x[:,T-1,:].T, prev_s, self.U, self.W, self.V+epsilon) - \
#                 layer.forward(x[:,T-1,:].T, prev_s, self.U, self.W, self.V-epsilon)) / (2 * epsilon)
        #print("dU_c,dW_c,dV_c",dU_c,dW_c,dV_c)
        return np.dot(self.V, S)
            
    def loss_and_acc(self, x_train, y_train, x_test, y_test):
        #forward
        train_out = self.forward_propagation(x_train)
        test_out = self.forward_propagation(x_test) 
        #predict
        Soft = Softmax()
        train_predict = Soft.predict(train_out)
        test_predict = Soft.predict(test_out)
        #acc
        #0.５以上の方を採用する
        train_predict2 = train_predict.copy()
        test_predict2 = test_predict.copy()
        train_predict2[train_predict2<0.5]=0
        test_predict2[test_predict2<0.5]=0
        train_predict2[0.5<=train_predict2]=1
        test_predict2[0.5<=test_predict2]=1
        self.train_acc = (train_predict2 * y_train).sum() / y_train.shape[1]
        self.test_acc = (test_predict2 * y_test).sum() / y_test.shape[1]
        #loss
        train_log = train_predict * y_train # 正解側の確率だけ残す
        self.train_loss = -np.log(train_log[np.where(1e-9<train_log)] +1e-8).sum() / y_train.shape[1]
        test_log = test_predict * y_test # 正解側の確率だけ残す    
        self.test_loss = -np.log(test_log[np.where(1e-9<test_log)] +1e-8).sum() / y_test.shape[1]
    

    def update(self, dV, dU, dW, learning_rate):
        if self.optimizer == 'sgd':
            self.U -= learning_rate * dU 
            self.V -= learning_rate * dV 
            self.W -= learning_rate * dW 
            
        elif self.optimizer == 'adam':
            
            self.m_U = self.beta1 * self.m_U + (1- self.beta1) * dU
            self.v_U = self.beta2 * self.v_U + (1- self.beta2) * (dU * dU)
            m_hat_U = self.m_U / (1 - self.beta1)
            v_hat_U = self.v_U / (1 - self.beta2)
            self.U -= self.adam_lr * m_hat_U / (np.sqrt(v_hat_U) + 1e-8)
            
            self.m_V = self.beta1 * self.m_V + (1- self.beta1) * dV
            self.v_V = self.beta2 * self.v_V + (1- self.beta2) * (dV * dV)
            m_hat_V = self.m_V / (1 - self.beta1)
            v_hat_V = self.v_V / (1 - self.beta2)
            self.V -= self.adam_lr * m_hat_V / (np.sqrt(v_hat_V) + 1e-8)
            
            self.m_W = self.beta1 * self.m_W + (1- self.beta1) * dW
            self.v_W = self.beta2 * self.v_W + (1- self.beta2) * (dW * dW)
            m_hat_W = self.m_W / (1 - self.beta1)
            v_hat_W = self.v_W / (1 - self.beta2)
            self.W -= self.adam_lr * m_hat_W / (np.sqrt(v_hat_W) + 1e-8) 
            
        else:
            None          

    def trains(self, Xtrain, Ytrain, Xtest, Ytest, learning_rate=0.005, nepoch=100):
        iteration = len(Ytrain) // self.batchsize
        y_hot = np.zeros((2,self.batchsize))  
        pred1 = np.zeros((2,self.batchsize))
        #Yrain ,Ytestをone-hot化しておく
        Ytrain_hot = np.identity(2)[Ytrain].T
        Ytest_hot = np.identity(2)[Ytest].T
        Soft = Softmax()
        #print("epoch",nepoch)
        #print("iteration",iteration)
        for epoch in range(nepoch):
            for itr in range(iteration):
                #print(itr)
                start = itr*batchsize
                x_batch = Xtrain[start:start+self.batchsize]
                y_batch = Ytrain[start:start+self.batchsize]
                y_hot = np.identity(2)[y_batch].T# (2, batchsize) 0行目が０ユニット用、1行目が１ユニット用
                # forward
                output = self.forward_propagation(x_batch)
                #predict
                pred1 = Soft.predict(output)
                layers = L()
                dV, dU, dW = layers.backward(y_hot, x_batch, self.all_A, self.all_S, self.V,  self.W, pred1, self.truncate)
                
                #update
                self.update(dV, dU, dW, learning_rate)
            
            # loss
            if (epoch==0) or ((epoch % 10) == 9):   
                self.loss_and_acc(Xtrain, Ytrain_hot, Xtest, Ytest_hot)
                print("*"*50)
                print("epoch",epoch+1)
                print("train : loss {:.4} ,acc {:.4}".format(self.train_loss, self.train_acc))
                print("test : loss {:.4} ,acc {:.4}".format(self.test_loss, self.test_acc))
                #print("output 0ユニット、１ユニット",output[0,9],output[1,9])
                #print("pred1 0ユニット、１ユニット",pred1[0,9],pred1[1,9])
                #print(self.V)
        

## 学習

In [32]:
import numpy as np
from keras.datasets import imdb

word_dim = 128
hidden_dim = 32
bptt_truncate = 4
batchsize = 16
train_size =10000
test_size=5000
learning_rate=0.001
nepoch=100
np.random.seed(10) #シード固定

# データ
x_train, y_train, x_test, y_test=load_imdb(max_features = 10000, maxlen = 40)
#モデル構築
rnn = RNN(word_dim, hidden_dim ,bptt_truncate, batchsize, optimizer='adam')
#訓練・検証実施
rnn.trains(x_train[:train_size], y_train[:train_size],x_test[:test_size], y_test[:test_size], learning_rate, nepoch)

load dataset
padding
build model
**************************************************
epoch 1
train : loss 0.6511 ,acc 0.6209
test : loss 0.6596 ,acc 0.6016
**************************************************
epoch 10
train : loss 0.6059 ,acc 0.6683
test : loss 0.6575 ,acc 0.6194
**************************************************
epoch 20
train : loss 0.6026 ,acc 0.675
test : loss 0.6633 ,acc 0.617
**************************************************
epoch 30
train : loss 0.6029 ,acc 0.6745
test : loss 0.6667 ,acc 0.6134
**************************************************
epoch 40
train : loss 0.6039 ,acc 0.6737
test : loss 0.6687 ,acc 0.6102
**************************************************
epoch 50
train : loss 0.6047 ,acc 0.6729
test : loss 0.6697 ,acc 0.6082
**************************************************
epoch 60
train : loss 0.6053 ,acc 0.6731
test : loss 0.6701 ,acc 0.6074
**************************************************
epoch 70
train : loss 0.6056 ,acc 0.6723
test : loss 0.670

### 検証データのaccuracyは約61%