In [2]:
import numpy as np

In [3]:
class Tanh:
    """Hyperbolic-tangent activation with its derivative."""

    def forward(self, weight_input):
        """Return tanh of ``weight_input`` (scalar or NumPy array).

        ``np.tanh`` is used instead of the closed form
        ``2 / (1 + exp(-2x)) - 1`` because the latter overflows inside
        ``exp`` for large negative inputs.
        """
        return np.tanh(weight_input)

    def backward(self, output):
        """Return the derivative of tanh expressed via its output: 1 - y^2."""
        return 1 - output * output

In [4]:
class Sigmoid:
    """Logistic sigmoid activation with its derivative."""

    def forward(self, weight_input):
        """Return the logistic sigmoid of ``weight_input`` (scalar or NumPy array).

        Uses the identity ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))``; the direct
        form ``1 / (1 + exp(-x))`` overflows inside ``exp`` for large negative
        inputs.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * weight_input))

    def backward(self, output):
        """Return the sigmoid derivative expressed via its output: y * (1 - y)."""
        return output * (1 - output)

![LSTM](assert/lstm.png)

In [17]:
# Scratch cell: quick check of the NumPy vector operations used by the LSTM code.
a = np.array([
    [1, 2],
    [4, 5],
    [3, 6],
])
b = np.array([1, 2, 3])
print(b.dot(a))   # (3,) vector times (3, 2) matrix -> (2,) vector
np.append(b, b)   # joins the two vectors end to end; not the last expression, so not displayed
b ** 2            # element-wise square of b; last expression, shown as the cell output


[18 30]


array([1, 4, 9])

In [26]:
# Scratch cell: column vectors, transposes and row-wise concatenation.
a = np.array([1, 2, 3]).reshape(-1, 1)   # 3x1 column vector
print(a)
print(a.T)                                # transposed into a 1x3 row
b = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
b.dot([1, 2, 3])            # matrix-vector product; not the last expression, so not displayed
np.concatenate([b, b])      # default axis=0 stacks the rows -> shape (6, 3)

[[1]
 [2]
 [3]]
[[1 2 3]]


array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9],
       [1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [27]:
class LSTMUnit:
    """LSTM layer with forward pass, back-propagation through time and SGD update.

    Every gate (forget ``f``, input ``i``, output ``o``, candidate ``c``) owns
    a weight matrix of shape ``(hidden_dim + input_dim, hidden_dim)`` and a
    bias of shape ``(hidden_dim,)``.  The first ``hidden_dim`` rows of a weight
    matrix multiply the previous hidden state, the remaining ``input_dim``
    rows multiply the current input.

    Per-step activations are kept in lists: index 0 holds an all-zero initial
    entry and index ``t`` holds the value produced by the ``t``-th call to
    ``forward`` since the last ``reset_state``.
    """

    def __init__(self, input_dim, hidden_dim, learning_rate):
        """Create randomly initialised gates and empty activation histories.

        Args:
            input_dim: length of each input vector.
            hidden_dim: length of the hidden and cell state vectors.
            learning_rate: step size used by ``update``.
        """
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate
        self.sigmoid = Sigmoid()
        self.tanh = Tanh()
        self.f_weight, self.f_bias = self.init_weight_bias()
        self.i_weight, self.i_bias = self.init_weight_bias()
        self.c_weight, self.c_bias = self.init_weight_bias()
        self.o_weight, self.o_bias = self.init_weight_bias()
        self.f_output = self.init_state_vec()
        self.i_output = self.init_state_vec()
        self.c_output = self.init_state_vec()
        self.o_output = self.init_state_vec()
        self.ct_output = self.init_state_vec()
        self.h_output = self.init_state_vec()
        # Number of forward steps taken since construction / reset_state().
        self.times = 0

    def init_weight_bias(self):
        """Return a small uniform-random weight matrix and a zero bias for one gate."""
        weight = np.random.uniform(
            -1e-4, 1e-4, (self.hidden_dim + self.input_dim, self.hidden_dim)
        )
        bias = np.zeros(self.hidden_dim)
        return weight, bias

    def init_state_vec(self):
        """Return a fresh history list holding only the all-zero entry for step 0."""
        return [np.zeros(self.hidden_dim)]

    def forward(self, x):
        """Advance one time step with input ``x`` and record every activation."""
        self.times += 1
        fg = self.calc_gate(x, self.f_weight, self.f_bias, self.sigmoid)
        self.f_output.append(fg)
        ig = self.calc_gate(x, self.i_weight, self.i_bias, self.sigmoid)
        self.i_output.append(ig)
        og = self.calc_gate(x, self.o_weight, self.o_bias, self.sigmoid)
        self.o_output.append(og)
        ct = self.calc_gate(x, self.c_weight, self.c_bias, self.tanh)
        # The candidate state goes into its own history; calc_delta_k reads it
        # back as ct_output[k], and o_output must hold exactly one entry per step.
        self.ct_output.append(ct)
        c = fg * self.c_output[self.times - 1] + ig * ct
        self.c_output.append(c)
        h = og * self.tanh.forward(c)
        self.h_output.append(h)

    def calc_gate(self, x, weight, bias, activator):
        """Return ``activator.forward([h_prev, x] . weight + bias)`` for one gate."""
        h_prev = self.h_output[self.times - 1]
        # Hidden state first, input second: this is the row layout that
        # calc_delta_k (weight[:hidden_dim]) and update
        # (concatenate((h_grad, x_grad))) rely on.
        gate_input = np.append(h_prev, x)
        net = np.dot(gate_input, weight) + bias
        return activator.forward(net)

    def backward(self, x, delta_h, activator):
        """Compute all deltas and weight/bias gradients for the recorded sequence.

        Args:
            x: input vector of the most recent forward step.
            delta_h: error term with respect to the last hidden state.
            activator: forwarded to ``calc_delta``, which does not use it.
        """
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def calc_delta(self, delta_h, activator):
        """Fill the per-step delta lists, walking from the last step back to step 1.

        ``activator`` is accepted for interface compatibility and is not used.
        """
        self.delta_h_list = self.init_delta()
        self.delta_o_list = self.init_delta()
        self.delta_i_list = self.init_delta()
        self.delta_f_list = self.init_delta()
        self.delta_ct_list = self.init_delta()
        self.delta_h_list[-1] = delta_h
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        """Return ``times + 1`` zero vectors, one per history index."""
        return [np.zeros(self.hidden_dim) for _ in range(self.times + 1)]

    def calc_delta_k(self, k):
        """Compute the gate deltas of step ``k`` and the hidden-state delta of step ``k - 1``."""
        ig = self.i_output[k]
        og = self.o_output[k]
        fg = self.f_output[k]
        ct = self.ct_output[k]
        c = self.c_output[k]
        c_prev = self.c_output[k - 1]
        tanh_c = self.tanh.forward(c)
        delta_k = self.delta_h_list[k]

        # Error reaching the cell state through h = og * tanh(c).
        delta_c = delta_k * og * (1 - tanh_c * tanh_c)
        delta_o = delta_k * tanh_c * self.sigmoid.backward(og)
        delta_f = delta_c * c_prev * self.sigmoid.backward(fg)
        delta_i = delta_c * ct * self.sigmoid.backward(ig)
        delta_ct = delta_c * ig * self.tanh.backward(ct)

        # NOTE(review): only the hidden-state path is propagated to step k-1;
        # the dependence of c_k on c_{k-1} through the forget gate is not
        # carried back. Confirm whether this simplification is intended.
        hidden = self.hidden_dim
        delta_h_prev = (
            np.dot(self.o_weight[:hidden], delta_o)
            + np.dot(self.i_weight[:hidden], delta_i)
            + np.dot(self.f_weight[:hidden], delta_f)
            + np.dot(self.c_weight[:hidden], delta_ct)
        )

        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        """Accumulate hidden-weight and bias gradients over all steps.

        The input-weight gradients are computed from the last step only,
        because ``x`` (the last step's input) is the only input available here.
        """
        self.fh_grad, self.fx_grad, self.fb_grad = self.init_weight_bias_gradient()
        self.ih_grad, self.ix_grad, self.ib_grad = self.init_weight_bias_gradient()
        self.oh_grad, self.ox_grad, self.ob_grad = self.init_weight_bias_gradient()
        self.ch_grad, self.cx_grad, self.cb_grad = self.init_weight_bias_gradient()
        for t in range(self.times, 0, -1):
            (fh_grad, fb_grad, ih_grad, ib_grad,
             oh_grad, ob_grad, ch_grad, cb_grad) = self.calc_gradient_t(t)
            self.fh_grad += fh_grad
            self.fb_grad += fb_grad
            self.ih_grad += ih_grad
            self.ib_grad += ib_grad
            self.oh_grad += oh_grad
            self.ob_grad += ob_grad
            self.ch_grad += ch_grad
            self.cb_grad += cb_grad
        # Outer products give (input_dim, hidden_dim), matching the x rows of
        # each weight matrix (net = input . weight, so dW[i, j] = x[i] * delta[j]).
        self.fx_grad = np.outer(x, self.delta_f_list[-1])
        self.ix_grad = np.outer(x, self.delta_i_list[-1])
        self.ox_grad = np.outer(x, self.delta_o_list[-1])
        self.cx_grad = np.outer(x, self.delta_ct_list[-1])

    def init_weight_bias_gradient(self):
        """Return zeroed (hidden-weight, input-weight, bias) gradient buffers."""
        h_grad = np.zeros((self.hidden_dim, self.hidden_dim))
        x_grad = np.zeros((self.input_dim, self.hidden_dim))
        b_grad = np.zeros(self.hidden_dim)
        return h_grad, x_grad, b_grad

    def calc_gradient_t(self, t):
        """Return the hidden-weight and bias gradients contributed by step ``t``.

        Each hidden-weight gradient is the outer product of the previous
        hidden state with the gate delta, shape ``(hidden_dim, hidden_dim)``.
        """
        h_prev = self.h_output[t - 1]
        fh_grad = np.outer(h_prev, self.delta_f_list[t])
        fb_grad = self.delta_f_list[t]
        ih_grad = np.outer(h_prev, self.delta_i_list[t])
        ib_grad = self.delta_i_list[t]
        oh_grad = np.outer(h_prev, self.delta_o_list[t])
        ob_grad = self.delta_o_list[t]
        ch_grad = np.outer(h_prev, self.delta_ct_list[t])
        cb_grad = self.delta_ct_list[t]
        return fh_grad, fb_grad, ih_grad, ib_grad, oh_grad, ob_grad, ch_grad, cb_grad

    def update(self):
        """Apply one gradient-descent step to every gate's weight and bias in place."""
        lr = self.learning_rate
        # Hidden rows first, input rows second, matching the weight layout.
        self.f_weight -= lr * np.concatenate((self.fh_grad, self.fx_grad), axis=0)
        self.f_bias -= lr * self.fb_grad
        self.i_weight -= lr * np.concatenate((self.ih_grad, self.ix_grad), axis=0)
        self.i_bias -= lr * self.ib_grad
        self.o_weight -= lr * np.concatenate((self.oh_grad, self.ox_grad), axis=0)
        self.o_bias -= lr * self.ob_grad
        self.c_weight -= lr * np.concatenate((self.ch_grad, self.cx_grad), axis=0)
        self.c_bias -= lr * self.cb_grad

    def reset_state(self):
        """Forget the recorded sequence: zero the step counter and all histories."""
        self.times = 0
        self.c_output = self.init_state_vec()
        self.f_output = self.init_state_vec()
        self.h_output = self.init_state_vec()
        self.i_output = self.init_state_vec()
        self.o_output = self.init_state_vec()
        self.ct_output = self.init_state_vec()