In [None]:
import abc

In [None]:
import numpy as np

In [None]:
# Make NumPy raise FloatingPointError on every floating-point error category,
# except underflow, which only emits a RuntimeWarning.
np.seterr(all='raise', under='warn')

In [None]:
# Print floating-point array values with 20 digits of precision.
np.set_printoptions(precision=20)

In [None]:
from simple_nn.nn_layer import *
from simple_nn.activation_function import *
from simple_nn.gd_updater import *
from simple_nn.nn_model import *

In [None]:
class ActivationTanh(ActivationFunction):
    """Hyperbolic-tangent activation function."""

    def __init__(self):
        """Nothing to set up; the activation keeps no state."""

    @property
    def derivative_use_activated(self):
        """Report that ``derivative`` is written in terms of the activated output."""
        return True

    def apply(self, v):
        """Return the element-wise hyperbolic tangent of ``v``."""
        activated = np.tanh(v)
        return activated

    def derivative(self, v):
        """Return ``1 - v**2``.

        Because ``derivative_use_activated`` is True, ``v`` is expected to be
        a value already produced by ``apply`` (i.e. ``tanh(x)``), for which
        this expression is the derivative of tanh at ``x``.
        """
        squared = v ** 2
        return 1 - squared

In [None]:
class ActivationSoftmax(ActivationFunction):
    """Softmax activation, normalised along axis 0."""

    def __init__(self):
        """Init function. Do nothing."""
        pass

    @property
    def derivative_use_activated(self):
        """Report that ``derivative`` is written in terms of the activated output."""
        return True

    def apply(self, v):
        """Return the softmax of ``v`` taken along axis 0.

        The axis-0 maximum is subtracted before exponentiating. This leaves
        the result mathematically unchanged (the common factor cancels in the
        ratio) but keeps ``np.exp`` from overflowing on large inputs, which
        would otherwise raise when NumPy is configured to raise on
        floating-point errors.

        Args:
            v: Array-like of scores. For 2-D input every column is
                normalised independently, so each column sums to 1.

        Returns:
            Array with the same shape as ``v``.
        """
        values = np.asarray(v)
        if values.size == 0:
            # np.max has no identity for empty input; an empty softmax is
            # simply an empty float array of the same shape.
            return np.exp(values)
        shifted = values - np.max(values, axis=0)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0)

    def derivative(self, v):
        """Return ``v * (1 - v)`` for an already-activated ``v``.

        This is the element-wise (diagonal) term of the softmax Jacobian; the
        cross terms between different outputs are not included.
        """
        return v * (1 - v)

In [None]:
class RNNLayer(NNLayer):
    """Recurrent layer whose input at each step is the step's features joined
    with the layer's own output from the previous step.

    A single weight matrix covers both parts: its first rows multiply the
    external features and its last ``node_count`` rows multiply the previous
    hidden state (this follows the concatenation order used in ``forward``).
    """

    # NOTE: the default ``activation`` instance is created once, when the
    # class is defined, and is shared by every layer relying on the default.
    # That is harmless as long as ActivationTanh keeps no state.
    def __init__(self, node_count, bptt_truncate=float("inf"), has_bias=True, activation=ActivationTanh()):
        """Store the layer configuration; weights are created later by ``init``.

        Args:
            node_count: Number of units, i.e. the width of the hidden state.
            bptt_truncate: Maximum number of time lags used by
                ``back_propagation`` (1 means only the direct gradient).
            has_bias: Forwarded to the weight initialiser and used to decide
                whether ``update`` touches the bias.
            activation: Activation object providing ``apply``, ``derivative``
                and ``derivative_use_activated``.
        """
        self.layer_id = None
        self.node_count = node_count
        self.has_bias = has_bias
        self.activation = activation
        self.gd_updater = None
        self.weights = None
        self.bias = None
        # Values cached by forward(learning=True) for back_propagation.
        self.step_input = None
        self.reduced_sum = None
        self.step_output = None
        # Gradients produced by back_propagation and consumed by update.
        self.weight_grads = None
        self.bias_grads = None
        self.hidden_state = np.zeros(self.node_count)
        self.bptt_truncate = bptt_truncate

    def init(self, input_count, gd_updater):
        """Create the parameters and register this layer with the updater.

        Args:
            input_count: Width of the external input of a single step.
            gd_updater: Gradient-descent updater that will later turn
                gradients into parameter deltas.
        """
        self.layer_id = id(self)
        self.gd_updater = gd_updater
        # The fan-in covers the external input plus the fed-back hidden state.
        # Assumes the initialiser returns weights shaped
        # (input_count + node_count, node_count); the products in forward
        # rely on that -- TODO confirm against NNLayer.xavier_weight_init.
        self.weights, self.bias = self.xavier_weight_init(
            input_count + self.node_count, self.node_count, self.has_bias
        )
        self.gd_updater.register_layer(self.layer_id, self.weights.shape, self.bias.shape)

    def forward(self, step_input, learning=True, reset_state=True):
        """Run the layer over a sequence, one step at a time.

        Args:
            step_input: Sequence of per-step input vectors, all of the same
                length.
            learning: When True, cache the joined inputs, pre-activations and
                outputs so ``back_propagation`` can use them.
            reset_state: When True, start from an all-zero hidden state;
                otherwise continue from the state left by the previous call.

        Returns:
            Array of shape (number of steps, node_count) with the activated
            output of every step.
        """
        if reset_state:
            self.hidden_state = np.zeros(self.node_count)

        step_count = len(step_input)
        input_width = len(step_input[0]) if step_count else 0
        actual_input = np.zeros((step_count, input_width + self.node_count))
        reduced_sum = np.zeros((step_count, self.node_count))
        step_output = np.zeros((step_count, self.node_count))

        for idx, single_input in enumerate(step_input):
            # Both pieces are 1-D vectors, so they are joined end to end.
            actual_input[idx] = np.concatenate((single_input, self.hidden_state))
            reduced_sum[idx] = np.dot(actual_input[idx], self.weights) + self.bias
            step_output[idx] = self.activation.apply(reduced_sum[idx])
            # Feed this step's output (not the still-empty last row) to the
            # next step; copy so the state does not alias the returned array.
            self.hidden_state = step_output[idx].copy()

        if learning:
            self.step_input = actual_input
            self.reduced_sum = reduced_sum
            self.step_output = step_output
        return step_output

    def back_propagation(self, prev_delta, activation_derivatived=False):
        """Back-propagate through time over the sequence cached by ``forward``.

        The computation proceeds lag by lag. At lag 0 the delta of every step
        comes straight from ``prev_delta``. At lag ``k`` the delta held for
        step ``t`` is the error of step ``t + k`` pushed back ``k`` steps
        through the recurrent part of the weights. At most ``bptt_truncate``
        lags (and never more than the sequence length) are processed.

        Args:
            prev_delta: Error signal for every step's output, shape
                (number of steps, node_count). It is not modified.
            activation_derivatived: True when ``prev_delta`` already includes
                this layer's activation derivative.

        Returns:
            Array of shape (number of steps, input width) holding the error
            signal with respect to the external input of every step.

        Raises:
            RuntimeError: If ``forward`` has not been run with
                ``learning=True`` beforehand.
        """
        if self.step_input is None or self.step_output is None:
            raise RuntimeError("forward(learning=True) must be called before back_propagation")

        step_input = self.step_input
        total_steps = len(self.step_output)
        # Columns of the cached input are [external input | previous hidden state].
        input_width = step_input.shape[1] - self.node_count

        if total_steps == 0:
            self.weight_grads = np.zeros(self.weights.shape)
            self.bias_grads = np.zeros(self.node_count)
            return np.zeros((0, input_width))

        # Activation derivative of every step, evaluated on whichever quantity
        # the activation says it expects.
        if self.activation.derivative_use_activated:
            local_grad = self.activation.derivative(self.step_output)
        else:
            local_grad = self.activation.derivative(self.reduced_sum)

        incoming = np.asarray(prev_delta, dtype=float)
        cur_delta = incoming if activation_derivatived else incoming * local_grad

        # bptt_truncate may be float("inf") or a finite float; range needs an
        # int, and at least the direct (lag 0) gradient is always computed.
        lag_count = max(1, int(min(self.bptt_truncate, total_steps)))

        weight_grads = np.zeros(self.weights.shape)
        bias_grads = np.zeros(self.node_count)
        input_delta = np.zeros((total_steps, input_width))
        term_count = 0  # number of (step, lag) pairs accumulated so far

        for lag in range(lag_count):
            # At this lag only the first `rows` steps still receive a delta.
            rows = total_steps - lag

            weight_grads += np.dot(step_input[:rows].T, cur_delta)
            bias_grads += cur_delta.sum(axis=0)
            term_count += rows

            # Error with respect to the joined input of each remaining step.
            back = np.dot(cur_delta, self.weights.T)

            # NOTE(review): the contributions of the different lags are
            # combined with a running mean, as the original code intended.
            # Whether a plain sum should be used instead is still open.
            input_delta[:rows] = (input_delta[:rows] * lag + back[:, :input_width]) / (lag + 1)

            # The hidden-state part of row t is the error on the output of
            # step t - 1; step 0 has no predecessor, so its row is dropped.
            cur_delta = back[1:, input_width:] * local_grad[:rows - 1]

        # NOTE(review): gradients are averaged over all accumulated
        # (step, lag) pairs; mean versus sum is still to be confirmed.
        self.weight_grads = weight_grads / term_count
        self.bias_grads = bias_grads / term_count
        return input_delta

    def update(self):
        """Apply the updater's deltas for the most recent gradients.

        The weights are always updated; the bias only when ``has_bias`` is
        set.
        """
        weight_deltas, bias_deltas = \
            self.gd_updater.apply(self.layer_id, self.weight_grads, self.bias_grads)
        self.weights -= weight_deltas
        if self.has_bias:
            self.bias -= bias_deltas