In [1]:
import numpy as np
from scipy.special import expit as sigmoid

In [2]:
def forget_gate(x, h, W_fh, B_fh, W_fx, B_fx, previous_cell_state):
    '''
    Scale the previous cell state (long-term memory, LTM) by a learned "keep" fraction.

    The new event `x` and the previous hidden state `h` are each passed through their
    own affine transform, summed, and squashed with a sigmoid. The result is a vector
    of values in (0, 1) that is multiplied element-wise with `previous_cell_state`,
    so each stored value is proportionally retained or forgotten.

    Expected shapes (as used by the rest of this notebook):
    - x:                   [input_size, 1]
    - h:                   [hidden_size, 1]
    - W_fh:                [hidden_size, hidden_size]
    - W_fx:                [hidden_size, input_size]
    - B_fh, B_fx:          [hidden_size, 1]
    - previous_cell_state: [hidden_size, 1]

    Returns the gated cell state, same shape as `previous_cell_state`.
    '''
    # Affine projections of the previous hidden state and of the new event; both land
    # in the hidden dimension so they can be added together.
    # (The two projections could equally be computed as one product over concatenated
    # matrices; addition of separate projections is the same computation.)
    pre_activation = (W_fh @ h + B_fh) + (W_fx @ x + B_fx)

    # Sigmoid maps the summed projections into (0, 1): the fraction of each
    # cell-state entry to keep.
    keep_fraction = sigmoid(pre_activation)

    # Element-wise product attenuates the stored long-term memory.
    return keep_fraction * previous_cell_state

In [4]:
def input_gate(x, h, W_ih, B_ih, W_ix, B_ix, W_lh, B_lh, W_lx, B_lx):
    '''
    Compute the new information to be added to the cell state (LTM).

    The input gate has 2 components, both [hidden_size, 1]:
    - the ignore matrix: sigmoid output in (0, 1), produced from the combined
      transformation of the input and the previous hidden state (same recipe as
      the forget gate);
    - the learnt matrix: tanh output in (-1, 1), produced from a second combined
      transformation of the input and the previous hidden state. The signed range
      lets it encode both positive and negative relationships.

    They are combined with an element-wise product, so only part of the newly
    learnt information is let through. The caller adds the result to the output
    of the forget gate to form the new cell state.

    Expected shapes:
    - x:                      [input_size, 1]
    - h:                      [hidden_size, 1]
    - W_ih, W_lh:             [hidden_size, hidden_size]
    - W_ix, W_lx:             [hidden_size, input_size]
    - B_ih, B_ix, B_lh, B_lx: [hidden_size, 1]

    Returns the gated candidate values, shape [hidden_size, 1].
    '''
    # "Ignore" branch: how much of each candidate value to admit.
    ignore_hidden = W_ih @ h + B_ih
    ignore_eventx = W_ix @ x + B_ix

    # "Learn" branch: the candidate values themselves. W_lx projects the event x
    # (not the hidden state), mirroring W_ix above.
    learn_hidden = W_lh @ h + B_lh
    learn_eventx = W_lx @ x + B_lx

    ignore_matrix = sigmoid(ignore_hidden + ignore_eventx)
    learn_matrix = np.tanh(learn_hidden + learn_eventx)

    # Element-wise gating of the candidates.
    selected_learn = np.multiply(ignore_matrix, learn_matrix)
    return selected_learn

In [5]:
def cell_state(forget_gate_output, input_gate_output):
    '''
    Combine the two gate outputs into the new cell state (LTM).

    The result is simply the sum of what survived the forget gate and what was
    admitted by the input gate.
    '''
    updated_state = forget_gate_output + input_gate_output
    return updated_state

In [7]:
def output_gate(x, h, w_oh, b_ob, w_ox, b_ox, cell_state):
    """
    Produce the carry-forward STM (the new hidden state), a.k.a. the prediction
    output for the current event.

    - The new event and the previous hidden state (STM) are each affinely
      transformed, summed, and passed through a sigmoid to give a selection
      matrix with values in (0, 1).
    - That matrix is multiplied element-wise with `cell_state` (LTM) to
      selectively retrieve what is stored there.

    Note: the cell state is gated as-is; no tanh is applied to it first.

    Parameters
    - x:          new event, [input_size, 1]
    - h:          previous hidden state, [hidden_size, 1]
    - w_oh:       [hidden_size, hidden_size] weights applied to `h`
    - b_ob:       [hidden_size, 1] bias added to the `h` projection
                  (name kept as-is so existing callers are unaffected)
    - w_ox:       [hidden_size, input_size] weights applied to `x`
    - b_ox:       [hidden_size, 1] bias added to the `x` projection
    - cell_state: [hidden_size, 1] cell state to read from

    Returns the gated cell state, shape [hidden_size, 1].
    """
    # The body must use the parameter actually declared in the signature (`b_ob`).
    out_hidden = w_oh @ h + b_ob
    out_eventx = w_ox @ x + b_ox
    out_matrix = sigmoid(out_hidden + out_eventx)
    selected_ltm = np.multiply(out_matrix, cell_state)
    return selected_ltm

In [None]:
from numpy.random import randn


class LSTM:
    '''
    Minimal forward-only LSTM built from the gate functions defined in this notebook
    (`forget_gate`, `input_gate`, `cell_state`, `output_gate`).

    Naming convention for parameters: `W<g><s>` / `B<g><s>` where
    - <g> is the component: f (forget), i (ignore/input), l (learn/candidate), o (output)
    - <s> is the source being transformed: h (previous hidden state) or x (new event)

    `Wyh` / `Byh` map the final hidden state to the output of size `output_size`.
    '''

    def __init__(self, input_size, output_size, hidden_size=64):
        '''
        Parameters
        - input_size:  length of each event vector x ([input_size, 1])
        - output_size: length of the prediction vector y ([output_size, 1])
        - hidden_size: length of the hidden state and cell state vectors
        '''
        self.hidden_size = hidden_size

        # Weights applied to the previous hidden state: [hidden_size, hidden_size].
        # Small random initial values (standard normal scaled by 1/1000).
        self.Wfh = randn(hidden_size, hidden_size) / 1000
        self.Wih = randn(hidden_size, hidden_size) / 1000
        self.Wlh = randn(hidden_size, hidden_size) / 1000
        self.Woh = randn(hidden_size, hidden_size) / 1000

        # Weights applied to the new event: [hidden_size, input_size].
        self.Wfx = randn(hidden_size, input_size) / 1000
        self.Wix = randn(hidden_size, input_size) / 1000
        self.Wlx = randn(hidden_size, input_size) / 1000
        self.Wox = randn(hidden_size, input_size) / 1000

        # Biases, one per weight matrix: [hidden_size, 1].
        self.Bfh = np.zeros((hidden_size, 1))
        self.Bih = np.zeros((hidden_size, 1))
        self.Blh = np.zeros((hidden_size, 1))
        self.Boh = np.zeros((hidden_size, 1))
        self.Bfx = np.zeros((hidden_size, 1))
        self.Bix = np.zeros((hidden_size, 1))
        self.Blx = np.zeros((hidden_size, 1))
        self.Box = np.zeros((hidden_size, 1))

        # Output projection: y = Wyh @ h + Byh, so Wyh must be
        # [output_size, hidden_size] and Byh [output_size, 1].
        self.Wyh = randn(output_size, hidden_size) / 1000
        self.Byh = np.zeros((output_size, 1))

    def Cell(self, previous_cell_state, previous_hidden_state, event_x):
        '''
        Run one time step.

        Parameters
        - previous_cell_state:   [hidden_size, 1]
        - previous_hidden_state: [hidden_size, 1]
        - event_x:               [input_size, 1]

        Returns `(new_hidden_state, new_cell_state)`, both [hidden_size, 1].
        '''
        h = previous_hidden_state
        x = event_x

        # What to keep from the previous long-term memory.
        selected_cell_state = forget_gate(
            x, h, self.Wfh, self.Bfh, self.Wfx, self.Bfx, previous_cell_state
        )
        # What to add from the current step.
        selected_learnt_input = input_gate(
            x, h,
            self.Wih, self.Bih, self.Wix, self.Bix,
            self.Wlh, self.Blh, self.Wlx, self.Blx,
        )
        new_cell_state = cell_state(selected_cell_state, selected_learnt_input)

        # The hidden state is read out of the *updated* cell state, so that what was
        # just learnt at this step can influence this step's output.
        new_hidden_state = output_gate(
            x, h, self.Woh, self.Boh, self.Wox, self.Box, new_cell_state
        )

        return new_hidden_state, new_cell_state

    def forward(self, inputs):
        '''
        Run the cell over a sequence and project the final hidden state to an output.

        Parameters
        - inputs: iterable of event vectors, each [input_size, 1]

        Returns `(y, h, c)`: the output [output_size, 1], the final hidden state
        and the final cell state (both [hidden_size, 1]). For an empty sequence,
        h and c are the zero initial states.
        '''
        h = np.zeros((self.hidden_size, 1))
        c = np.zeros((self.hidden_size, 1))

        for x in inputs:
            # Cell is a bound method: `self` is supplied implicitly.
            h, c = self.Cell(c, h, x)

        y = self.Wyh @ h + self.Byh

        return y, h, c

    def backprop(self):
        '''
        not implemented here as the derivative calculation can be lengthy
        '''
        pass

In [None]:
'''
In summary, there are two types of weights and biases of 4 sets
- weights/biases to transform previous_hidden_state of [hidden_size, hidden_size]  [hidden_size, 1]
- weights/biases to transform input [hidden_size, input_size], [hidden_size, 1]

Each set of weights has 4 components across 3 different gates, all performing 'Wgh @ h + Bgh' or 'Wgx @ x + Bgx'.
If combined in computation they form a [4*hidden_size, hidden_size] or [4*hidden_size, input_size] matrix
whose output is a [4*hidden_size, 1] matrix, which can then be split back into 4 [hidden_size, 1] matrices

In fact the two sets of weights can also be concatenated in computation to form a [4*hidden_size, hidden_size + input_size + 1] matrix
multiplied with a [hidden_size+input_size+1, 1] vector, outputting a [4*hidden_size, 1] matrix
which can be cut into 4 matrices and fed into sigmoid/tanh to produce the neuron output

p.s. let's say neuron is part of a cell here
in fact it would be better to call each nn components "knots" (combination of two neural nets)
to rename 'cell' 'pod' (group of cells/neurons/knots with a specific internal structure)

'''

In [None]:
# reference
#
# https://towardsdatascience.com/the-lstm-reference-card-6163ca98ae87
# https://blog.varunajayasiri.com/numpy_lstm.html












