# LSTM FORWARD PROPAGATION CODE WITH EQUATIONS

Credits:<br>
https://datascience-enthusiast.com/DL/Building_a_Recurrent_Neural_Network-Step_by_Step_v1.html

![LSTM Cell](./images/lstm_cell.png)
![LSTM Cell Forward](./images/lstm_cell_forward.png)

# LSTM CELL FORWARD PROPAGATION FOR ONE CELL

In [45]:
import numpy as np
import torch
import math
np.random.seed(1)

In [46]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [47]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Impliments a single forward step of LSTM-cell as described in the fig above
    
    Arguments:
    xt -- input data at timesteps "t", numpy array of shape(n_x,m)
    a_prev -- hidden state at timestep "t-1", numpy array of shape(n_a,m)
    c_prev -- Memory state at timestep "t-1", numpy array of shape(n_a,m)
    
    parameters -- python dict containing:
                    Wf -- Weights matrix of the forgate gate, numpy array of shape(n_a,n_a+n_x)
                    bf -- Bias of the forgate gate, numpy array of shape (n_a,1)
                    Wi -- Weights matrix of the update gate, numpy array of shape(n_a,n_a+n_x)
                    bi -- Bias of the update gate, numpy array of shape (n_a,1)
                    Wc -- Weights matrix of the first "tanh", numpy array of shape(n_a,n_a+n_x)
                    bc -- Bias of the first "tanh", numpy array of shape (n_a,1)
                    Wo -- Weights matrix of the output gate, numpy array of shape(n_a,n_a+n_x)
                    bo -- Bias of the output gate, numpy array of shape (n_a,1)
                    Wf -- Weights matrix relating the hidden state to the output, numpy array of shape(n_y,n_a)
                    bf -- Bias relating the hidden state to the output, numpy array of shape (n_y,1)
    Returns:
    a_next -- next hidden state, of shape(n_a,m)
    c_next -- next memory state, of shape (n_a,m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y,m)
    cache -- tuple of values needed for the backward pass, contains (a_next, c_next, a_prev, c_prev, xt, parameters)
    
    Note: ft/it/ot stand for the forget/update/output gates, cct stands for the candidate value (c tilde),
          c stands for the memory value
    """
    # Retrieve parameters from "parameters"
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]

    # Retrieve dimensions from shapes of xt and Wy
    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    ### MAIN CODE HERE ###
    # concatenate a_prev and xt
    concat = np.zeros((n_a + n_x, m))
    concat[:n_a,:] = a_prev
    concat[n_a:,:] = xt
    
    # Compute values for ft, it, cct, c_next, ot, a_next using the formulas shown above
    ft = sigmoid(np.dot(Wf,concat) +bf)
    it = sigmoid(np.dot(Wi,concat) +bi)
    cct = np.tanh(np.dot(Wo,concat) +bc)
    c_next = ft*c_prev + it * cct
    ot = sigmoid(np.dot(Wf,concat) +bf)
    a_next = ot * np.tanh(c_next)
    
    # Compute the prediction of the LSTM cell
    yt_pred = torch.nn.Softmax(np.dot(Wy,a_next) +by)
    yt_pred =  yt_pred.dim
    ### MAIN ENDS HERE ###
    
    # store values needed for backward propagation in cache
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache

In [48]:
m = 10 # number_of_example
input_feature_size = 3 # each word could be represented to a vector of size 3
hidden_size = 5 # hidden layer dimensions
output_size = 2 # how many output you want


xt = np.random.randn(input_feature_size,m)
a_prev = np.random.randn(hidden_size,m)
c_prev = np.random.randn(hidden_size,m)
Wf = np.random.randn(hidden_size,input_feature_size+hidden_size)
bf = np.random.randn(hidden_size,1)
Wi = np.random.randn(hidden_size,input_feature_size+hidden_size)
bi = np.random.randn(hidden_size,1)
Wo = np.random.randn(hidden_size,input_feature_size+hidden_size)
bo = np.random.randn(hidden_size,1)
Wc = np.random.randn(hidden_size,input_feature_size+hidden_size)
bc = np.random.randn(hidden_size,1)
Wy = np.random.randn(output_size,hidden_size)
by = np.random.randn(output_size,1)

parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", c_next.shape)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] =", yt[1])
print("yt.shape = ", yt.shape)
print("cache[1][3] =", cache[1][3])
print("len(cache) = ", len(cache))


a_next[4] =  [ 6.50250148e-01 -2.74012030e-04 -1.74184734e-04  3.14518225e-02
 -5.77828015e-01 -3.23325131e-02  4.35000746e-01  1.98681487e-01
 -5.99909235e-01  6.14818200e-01]
a_next.shape =  (5, 10)
c_next[2] =  [ 0.57207884  1.35755352  0.44439288  0.14919283 -1.54046493  0.57356263
  0.01422994 -1.78940624 -0.97897515 -0.55744859]
c_next.shape =  (5, 10)
yt[1] = [-0.97273119 -0.96198061 -1.58100467 -1.441182    0.34621894 -0.36082323
 -1.07411669 -0.393978    0.04028089 -1.47979389]
yt.shape =  (2, 10)
cache[1][3] = [-0.17802135  0.10394883  0.58336635  0.46341066  0.01029736 -0.83420601
 -0.07906531 -0.34425587 -0.25242088  1.1182707 ]
len(cache) =  10


# LSTM FULL FORWARD PROPAGATION FULL

![LSTM Cell](./images/lstm_unroll.png)

In [49]:
def lstm_forward(x,a0,parameters):
    """
    Impliments forward propagation of LSTM-cell unrolled in time as described in the fig above
    
    Arguments:
    x -- Input data for all the timesteps, numpy array of shape(n_x,m,T_x)
    a0 -- Initial hidden state, of shape(n_a,m)
    
    parameters -- python dict containing:
                    Wf -- Weights matrix of the forgate gate, numpy array of shape(n_a,n_a+n_x)
                    bf -- Bias of the forgate gate, numpy array of shape (n_a,1)
                    Wi -- Weights matrix of the update gate, numpy array of shape(n_a,n_a+n_x)
                    bi -- Bias of the update gate, numpy array of shape (n_a,1)
                    Wc -- Weights matrix of the first "tanh", numpy array of shape(n_a,n_a+n_x)
                    bc -- Bias of the first "tanh", numpy array of shape (n_a,1)
                    Wo -- Weights matrix of the output gate, numpy array of shape(n_a,n_a+n_x)
                    bo -- Bias of the output gate, numpy array of shape (n_a,1)
                    Wf -- Weights matrix relating the hidden state to the output, numpy array of shape(n_y,n_a)
                    bf -- Bias relating the hidden state to the output, numpy array of shape (n_y,1)
    Returns:
    a -- hidden state for every timestep, of shape(n_a,m,T_x)
    y -- Predictions for every time-step numpy array of shape (n_a,m,T_x)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y,m)
    cache -- tuple of values needed for the backward pass, contains (list of all caches,x)
    
    """
    
    # Initialize "caches" which will track the list of all the caches
    caches = []
    
    ### MAIN CODE HERE ###
    # Retrieve dimensions from shapes of x and Wy
    n_x,m,T_x = x.shape
    n_y,n_a = parameters["Wy"].shape
    
    # Initialize "a", "c" and "y" with zeros
    a = np.zeros((n_a,m,T_x))
    c = a
    y = np.zeros((n_y,m,T_x))
    
    # Initialize a_next and c_next 
    a_next = a0 # initial state of hidden vector
    c_next = np.zeros(a_next.shape) # initial state of context vector
    
    # Loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, next memory state, compute the prediction, get the cahce
        a_next, c_next, yt, cache = lstm_cell_forward(x[:,:,t],a_next,c_next,parameters)
        # save the values of the new "next" hidden state in a
        a[:,:,t] = a_next
        # save the values of pred
        y[:,:,t] = yt
        # save the values of the next cell state
        c[:,:,t] = c_next
        # Append the cache inot caches
        caches.append(cache)
    ### MAIN ENDS HERE ###
    
    #store values needed for the back propagation in cache
    cahces = (caches,x)
    
    return a,y,c,cahces

In [50]:
m = 10 # number_of_example
input_feature_size = 3 # each word could be represented to a vector of size 3
hidden_size = 5 # hidden layer dimensions
output_size = 2 # how many output you want
time_step_size = 7 # time steps


x = np.random.randn(input_feature_size,m,time_step_size)
a0 = np.random.randn(hidden_size,m)
Wf = np.random.randn(hidden_size,input_feature_size+hidden_size)
bf = np.random.randn(hidden_size,1)
Wi = np.random.randn(hidden_size,input_feature_size+hidden_size)
bi = np.random.randn(hidden_size,1)
Wo = np.random.randn(hidden_size,input_feature_size+hidden_size)
bo = np.random.randn(hidden_size,1)
Wc = np.random.randn(hidden_size,input_feature_size+hidden_size)
bc = np.random.randn(hidden_size,1)
Wy = np.random.randn(output_size,hidden_size)
by = np.random.randn(output_size,1)

parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
print("caches[1][1[1]] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
print("len(caches) = ", len(caches))

a[4][3][6] =  0.21437420411361274
a.shape =  (5, 10, 7)
y[1][4][3] = -0.2015587769196316
y.shape =  (2, 10, 7)
caches[1][1[1]] = [ 2.11060505 -1.30653407  0.07638048  0.36723181  1.23289919 -0.42285696
  0.08646441]
c[1][2][1] 0.04794387049025145
len(caches) =  2
