https://github.com/dennybritz/rnn-tutorial-rnnlm/blob/master/RNNLM.ipynb
The link above contains an RNN implementation that uses only NumPy arrays.

As far as I understand, it implements an Elman RNN without bias terms.
Elman link: https://en.wikipedia.org/wiki/Recurrent_neural_network

In [1]:
import numpy as np
import itertools
import operator
from datetime import datetime
import sys

# Vocabulary size; passed as word_dim when the example RNNNumpy is constructed.
vocabulary_size = 8_000


class RNNNumpy():
    """Minimal recurrent network (no bias terms) with a NumPy forward pass.

    Attributes:
        word_dim: Size of the vocabulary (number of columns of U, rows of V).
        hidden_dim: Size of the hidden state vector.
        bptt_truncate: Stored on the instance; not used by the forward pass.
        U: Input-to-hidden weights, shape (hidden_dim, word_dim).
        V: Hidden-to-output weights, shape (word_dim, hidden_dim).
        W: Hidden-to-hidden weights, shape (hidden_dim, hidden_dim).
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the dimensions and randomly initialise U, V and W."""
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Each matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n
        # is the size of the vector that the matrix multiplies.
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

    def softmax(self, x):
        """Return the softmax of x, normalised over all of its elements.

        The maximum is subtracted before exponentiating so np.exp cannot
        overflow; this does not change the result.
        """
        xt = np.exp(x - np.max(x))
        return xt / np.sum(xt)

    def forward_propagation(self, x, num_steps=5):
        """Run the network over the first ``num_steps`` entries of ``x``.

        Args:
            x: Indexable sequence of word indices; ``int(x[t])`` selects a
                column of U, which is equivalent to multiplying U by a
                one-hot vector. Must provide at least ``num_steps`` entries.
            num_steps: Number of time steps to unroll. Defaults to 5, the
                value that used to be hard-coded.

        Returns:
            A list ``[o, s]`` where ``o`` has shape (num_steps, word_dim) and
            holds the softmax output of every step, and ``s`` has shape
            (num_steps + 1, hidden_dim) and holds the hidden state of every
            step; the extra last row is the all-zero initial state.
        """
        T = num_steps
        # s[t] = tanh(U[:, x_t] + W . s[t-1]). The array has one extra row so
        # that s[-1], read at t == 0, is the zero initial state.
        s = np.zeros((T + 1, self.hidden_dim))
        # Output distribution of every time step, kept for later use.
        o = np.zeros((T, self.word_dim))
        for t in range(T):
            s[t] = np.tanh(self.U[:, int(x[t])] + self.W.dot(s[t - 1]))
            o[t] = self.softmax(self.V.dot(s[t]))
        return [o, s]

# Example network instance used by the forward-propagation demo.
fake_net = RNNNumpy(word_dim=vocabulary_size, hidden_dim=100, bptt_truncate=4)

## Example forward prop

In [6]:
# Example forward pass through fake_net; prints the shapes of both returned arrays.
# NOTE(review): forward_propagation uses int(x[t]) as a column index into U, so
# the first five entries of this one-hot vector are read as the word indices
# 0, 0, 1, 0, 0. Presumably a sequence of word indices was intended -- confirm.
some_hot_encoded_input = np.zeros(vocabulary_size)
some_hot_encoded_input[2] = 1
result = fake_net.forward_propagation(some_hot_encoded_input)
print(*(part.shape for part in result))

(5, 8000) (6, 100)


# LSTM implementation (forward pass only)

In [15]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def dsigmoid(y):
    """Return y * (1 - y): the sigmoid derivative in terms of its output y."""
    complement = 1 - y
    return y * complement


def tanh(x):
    """Return the hyperbolic tangent of x (thin wrapper around np.tanh)."""
    activated = np.tanh(x)
    return activated


def dtanh(y):
    """Return 1 - y**2: the tanh derivative in terms of its output y."""
    squared = y * y
    return 1 - squared
# Hyperparameters shared by the LSTM code below.
X_size = 1299  # Length of the input column vector x (also the size of the output layer)
H_size = 100  # Hidden layer size
T_steps = 25  # Sequence length (number of time steps) used for training
learning_rate = 0.1  # Learning rate
weight_sd = 0.1  # Standard deviation used when initialising the weights
z_size = X_size + H_size  # Length of the concatenate(H, X) vector

In [16]:
class Param:
    """A named parameter array together with two zero buffers of the same shape."""

    def __init__(self, name, value):
        # Two independent zero arrays matching value's shape and dtype.
        gradient, momentum = np.zeros_like(value), np.zeros_like(value)
        self.name = name
        self.v = value  # parameter value (stored by reference, not copied)
        self.d = gradient  # derivative
        self.m = momentum  # momentum for AdaGrad

In [19]:
class Parameters:
    """Weights and biases of a single LSTM cell plus its output layer.

    Every weight and bias is wrapped in a ``Param`` so that ``forward`` can
    read the value through ``.v``. Sizes come from the module-level
    ``H_size``, ``X_size`` and ``z_size`` constants.
    """

    def __init__(self):
        """Randomly initialise all weights and set every bias to zero."""
        # The gate weights (f, i, o) are shifted by +0.5 after scaling by
        # weight_sd; the candidate and output-layer weights are not.
        self.W_f = Param('W_f', np.random.randn(H_size, z_size) * weight_sd + 0.5)
        self.b_f = Param('b_f', np.zeros((H_size, 1)))

        self.W_i = Param('W_i', np.random.randn(H_size, z_size) * weight_sd + 0.5)
        self.b_i = Param('b_i', np.zeros((H_size, 1)))

        self.W_C = Param('W_C', np.random.randn(H_size, z_size) * weight_sd)
        self.b_C = Param('b_C', np.zeros((H_size, 1)))

        self.W_o = Param('W_o', np.random.randn(H_size, z_size) * weight_sd + 0.5)
        self.b_o = Param('b_o', np.zeros((H_size, 1)))

        # For final layer to predict the next character
        self.W_v = Param('W_v', np.random.randn(X_size, H_size) * weight_sd)
        self.b_v = Param('b_v', np.zeros((X_size, 1)))

    def forward(self, x, h_prev, C_prev):
        """Compute one LSTM time step followed by the softmax output layer.

        Args:
            x: Input column vector of shape (X_size, 1).
            h_prev: Previous hidden state of shape (H_size, 1).
            C_prev: Previous cell state of shape (H_size, 1).

        Returns:
            The tuple ``(z, f, i, C_bar, C, o, h, v, y)``: the stacked input
            ``z = [h_prev; x]``, forget gate, input gate, candidate cell
            state, new cell state, output gate, new hidden state, output
            logits and their softmax.

        Raises:
            AssertionError: If an argument does not have the expected shape.
        """
        assert x.shape == (X_size, 1)
        assert h_prev.shape == (H_size, 1)
        assert C_prev.shape == (H_size, 1)

        # np.vstack replaces np.row_stack, an alias deprecated in NumPy 2.0.
        z = np.vstack((h_prev, x))
        f = sigmoid(np.dot(self.W_f.v, z) + self.b_f.v)
        i = sigmoid(np.dot(self.W_i.v, z) + self.b_i.v)
        C_bar = tanh(np.dot(self.W_C.v, z) + self.b_C.v)

        C = f * C_prev + i * C_bar
        o = sigmoid(np.dot(self.W_o.v, z) + self.b_o.v)
        h = o * tanh(C)

        v = np.dot(self.W_v.v, h) + self.b_v.v
        # Softmax; subtracting the maximum keeps np.exp from overflowing and
        # leaves the result mathematically unchanged.
        exp_v = np.exp(v - np.max(v))
        y = exp_v / np.sum(exp_v)

        return z, f, i, C_bar, C, o, h, v, y
    
# Module-level Parameters instance; constructing it draws fresh random weights.
parameters = Parameters()