In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Data Generation

In [17]:
fibo = [1, 1]
for _ in range(5):
    fibo.append(fibo[-1] + fibo[-2])
print(fibo)

[1, 1, 2, 3, 5, 8, 13]


In [18]:
raw_data = pd.DataFrame(fibo, columns=["y"])
raw_data

Unnamed: 0,y
0,1
1,1
2,2
3,3
4,5
5,8
6,13


In [19]:
for i in range(1, 4):
    raw_data["x_" + str(i)] = raw_data["y"].shift(i)

data = raw_data.dropna(axis=0)

In [20]:
raw_data

Unnamed: 0,y,x_1,x_2,x_3
0,1,,,
1,1,1.0,,
2,2,1.0,1.0,
3,3,2.0,1.0,1.0
4,5,3.0,2.0,1.0
5,8,5.0,3.0,2.0
6,13,8.0,5.0,3.0


In [21]:
data

Unnamed: 0,y,x_1,x_2,x_3
3,3,2.0,1.0,1.0
4,5,3.0,2.0,1.0
5,8,5.0,3.0,2.0
6,13,8.0,5.0,3.0


In [22]:
X, y = data.iloc[:, 1:].values, data.iloc[:, 0].values
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X=X, y=y)
X_train

array([[0.        , 0.        , 0.        ],
       [0.16666667, 0.25      , 0.        ],
       [0.5       , 0.5       , 0.5       ],
       [1.        , 1.        , 1.        ]])

# RNN

\begin{equation}
h_t = g\left(Wx_t + Uh_{t-1}\right)
\end{equation}

where $x_t$ is the (external) m-dimensional input vector at time $t$
, $ℎ_{t-1}$ the n-dimensional hidden state, $g$ is the (point-wise)
activation function and $W, U$ are the appropriately sized
weight matrices (for simplicity we omit the bias-vectors). 
Specifically, in this case, $W$ is an $n \times m$ matrix and $U$ is an $n \times n$ matrix.
(cp Dey/Salem)

Additionally, for the net to make sense for our problem, we will add a fully connected output layer $f(h_t)$:

\begin{equation}
f(h_t) = w^\prime h_t, 
\end{equation}

where $w$ is a $n \times 1$ vector of weights. Also, for our intents and purposes, we will use the $L_2$ norm as our loss function.

\begin{equation}
loss = \frac{1}{2}\left(y - \hat{y}\right)^\prime \left(y - \hat{y}\right).
\end{equation}

## forward pass

* $n$ number of neurons
* $m$ dimension of $x_t$

In [23]:
n = 3
m = 1

W = np.random.normal(size=(n, m))
U = np.random.normal(size=(n, n))
w = np.random.normal(size=(n, 1))

N, M = data.shape

In [8]:

def forward_RNN(W, U):
    for i in range(N):
        hidden = []
        h_ = 0
        for j in range(M):
            x_t = X_train[i, j]
            h_ = np.tanh(W.dot(x_t) + U.dot(h_))
            hidden.append(h_)

    # Transform back:
    y_ = h_ * (89 - 1) + 1
    y_

    return y_, hidden


y_, hidden = forward_RNN(W, U)
print(y_)

[ 1.          1.0289721   0.81056544  0.62113218  0.21330691 -0.38388743
 -1.38858041 -2.98903666 -5.5878424 ]


In [12]:
hidden = []
h_ = 0

for i in range(3):
    x_t = X_train[:, i].reshape(-1, 1)
    h_ = np.tanh(W.dot(x_t) + U.dot(h_))
    hidden.append(h_)

ValueError: shapes (3,1) and (9,1) not aligned: 1 (dim 1) != 9 (dim 0)

In [16]:
W.dot(x_t.reshape(-1, 1))

ValueError: shapes (3,1) and (9,1) not aligned: 1 (dim 1) != 9 (dim 0)

## backward pass

In [9]:

def backward_RNN(y_, hidden, W, U):
    dLdy_ = -(y - y_)
    dy_dh = 88
    dh_dh_1 = 1

    # Backpropagation through time
    for i in range(3):
        x_t = X_train[:, i]
        dh_dh_1 *= (1-np.tanh(W.dot(x_t + U.dot([hidden[i]])))**2).dot(U)

    dhdW = (1-np.tanh(W.dot(X[:, 0]))**2).dot(X[:, 0])
    dhdU = (1-np.tanh(W.dot(X[:, 0]))**2).dot(hidden[0])

    # Gradients W
    grad_W = (dLdy_ * dy_dh * dh_dh_1 * dhdW).sum()

    # Gradients U
    grad_U = (dLdy_ * dy_dh * dh_dh_1 * dhdU).sum()

    # Gradient Descent
    W = W - 0.001 * grad_W
    U = U - 0.001 * grad_U

    return W, U


W, U = backward_RNN(y_, hidden, W, U)

# LSTM
## forward pass

# GRU