Inspiration: https://towardsdatascience.com/recurrent-neural-networks-rnns-3f06d7653a85

# Imports and globals

In [1]:
import numpy as np

# Basic functions

In [2]:
def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def sigmoid_derivative(sigmoid_output):
    return sigmoid_output * (1 - sigmoid_output)

def softmax_naive(x):
    return np.exp(x) / np.sum(np.exp(x))

def softmax_stable(x):
    p = np.exp(x - np.max(x))
    return p / np.sum(p)

softmax = softmax_stable

softmax(np.array([range(3)]))

array([[0.09003057, 0.24472847, 0.66524096]])

In [20]:
class RNN:
    def __init__(self, input_dim:int, hidden_dim:int, output_dim:int):
        # x, h, and y are all column vectors
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.Wxh = np.random.uniform(0, 1, (hidden_dim, input_dim))
        self.Whh = np.random.uniform(0, 1, (hidden_dim, hidden_dim))
        self.Why = np.random.uniform(0, 1, (output_dim, hidden_dim))
        self.bh = np.random.uniform(0, 1, self.h_shape)
        self.by = np.random.uniform(0, 1, self.y_shape)
        self.params = (self.Wxh, self.Whh, self.Why, self.bh, self.by)

        # For rprop:
        self.dparams_prev = tuple(np.ones_like(param) for param in self.params)
        self.step_multiplers = tuple(np.ones_like(param) for param in self.params)

        self.reset_history()

    @property
    def x_shape(self): return (self.input_dim, 1)
    
    @property
    def h_shape(self): return (self.hidden_dim, 1)

    @property
    def y_shape(self): return (self.output_dim, 1)

    def reset_history(self):
        h = np.zeros(self.h_shape)
        self.h_history = [h]
        self.y_history = []

    @property
    def seq_length(self):
        return len(self.y_history)

    @property
    def h(self):
        return self.h_history[-1]

    @h.setter
    def h(self, value):
        assert value.shape == self.h_shape, (value.shape, self.h_shape)
        self.h_history.append(value)

    @property
    def y(self):
        return self.y_history[-1]

    @y.setter
    def y(self, value):
        assert value.shape == (self.output_dim, 1), (value.shape, self.y_shape)
        self.y_history.append(value)

    def forward_one_step(self, x):
        assert x.shape == self.x_shape, (x.shape, self.x_shape)
        z = np.dot(self.Wxh, x) + np.dot(self.Whh, self.h) + self.bh
        self.h = np.tanh(z)
        o = np.dot(self.Why, self.h) + self.by
        self.y = softmax(o)
        return self.y

    def forward(self, xs):
        return [self.forward_one_step(x) for x in xs]

    def predict_one_step(self, x):
        return np.argmax(self.forward_one_step(x))

    def predict(self, xs):
        return [self.predict_one_step(x) for x in xs]

    def loss(self, targets):
        return sum(-np.log(self.y_history[t][targets[t], 0]) for t in range(self.seq_length))

    def backward(self, xs, targets):
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)
        dparams = (dWxh, dWhh, dWhy, dbh, dby)
        dhnext = np.zeros_like(self.h)
        for t in reversed(range(self.seq_length)):
            dy = np.copy(self.y_history[t])
            dy[targets[t]] -= 1
            dWhy += np.dot(dy, self.h_history[t].T)
            dby += dby
            dh = np.dot(self.Why.T, dy) + dhnext
            dhrec = (1 - self.h_history[t] * self.h_history[t]) * dh
            dbh += dhrec
            dWxh += np.dot(dhrec, xs[t].T)
            dWhh += np.dot(dhrec, self.h_history[t - 1].T)
            dhnext = np.dot(self.Whh.T, dhrec)
        # for dparam in dparams:
        #     np.clip(dparam, -5, 5, out=dparam)
        return dparams

    def update_rprop(self, dparams, learning_rate):
        for param, dparam, dparam_prev, step_multiplier in zip(self.params, dparams, self.dparams_prev, self.step_multiplers):
            step_multiplier[np.sign(dparam) == np.sign(dparam_prev)] *= 1.2
            step_multiplier[np.sign(dparam) != np.sign(dparam_prev)] /= 2
            np.clip(step_multiplier, 0.01, 100, out=step_multiplier)
            np.copyto(dparam_prev, dparam)
            param -= dparam * step_multiplier * learning_rate


    def update(self, dparams, learning_rate):
        dWxh, dWhh, dWhy, dbh, dby = dparams
        self.Wxh -= dWxh * learning_rate
        self.Whh -= dWhh * learning_rate
        self.Why -= dWhy * learning_rate
        self.bh -= dbh * learning_rate
        self.by -= dby * learning_rate

    def train(self, xs, targets, iters, learning_rate, print_every = 1000):
            for i in range(iters):
                self.reset_history()
                preds = self.predict(xs)
                loss = rnn.loss(targets)
                if i % print_every == 0:
                    print('Predictions:', preds, 'vs targets:', targets)
                    print('Loss:', loss)
                dparams = rnn.backward(xs, targets)
                # self.update(dparams, learning_rate)
                self.update_rprop(dparams, learning_rate)

Let's try to learn the constant 0 function:

In [21]:
in_dim = 3
out_dim = 3
hidden_dim = 5
length = 10

def constant_dataset(c, length, in_out_dim):
    xs = np.random.uniform(0, 1, (length, in_out_dim, 1))
    targets = [c for _ in range(length)]
    return xs, targets

x_train, y_train = constant_dataset(0, length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 12.719989806603753
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.00015412750610943875
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 7.608251765159286e-05
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 5.0482052248195986e-05
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 3.7763630125127875e-05
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 3.0160158780055797e-05
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 2.510325223044181e-05
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 2.1497370724184777e-05
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 

Let's now try to learn the constant function 1:

In [22]:
x_train, y_train = constant_dataset(1, length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 12.50621225053801
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.00016356388592413178
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 8.020153064404667e-05
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 5.302897453094489e-05
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 3.957688686738098e-05
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 3.1554191855474104e-05
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 2.622818477104122e-05
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 2.2435940832623486e-05
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 

And the constant 2 function:

In [10]:
x_train, y_train = constant_dataset(2, length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 17.731359741756503
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.00016618437374716263
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 8.123644114930177e-05
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 5.3633031930150865e-05
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 3.998986657021812e-05
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 3.186194572657391e-05
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 2.6470325318382488e-05
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 2.2633714859698673e-05
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2

Now let's try a 0-1 sequence:

In [None]:
def periodic_dataset(seq, length, in_out_dim):
    xs = np.ones((length, in_out_dim, 1))
    targets = [seq[i % len(seq)] for i in range(length)]
    return xs, targets


In [42]:
in_dim = 3
out_dim = 3
hidden_dim = 10
length = 10

x_train, y_train = periodic_dataset([0, 1], length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 100_000, 0.0001)

Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 12.454533717449436
Predictions: [2, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 99.56032250973857
Predictions: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 477.3165250172769
Predictions: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 6.1931247769867594
Predictions: [0, 0, 1, 0, 1, 0, 1, 0, 1, 0] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 1682.780124493399
Predictions: [0, 0, 0, 1, 0, 0, 1, 0, 0, 1] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 358.07500502781863
Predictions: [0, 0, 1, 0, 0, 1, 0, 0, 1, 0] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 497.90830973469747
Predictions: [0, 0, 1, 1, 0, 1, 1, 0, 1, 1] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 877.4770602321423
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
Loss: 290.60788874396

Let's try a more complex sequence:

In [43]:
in_dim = 3
out_dim = 3
hidden_dim = 50
length = 10

x_train, y_train = periodic_dataset([0, 1, 2], length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 100000, 0.00001)

Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: 24.464625740047
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: 10.464110165957123
Predictions: [1, 1, 1, 2, 2, 2, 0, 0, 0, 0] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: 17.970561922238243
Predictions: [0, 1, 2, 1, 2, 1, 2, 1, 2, 1] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: 154.43149558500104
Predictions: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: 747.8514621502936
Predictions: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: 2884.8959551159032
  return sum(-np.log(self.y_history[t][targets[t], 0]) for t in range(self.seq_length))
Predictions: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: inf
Predictions: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] vs targets: [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
Loss: inf
Predictions: [0, 1, 0, 0, 1, 0, 1, 1, 0, 1] vs tar

KeyboardInterrupt: 

Let's try repeating the input:

In [20]:
def id_dataset(length, in_out_dim):
    xs = np.random.uniform(0, 2, (length, in_out_dim, 1))
    targets = [np.round(xs) for i in range(length)]
    return xs, target


In [None]:
in_dim = 3
out_dim = 3
hidden_dim = 10
length = 10

x_train, y_train = periodic_dataset([1, 0], length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 100000, 0.001)