Inspiration: https://towardsdatascience.com/recurrent-neural-networks-rnns-3f06d7653a85

# Imports and globals

In [137]:
import numpy as np

# Basic functions

In [138]:
def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def sigmoid_derivative(sigmoid_output):
    return sigmoid_output * (1 - sigmoid_output)

def softmax_naive(x):
    return np.exp(x) / np.sum(np.exp(x))

def softmax_stable(x):
    p = np.exp(x - np.max(x))
    return p / np.sum(p)

softmax = softmax_stable

softmax(np.array([range(3)]))

array([[0.09003057, 0.24472847, 0.66524096]])

In [330]:
class RNN:
    def __init__(self, input_dim:int, hidden_dim:int, output_dim:int):
        # x, h, and y are all column vectors
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.Wxh = np.random.uniform(0, 1, (hidden_dim, input_dim))
        self.Whh = np.random.uniform(0, 1, (hidden_dim, hidden_dim))
        self.Why = np.random.uniform(0, 1, (output_dim, hidden_dim))
        self.bh = np.random.uniform(0, 1, self.h_shape)
        self.by = np.random.uniform(0, 1, self.y_shape)
        self.reset_history()

    @property
    def x_shape(self): return (self.input_dim, 1)
    
    @property
    def h_shape(self): return (self.hidden_dim, 1)

    @property
    def y_shape(self): return (self.output_dim, 1)

    def reset_history(self):
        h = np.zeros(self.h_shape)
        self.h_history = [h]
        self.y_history = []

    @property
    def seq_length(self):
        return len(self.y_history)

    @property
    def h(self):
        return self.h_history[-1]

    @h.setter
    def h(self, value):
        assert value.shape == self.h_shape, (value.shape, self.h_shape)
        self.h_history.append(value)

    @property
    def y(self):
        return self.y_history[-1]

    @y.setter
    def y(self, value):
        assert value.shape == (self.output_dim, 1), (value.shape, self.y_shape)
        self.y_history.append(value)

    def forward_one_step(self, x):
        assert x.shape == self.x_shape, (x.shape, self.x_shape)
        z = np.dot(self.Wxh, x) + np.dot(self.Whh, self.h) + self.bh
        self.h = np.tanh(z)
        o = np.dot(self.Why, self.h) + self.by
        self.y = softmax(o)
        return self.y

    def forward(self, xs):
        return [self.forward_one_step(x) for x in xs]

    def predict_one_step(self, x):
        return np.argmax(self.forward_one_step(x))

    def predict(self, xs):
        return [self.predict_one_step(x) for x in xs]

    def loss(self, targets):
        return sum(-np.log(self.y_history[t][targets[t], 0]) for t in range(self.seq_length))

    def backward(self, xs, targets):
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)
        dparams = (dWxh, dWhh, dWhy, dbh, dby)
        dhnext = np.zeros_like(self.h)
        for t in reversed(range(self.seq_length)):
            dy = np.copy(self.y_history[t])
            dy[targets[t]] -= 1
            dWhy += np.dot(dy, self.h_history[t].T)
            dby += dby
            dh = np.dot(self.Why.T, dy) + dhnext
            dhrec = (1 - self.h_history[t] * self.h_history[t]) * dh
            dbh += dhrec
            dWxh += np.dot(dhrec, xs[t].T)
            dWhh += np.dot(dhrec, self.h_history[t - 1].T)
            dhnext = np.dot(self.Whh.T, dhrec)
        for dparam in dparams:
            np.clip(dparam, -5, 5, out=dparam)
        return dparams

    def update(self, dparams, learning_rate):
        dWxh, dWhh, dWhy, dbh, dby = dparams
        self.Wxh -= dWxh * learning_rate
        self.Whh -= dWhh * learning_rate
        self.Why -= dWhy * learning_rate
        self.bh -= dbh * learning_rate
        self.by -= dby * learning_rate

    def train(self, xs, targets, iters, learning_rate, print_every = 1000):
            for i in range(iters):
                self.reset_history()
                preds = self.predict(xs)
                loss = rnn.loss(targets)
                if i % print_every == 0:
                    print('Predictions:', preds, 'vs targets:', targets)
                    print('Loss:', loss)
                dparams = rnn.backward(xs, targets)
                self.update(dparams, learning_rate)

Let's try to learn the constant 0 function:

In [336]:
in_dim = 3
out_dim = 3
hidden_dim = 5
length = 10

def constant_dataset(c, length, in_out_dim):
    xs = np.random.uniform(0, 1, (length, in_out_dim, 1))
    targets = [c for _ in range(length)]
    return xs, targets

x_train, y_train = constant_dataset(0, length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 8.24779750773102
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.016003682858074433
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.007953920998736316
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.0052842645401016694
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.003953525732722958
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.0031568647670950495
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.002626694151580089
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Loss: 0.0022485536290676967
Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Lo

Let's now try to learn the constant function 1:

In [337]:
x_train, y_train = constant_dataset(1, length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 9.192803410068464
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.016014932724079348
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.007952570889906118
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.005281430417648357
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.003950582686721133
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.003154086307012429
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.0026241300319009607
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Loss: 0.0022461973557770954
Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Lo

And the constant 2 function:

In [338]:
x_train, y_train = constant_dataset(2, length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 16.252170119444525
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.01743541728278837
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.008559397861840328
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.005650382980376492
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.004209869439123079
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.0033513671005677213
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.002781967741385888
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Loss: 0.0023769326596804635
Predictions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] vs targets: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Lo

Now let's try a 0-1 sequence:

In [341]:
def periodic_dataset(seq, length, in_out_dim):
    xs = np.random.uniform(0, 1, (length, in_out_dim, 1))
    targets = [seq[i % len(seq)] for i in range(length)]
    return xs, targets


In [351]:
in_dim = 3
out_dim = 3
hidden_dim = 10
length = 10

x_train, y_train = periodic_dataset([1, 0], length, in_dim)
rnn = RNN(in_dim, hidden_dim, out_dim)
rnn.train(x_train, y_train, 10000, 0.01)

Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: 10.392026718231847
Predictions: [0, 2, 1, 1, 0, 0, 2, 2, 1, 0] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: 23.71322321868329
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: 600.6784858483779
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: 1464.7921827473867
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: 2329.2495269284645
  return sum(-np.log(self.y_history[t][targets[t], 0]) for t in range(self.seq_length))
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: inf
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: inf
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
Loss: inf
Predictions: [0, 1, 0, 1, 0, 1, 0, 0, 0, 1] vs targets: [1, 0, 

Let's try repeating the input:

In [None]:
def id_dataset(length, in_out_dim):
    xs = np.random.uniform(0, 2, (length, in_out_dim, 1))
    targets = [np.round(xs.) for i in range(length)]
    return xs, targets
