In [42]:
from keras.utils.data_utils import get_file
import numpy as np
from theano import shared
import theano.tensor as T
import math
import theano
from itertools import chain
from collections import OrderedDict

In [3]:
# Fetch the Nietzsche corpus; get_file hands back a local path we can open.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Read through a context manager so the file handle is closed right after the
# read instead of lingering for the lifetime of the kernel.
with open(path) as f:
    text = f.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
# Distinct characters that occur in the corpus, in sorted order.
chars = sorted(set(text))
# One extra slot is counted for the "\0" entry that gets inserted into
# `chars` afterwards.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 85


In [5]:
# Reserve index 0 for the "\0" character. vocab_size already counts this extra
# entry, so only insert while `chars` is still one short of it; that keeps the
# cell safe to re-run without stacking up duplicate "\0" entries.
if len(chars) < vocab_size:
    chars.insert(0, "\0")
# Lookup tables in both directions: character -> index and index -> character.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

## Theano RNN

In [23]:
# Width of the recurrent hidden state.
n_hidden = 256
# Inputs and outputs are both sized to the vocabulary.
n_input = n_output = vocab_size

In [20]:
def init_wgts(rows, cols):
    """Create a random (rows, cols) float32 weight matrix as a Theano shared variable.

    Entries are drawn from a zero-mean normal distribution with standard
    deviation sqrt(2 / rows). Note that this fan-in based scaling is what is
    usually called He initialization rather than Xavier/Glorot.

    Args:
        rows: number of rows (the fan-in of the layer); must be positive.
        cols: number of columns.

    Returns:
        The value returned by `shared` for the freshly sampled float32 array.

    Raises:
        ValueError: if `rows` is not positive (the scale would be undefined).
    """
    if rows <= 0:
        raise ValueError("rows must be positive, got %r" % (rows,))
    # 2.0 forces true division, so the scale cannot silently truncate to 0
    # under integer-division semantics.
    scale = math.sqrt(2.0 / rows)
    # shared() hands the array over to Theano, which manages its storage
    # (e.g. keeping it on the GPU when one is in use).
    return shared(np.random.normal(scale=scale, size=(rows, cols)).astype(np.float32))
def init_bias(rows):
    """Return a Theano shared variable holding `rows` float32 zeros (bias start value)."""
    zeros = np.zeros(rows, dtype=np.float32)
    return shared(zeros)
def wgts_and_bias(n_in, n_out):
    """Build the parameters of one layer mapping n_in units to n_out units.

    Returns a (weights, bias) tuple: weights come from init_wgts(n_in, n_out)
    and the bias from init_bias(n_out).
    """
    weights = init_wgts(n_in, n_out)
    bias = init_bias(n_out)
    return weights, bias
def id_and_bias(n):
    """Return (weights, bias) for an n-to-n layer whose weights start as the identity.

    The weights are an n x n float32 identity matrix wrapped in `shared`;
    the bias comes from init_bias(n).
    """
    identity = np.eye(n, dtype=np.float32)
    return shared(identity), init_bias(n)

In [21]:
## Symbolic placeholders that get fed to the compiled Theano function
# input and target output, both declared as matrices
t_inp, t_outp = T.matrix('inp'), T.matrix('outp')
# hidden state the recurrence starts from
t_h0 = T.vector('h0')
# learning rate, passed in as a scalar argument
lr = T.scalar('lr')

# argument order of the compiled function
all_args = [t_h0, t_inp, t_outp, lr]

In [31]:
# Each of the three names below holds a (weights, bias) pair.
# hidden-to-hidden: identity weights with a zero bias
W_h = id_and_bias(n_hidden)
# input-to-hidden
W_x = wgts_and_bias(n_input, n_hidden)
# hidden-to-output
W_y = wgts_and_bias(n_hidden, n_output)
# Flattened to [W_h, b_h, W_x, b_x, W_y, b_y] -- the same order in which
# step() takes its parameter arguments.
w_all = list(chain(W_h, W_x, W_y))

In [32]:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    """Advance the RNN by one step.

    Args:
        x: input for the current step.
        h: hidden state carried over from the previous step.
        W_h, b_h: hidden-to-hidden weights and bias.
        W_x, b_x: input-to-hidden weights and bias.
        W_y, b_y: hidden-to-output weights and bias.

    Returns:
        (new hidden state, output), where the output is the softmax over the
        output units flattened with T.flatten(..., 1).
    """
    # Pre-activation: input contribution plus recurrent contribution.
    pre_activation = T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h
    h_next = T.nnet.relu(pre_activation)
    # Softmax turns the output scores into a distribution over the classes.
    probs = T.nnet.softmax(T.dot(h_next, W_y) + b_y)
    return h_next, T.flatten(probs, 1)

In [37]:
## Apply step() repeatedly with theano.scan:
##   sequences     = what scan iterates over (here t_inp)
##   outputs_info  = initial state for each output of step(): the hidden state
##                   starts from t_h0; y gets None because it is not fed back
##   non_sequences = extra arguments passed to every step() call (the weights)
(v_h, v_y), _ = theano.scan(
    fn=step,
    sequences=t_inp,
    outputs_info=[t_h0, None],
    non_sequences=w_all,
)

In [40]:
# Cross-entropy between the predictions (v_y) and the actual targets
# (t_outp), summed into a single scalar error.
step_losses = T.nnet.categorical_crossentropy(v_y, t_outp)
error = step_losses.sum()
# Gradient of that error with respect to every entry of w_all.
g_all = T.grad(cost=error, wrt=w_all)

In [43]:
def upd_dict(wgts, grads, lr):
    """Build the gradient-descent update mapping for a set of parameters.

    Args:
        wgts: iterable of parameters; each one becomes a key of the result.
        grads: iterable of gradients, paired positionally with `wgts`.
        lr: learning rate that scales each gradient.

    Returns:
        OrderedDict mapping each w to ``w - g * lr``, in the order of `wgts`.
    """
    # Feed (key, value) pairs straight into OrderedDict. Going through an
    # intermediate plain dict would throw away the ordering on Python versions
    # where dict is unordered, which defeats the point of an OrderedDict.
    return OrderedDict((w, w - g * lr) for w, g in zip(wgts, grads))

## Update rule for every parameter in w_all: w <- w - lr * gradient
upd = upd_dict(wgts=w_all, grads=g_all, lr=lr)

In [None]:
## Compile everything into one callable: it takes the arguments listed in
## all_args, computes the error, and carries the updates in `upd` along with
## every call.
fn = theano.function(
    inputs=all_args,
    outputs=error,
    updates=upd,
    allow_input_downcast=True,
)