In [1]:
from theano.sandbox import cuda
#cuda.use('gpu1')
cuda.use('gpu0')

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 950 (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5110)
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29



In [2]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [3]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))

corpus length: 600901


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 86


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [5]:
chars.insert(0, "\0")

In [6]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [7]:
chars[-6:] 

['\x86', '\xa4', '\xa6', '\xa9', '\xab', '\xc3']

Map from chars to indices and back again

In [8]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

*idx* will be the data we use from now own - it simply converts all the characters to their index (based on the mapping above)

In [9]:
idx = [char_indices[c] for c in text] # char -> index

In [10]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [11]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## 3 char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

In [12]:
cs=3
c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)]

Our inputs

In [13]:
x1 = np.stack(c1_dat[:-2]) # np.array(c1_dat[:-2]) also works.
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

Our output

In [14]:
y = np.stack(c4_dat[:-2])

In [15]:
idx[:13] # x1[0],x2[0],x3[0],y[0],
               #                                  x1[1],x2[1],x3[1],y[1],
               #                                                                   x1[2],x2[2],x3[2],y[2], 
               #                                                                                                    x1[3],x2[3],x3[3],y[3]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40]

The first 4 inputs and outputs

In [16]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [17]:
y[:4]

array([30, 29,  1, 40])

In [18]:
x1.shape, y.shape

((200297,), (200297,))

The number of latent factors to create (i.e. the size of the embedding matrix)

In [19]:
n_fac = 42

Create inputs and embedding outputs for each of our 3 character inputs

In [20]:
# input_length: 
#    Length of input sequences, when it is constant. This argument is required if you are going to connect
#    Flatten then Dense layers upstream (without it, the shape of the dense outputs cannot be computed).

def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp) # one character input at one time
    return inp, Flatten()(emb)

In [21]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)

### Create and train model

Pick a size for our hidden state

In [22]:
n_hidden = 256

This is the 'green arrow' from our diagram - the layer operation from input to hidden.

In [23]:
dense_in = Dense(n_hidden, activation='relu')

Our first hidden activation is simply this function applied to the result of the embedding of the first character.

In [24]:
c1_hidden = dense_in(c1) # c1: Embedding()

This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.

In [25]:
dense_hidden = Dense(n_hidden, activation='tanh')

Our second and third hidden activations sum up the previous hidden state (after applying dense_hidden) to the new input state.

In [26]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden) 
c2_hidden = merge([c2_dense, hidden_2]) # sum(c2_dense, dense_hidden(c1_hidden))

In [27]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3]) # sum(c3_dense, dense_hidden(c2_hidden))

<img src="http://wiki.fast.ai/images/8/85/Lect_6_nn_color_coded2.png" width="320">

This is the 'blue arrow' from our diagram - the layer operation from hidden to output.

In [28]:
dense_out = Dense(vocab_size, activation='softmax')

The third hidden state is the input to our output layer.

In [29]:
c4_out = dense_out(c3_hidden)

In [30]:
model = Model([c1_in, c2_in, c3_in], c4_out) # 3 consecutive　characters inputs, 1 character output

In [31]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam()) # Use sparse_categorical_crossentropy because y is array of integer

In [32]:
#model.optimizer.lr=0.000001 

# http://forums.fast.ai/t/lesson-6-discussion/245/26
model.optimizer.lr = 0.001 # Adam's default learning rate will be fine.

In [35]:
# [x1, x2, x3]: [(200297,), (200297,), (200297,)]
# y: (200297,)

model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f3a43699050>

In [36]:
model.optimizer.lr=0.01

In [37]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f3a453a2d50>

In [38]:
#model.optimizer.lr.set_value(0.000001) # This causes AttributeError: 'float' object has no attribute 'set_value' !
# http://forums.fast.ai/t/python-and-keras-questions-and-tips/224/18
model.optimizer.lr = 0.000001

In [39]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f3a4538d410>

In [40]:
#model.optimizer.lr.set_value(0.01)　# AttributeError: 'float' object has no attribute 'set_value'
model.optimizer.lr = 0.01

In [41]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f3a44ef2ed0>

### Test model

In [42]:
# Usage of np.newaxis
#    >>> np.array(1)
#    array(1)
#   >>> np.array(1)[np.newaxis]
#   array([1])
#    >>> np.array([1])[:, np.newaxis]
#    array([[1]])
#    >>> np.array([1])[np.newaxis:,]
#    array([1])

def get_next(inp):
    idxs = [char_indices[c] for c in inp] # text -> array of int
    arrs = [np.array(i)[np.newaxis] for i in idxs] # list of (1,)  ex. [array([69]), array([61]), array([62])]
    p = model.predict(arrs)  # (1, 86) where 86 is vocab_size
    i = np.argmax(p)
    return chars[i]

In [43]:
get_next('phi')

'l'

In [44]:
get_next(' th')

'e'

In [45]:
get_next(' an')

'd'

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [46]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to out model.

In [47]:
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
            for n in range(cs)]

# c_in_dat[0] idx=0, 8, 16, ...
# c_in_dat[1] idx=1, 9, 17, ...
# c_in_dat[2] idx=2, 10, 18, ...
# c_in_dat[3] idx=3, 11, 19, ...
# c_in_dat[4] idx=4, 12, 20, ...
# c_in_dat[5] idx=5, 13, 21, ...
# c_in_dat[6] idx=6, 14, 22, ...
# c_in_dat[7] idx=7, 15, 23, ...

In [48]:
print(len(c_in_dat))
print(len(c_in_dat[0]))

8
75112


Then create a list of the next character in each of these series. This will be the labels for our model.

In [49]:
c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs, cs)] # idx= 8, 16, 24, ...

In [50]:
xs = [np.stack(c[:-2]) for c in c_in_dat] # list of numpy.ndarray

In [51]:
len(xs), xs[0].shape

(8, (75110,))

In [52]:
y = np.stack(c_out_dat[:-2]) # why -2 ??

So each column below is one series of 8 characters from the text.

In [53]:
[xs[n][:cs] for n in range(cs)]

[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

...and this is the next character after each sequence.

In [54]:
y[:cs] # this is one character left shift of array([40,  1, 33,  2, 72, 67, 73,  2])

array([ 1, 33,  2, 72, 67, 73,  2, 68])

In [55]:
n_fac = 42

### Create and train model

In [56]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)

In [57]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)] # 8 embedding inputs

In [58]:
n_hidden = 256

In [59]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')

The first character of each sequence goes through dense_in(), to create our first hidden activations.

In [60]:
hidden = dense_in(c_ins[0][1]) # c_ins[0] is a tuple (inp, Flatten()(emb))

Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.

In [61]:
for i in range(1,cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden])

Putting the final hidden state through dense_out() gives us our output.

In [62]:
c_out = dense_out(hidden)

<img src="http://wiki.fast.ai/images/1/12/Lect_6_nn_color_coded.png" width=320 />

So now we can create our model.

In [73]:
model = Model([c[0] for c in c_ins], c_out) # c[0] is Input Tensor (c is a tuple)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam()) # use sparse_categorical_crossentropy since y is sparse label

In [74]:
# xs: [(755110,), (755110,), ... (755110,)]  8 inputs
# y: (755110, )

model.fit(xs, y, batch_size=64, nb_epoch=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f3a34434b10>

### Test model

In [75]:
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp] # [(1,), (1,), ... (1,)] 8 characters
    p = model.predict(idxs) # (1, 86)
    return chars[np.argmax(p)]

In [76]:
get_next('for thos')

'e'

In [77]:
get_next('part of ')

't'

In [78]:
get_next('queens a')

'n'

## Our first RNN with keras!

In [79]:
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)

This is nearly exactly equivalent to the RNN we built ourselves in the previous section.

In [90]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs), # 8 characters input at one time
        SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
        Dense(vocab_size, activation='softmax')
    ])

In [91]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 8, 42)         3612        embedding_input_2[0][0]          
____________________________________________________________________________________________________
simplernn_2 (SimpleRNN)          (None, 256)           76544       embedding_5[0][0]                
____________________________________________________________________________________________________
dense_8 (Dense)                  (None, 86)            22102       simplernn_2[0][0]                
Total params: 102,258
Trainable params: 102,258
Non-trainable params: 0
____________________________________________________________________________________________________


In [92]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [93]:
# np.concatenate
#    np.concatenate(xs,axis=0): (600880, 1)  increase row, column fixed
#    np.concatenate(xs,axis=1): (75110, 8)  increase column, row fixed

# np.concatenate(xs, axis=1): (75110, 8)  n_sample, input characters
# y: (75110, )

model.fit(np.concatenate(xs,axis=1), y, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3a2b007f10>

In [94]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = np.array(idxs)[np.newaxis,:] # (1, 8)
    p = model.predict(arrs)[0] # model.predict() returns (1, 86)
    return chars[np.argmax(p)]

In [95]:
get_next_keras('this is ')

'a'

In [96]:
get_next_keras('part of ')

't'

In [97]:
get_next_keras('queens a')

'n'

## Returning sequences

### Create inputs

To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)

Here, c_out_dat is identical to c_in_dat, but moved across 1 character.

In [98]:
#c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
#            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)]
            for n in range(cs)]

In [99]:
ys = [np.stack(c[:-2]) for c in c_out_dat]

Reading down each column shows one set of inputs and outputs.

In [100]:
[xs[n][:cs] for n in range(cs)]

[array([[40],
        [ 1],
        [33],
        [ 2],
        [72],
        [67],
        [73],
        [ 2]]), array([[42],
        [ 1],
        [38],
        [44],
        [ 2],
        [ 9],
        [61],
        [73]]), array([[29],
        [43],
        [31],
        [71],
        [54],
        [ 9],
        [58],
        [61]]), array([[30],
        [45],
        [ 2],
        [74],
        [ 2],
        [76],
        [67],
        [58]]), array([[25],
        [40],
        [73],
        [73],
        [76],
        [61],
        [24],
        [71]]), array([[27],
        [40],
        [61],
        [61],
        [68],
        [54],
        [ 2],
        [58]]), array([[29],
        [39],
        [54],
        [ 2],
        [66],
        [73],
        [33],
        [ 2]]), array([[ 1],
        [43],
        [73],
        [62],
        [54],
        [ 2],
        [72],
        [67]])]

In [101]:
[ys[n][:cs] for n in range(cs)]

[array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67]),
 array([ 1, 33,  2, 72, 67, 73,  2, 68])]

### Create and train model

In [102]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')

We're going to pass a vector of all zeros as our starting point - here's our input layers for that:

In [103]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [104]:
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    # every layer now has an output
    outs.append(dense_out(hidden))

<img src="http://wiki.fast.ai/images/6/69/Lect_6_rnn_diagram.png" width="320" />

In [105]:
# Notice in our new graph that our input is now a vector of zeros. This is because we'd like to move
# our first character into the recurrent process, but we still need some vector to initialize 
# the first hidden layer and so we just use a vector of zeros.

model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [106]:
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape # 42 is n_fac

(75110, 42)

In [110]:
# [zeros]+xs:  [(75110, 42)] + [(75110, 1), (75110, 1), ... , (75110, 1)]
# ys: [(75110,), (75110,) ... (75110,)]  8 output sequences

model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f3a26223850>

### Test model

In [111]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs] # [(1,), (1,), ... (1,)]  8 characters
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs) # [(1, 86), (1, 86), .... (1, 86)]  8 characters
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [112]:
get_nexts(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 't', ' ', 's', 'n', ' ']

In [113]:
get_nexts(' part of')

[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']


['t', 'o', 's', 't', ' ', 'o', 'f', ' ']

### Sequence model with keras

In [128]:
n_hidden, n_fac, cs, vocab_size

(256, 42, 8, 86)

To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer.

In [129]:
# https://www.quora.com/What-is-time-distributed-dense-layer-in-Keras
# https://datascience.stackexchange.com/questions/10836/the-difference-between-dense-and-timedistributeddense-of-keras

model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')), #  apply fully connected dense on each time step and get output separately by timesteps
    ])

In [130]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 8, 42)         3612        embedding_input_3[0][0]          
____________________________________________________________________________________________________
simplernn_3 (SimpleRNN)          (None, 8, 256)        76544       embedding_6[0][0]                
____________________________________________________________________________________________________
timedistributed_1 (TimeDistribut (None, 8, 86)         22102       simplernn_3[0][0]                
Total params: 102,258
Trainable params: 102,258
Non-trainable params: 0
____________________________________________________________________________________________________


In [131]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [132]:
xs[0].shape

(75110, 1)

In [133]:
ys[0].shape

(75110, 1)

In [134]:
x_rnn=np.stack(np.squeeze(xs), axis=1) # xs = [(75110, 1), (75110, 1), ...]
y_rnn=np.atleast_3d(np.stack(ys, axis=1))

In [135]:
x_rnn.shape, y_rnn.shape

((75110, 8), (75110, 8, 1))

In [136]:
y_rnn[0]

array([[42],
       [29],
       [30],
       [25],
       [27],
       [29],
       [ 1],
       [ 1]])

In [137]:
# x_rnn: (75110, 8)
# y_rnn: (75110, 8, 1) each output sequence has a label of (1,)

model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f3a1e7fb390>

In [138]:
def get_nexts_keras(inp):
    idxs = [char_indices[c] for c in inp] # len=8, ex. [2, 73, 61, 62, 72, 2, 62, 72]
    arr = np.array(idxs)[np.newaxis,:] # (1, 8)
    p = model.predict(arr)[0] # model.predict(arr) returns (1, 8, 86)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [139]:
get_nexts_keras(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 'n', ' ', 'p', 'n', ' ']

### One-hot sequence model with keras

This is the keras version of the theano model that we're about to create.

In [142]:
# We don't use Embedding !
model=Sequential([
        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size), 
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam()) # NOT sparse_categorical_crossentropy

#  cf. Embedding version model
#model=Sequential([
#        Embedding(vocab_size, n_fac, input_length=cs),
#        SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
#        TimeDistributed(Dense(vocab_size, activation='softmax')),
#    ])

In [143]:
# oh stands for one hot
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)

oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

oh_x_rnn.shape, oh_y_rnn.shape 

# cf. embedding version returns ((75110, 8), (75110, 8, 1))

((75110, 8, 86), (75110, 8, 86))

In [144]:
oh_x_rnn[0][0] # one hot vector

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [145]:
# oh_x_rnn: (75110, 8, 86)
# oh_y_rnn: (75110, 8, 86)

model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f39babfabd0>

In [146]:
def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size) # (8, 86)
    p = model.predict(arr[np.newaxis,:])[0] # arr[np.newaxis,:]=(1, 8, 86), model.predict() returns (1, 8, 86)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [147]:
get_nexts_oh(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 's', ' ', 'i', 's', ' ']

#### Cheat sheet

| | First RNN model | Sequence model | One-hot sequence model |
|-----------|------------|------------|----------|
| Word representing  | Embeddings | Embeddings | One-hot encoding |
| input_length (Embeddings) | seq_len |  seq_len  | - |
| return_sequences (Recurrent layer)   | False  | True | True |
| TimeDistributed layer | ☓ | ◯ | ◯ |
| X       | (n_sample, seq_len) | (n_sample, seq_len)  | (n_sample, seq_len, vocab_size) |
| y       | (n_sample, )  | (n_sample, seq_len, 1)  | (n_sample, seq_len, vocab_size) |
| pred   | (1, vocab_size) | (1, seq_len, vocab_size)  | (1, seq_len, vocab_size) |


## Stateful model with keras

In [163]:
bs=64

A stateful model is easy to create (just add "stateful=True") but harder to train. We had to add batchnorm and use LSTM to get reasonable results.

When using stateful in keras, you have to also add 'batch_input_shape' to the first layer, and fix the batch size there.

In [164]:
# stateful: Boolean (default False). If True, the last state for each sample at index i in a batch will be used 
# as initial state for the sample of index i in the following batch.

# See links below for good explanation
#    http://forums.fast.ai/t/lesson-6-discussion/245/78
#    http://philipperemy.github.io/keras-stateful-lstm/ 


model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True), # LSTM may find dependencies between the sequences
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [165]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

Since we're using a fixed batch shape, we have to ensure our inputs and outputs are a even multiple of the batch size.

In [166]:
mx = len(x_rnn)//bs*bs # 75110 -> 75072

In [169]:
# x_rnn[:mx]:  (75072, 8)
# y_rnn[:mx]:  (75072, 8, 1)

model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False) # shuffle must be False because stateful is True

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe26e6fa310>

In [170]:
model.optimizer.lr=1e-4

In [171]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe26ced3850>

In [172]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe2d647fd50>

## Theano RNN

In [173]:
n_input = vocab_size
n_output = vocab_size

In [25]:
 ## You may have to run these lines when you resume from 'Theano RNN' ##
n_hidden = 256
cs=8
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)]
            for n in range(cs)]
xs = [np.stack(c[:-2]) for c in c_in_dat]
ys = [np.stack(c[:-2]) for c in c_out_dat]
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size)
    p = model.predict(arr[np.newaxis,:])[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

Using raw theano, we have to create our weight matrices and bias vectors ourselves - here are the functions we'll use to do so (using glorot initialization).

The return values are wrapped in `shared()`, which is how we tell theano that it can manage this data (copying it to and from the GPU as necessary).

In [174]:
def init_wgts(rows, cols): 
    scale = math.sqrt(2/rows)
    return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))
def init_bias(rows): 
    return shared(np.zeros(rows, dtype=np.float32))

We return the weights and biases together as a tuple. For the hidden weights, we'll use an identity initialization (as recommended by [Hinton](https://arxiv.org/abs/1504.00941).)

In [175]:
def wgts_and_bias(n_in, n_out): 
    return init_wgts(n_in, n_out), init_bias(n_out)
def id_and_bias(n): 
    return shared(np.eye(n, dtype=np.float32)), init_bias(n)

Theano doesn't actually do any computations until we explicitly compile and evaluate the function (at which point it'll be turned into CUDA code and sent off to the GPU). So our job is to describe the computations that we'll want theano to do - the first step is to tell theano what inputs we'll be providing to our computation:

In [176]:
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')

all_args = [t_h0, t_inp, t_outp, lr]

Now we're ready to create our intial weight matrices.

In [177]:
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))

Theano handles looping by using the [GPU scan](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html) operation. We have to tell theano what to do at each step through the scan - this is the function we'll use, which does a single forward pass for one character:

In [178]:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    # Calculate the hidden activations
    h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
    # Calculate the output activations
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    # Return both (the 'Flatten()' is to work around a theano bug)
    return h, T.flatten(y, 1)

Now we can provide everything necessary for the scan operation, so we can setup that up - we have to pass in the function to call at each step, the sequence to step through, the initial values of the outputs, and any other arguments to pass to the step function.

In [179]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                            outputs_info=[t_h0, None], non_sequences=w_all)

We can now calculate our loss function, and *all* of our gradients, with just a couple of lines of code!

In [180]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

We even have to show theano how to do SGD - so we set up this dictionary of updates to complete after every forward pass, which apply to standard SGD update rule to every weight.

In [181]:
def upd_dict(wgts, grads, lr): 
    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})

upd = upd_dict(w_all, g_all, lr)

We're finally ready to compile the function!

In [182]:
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [183]:
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape

((75110, 8, 86), (75110, 8, 86))

To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time.

In [184]:
err=0.0; l_rate=0.01
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        print ("Error:{:.3f}".format(err/1000))
        err=0.0

Error:25.182
Error:21.446
Error:20.864
Error:19.820
Error:18.726
Error:19.201
Error:19.072
Error:18.453
Error:17.937
Error:18.214
Error:17.495
Error:17.639
Error:18.485
Error:17.352
Error:16.821
Error:17.781
Error:17.408
Error:17.175
Error:16.863
Error:16.737
Error:16.616
Error:16.452
Error:16.699
Error:16.184
Error:16.828
Error:16.661
Error:16.075
Error:16.244
Error:16.268
Error:16.468
Error:16.809
Error:16.484
Error:16.768
Error:16.402
Error:16.046
Error:16.729
Error:16.031
Error:16.463
Error:16.117
Error:16.306
Error:15.388
Error:15.772
Error:15.841
Error:15.980
Error:16.042
Error:15.908
Error:15.687
Error:16.111
Error:16.027
Error:16.065
Error:15.267
Error:15.570
Error:15.011
Error:14.855
Error:15.673
Error:15.477
Error:14.792
Error:15.532
Error:15.133
Error:15.037
Error:14.979
Error:15.485
Error:15.383
Error:15.102
Error:14.765
Error:14.888
Error:14.323
Error:14.708
Error:15.256
Error:14.764
Error:15.222
Error:14.774
Error:14.482
Error:14.562
Error:14.502


In [185]:
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True) # prediction

In [186]:
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)

In [187]:
act = np.argmax(X[6], axis=1)

In [188]:
[indices_char[o] for o in act]

['t', 'h', 'e', 'n', '?', ' ', 'I', 's']

In [189]:
[indices_char[o] for o in pred]

['h', 'e', ' ', ' ', ' ', 't', 't', ' ']

## Pure python RNN!

### Set up basic functions

Now we're going to try to repeat the above theano RNN, using just pure python (and numpy). Which means, we have to do everything ourselves, including defining the basic functions of a neural net! Below are all of the definitions, along with tests to check that they give the same answers as theano. The functions ending in `_d` are the derivatives of each function.

In [190]:
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_d(x): 
    output = sigmoid(x)
    return output*(1-output)

In [191]:
def relu(x): return np.maximum(0., x)
def relu_d(x): return (x > 0.)*1. # 0.0 or 1.0

In [192]:
relu(np.array([3.,-3.])), relu_d(np.array([3.,-3.]))

(array([ 3.,  0.]), array([ 1.,  0.]))

In [193]:
def dist(a,b): return pow(a-b,2)
def dist_d(a,b): return 2*(a-b)

In [194]:
import pdb

In [195]:
eps = 1e-7
def x_entropy(pred, actual):  # x_entropy means 'cross entropy'
    return -np.sum(actual * np.log(np.clip(pred, eps, 1-eps)))
def x_entropy_d(pred, actual): return -actual/pred

In [196]:
def softmax(x): return np.exp(x)/np.exp(x).sum()

In [198]:
def softmax_d(x):
    sm = softmax(x)
    res = np.expand_dims(-sm,-1)*sm
    res[np.diag_indices_from(res)] = sm*(1-sm)
    return res

In [199]:
test_preds = np.array([0.2,0.7,0.1])
test_actuals = np.array([0.,1.,0.])
nnet.categorical_crossentropy(test_preds, test_actuals).eval() # nnet is Theano's library

array(0.35667494393873245)

In [200]:
x_entropy(test_preds, test_actuals) # should returns same value of test_grad() 

0.35667494393873245

In [201]:
test_inp = T.dvector()
test_out = nnet.categorical_crossentropy(test_inp, test_actuals)
test_grad = theano.function([test_inp], T.grad(test_out, test_inp))

In [202]:
test_grad(test_preds)

array([-0.    , -1.4286, -0.    ])

In [203]:
x_entropy_d(test_preds, test_actuals) # should returns same value of test_grad() 

array([-0.    , -1.4286, -0.    ])

In [205]:
pre_pred = random(oh_x_rnn[0][0].shape)
preds = softmax(pre_pred)
actual = oh_x_rnn[0][0]

In [206]:
np.allclose(softmax_d(pre_pred).dot(x_entropy_d(preds,actual)), preds-actual)

True

In [208]:
softmax(test_preds) # should return same value below

array([ 0.2814,  0.464 ,  0.2546])

In [209]:
nnet.softmax(test_preds).eval()

array([[ 0.2814,  0.464 ,  0.2546]])

In [210]:
test_out = T.flatten(nnet.softmax(test_inp))

In [211]:
test_grad = theano.function([test_inp], theano.gradient.jacobian(test_out, test_inp))

In [212]:
test_grad(test_preds)

array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [213]:
softmax_d(test_preds) # should return value of test_grad()

array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [214]:
act=relu
act_d = relu_d

In [215]:
loss=x_entropy
loss_d=x_entropy_d

We also have to define our own scan function. Since we're not worrying about running things in parallel, it's very simple to implement:

In [216]:
def scan(fn, start, seq):
    res = []
    prev = start
    for s in seq:
        app = fn(prev, s)
        res.append(app)
        prev = app
    return res

...for instance, `scan` on `+` is the cumulative sum.

In [217]:
scan(lambda prev,curr: prev+curr, 0, range(5))

[0, 1, 3, 6, 10]

### Set up training

Let's now build the functions to do the forward and backward passes of our RNN. First, define our data and shape.

In [218]:
inp = oh_x_rnn
outp = oh_y_rnn
n_input = vocab_size
n_output = vocab_size

In [219]:
inp.shape, outp.shape

((75110, 8, 86), (75110, 8, 86))

Here's the function to do a single forward pass of an RNN, for a single character.

In [220]:
def one_char(prev, item):
    # Previous state
    tot_loss, pre_hidden, pre_pred, hidden, ypred = prev
    # Current inputs and output
    x, y = item
    pre_hidden = np.dot(x,w_x) + np.dot(hidden,w_h)
    hidden = act(pre_hidden)
    pre_pred = np.dot(hidden,w_y)
    ypred = softmax(pre_pred)
    return (
        # Keep track of loss so we can report it
        tot_loss+loss(ypred, y),
        # Used in backprop
        pre_hidden, pre_pred, 
        # Used in next iteration
        hidden, 
        # To provide predictions
        ypred)

We use `scan` to apply the above to a whole sequence of characters.

In [221]:
def get_chars(n): return zip(inp[n], outp[n])
def one_fwd(n): return scan(one_char, (0,0,0,np.zeros(n_hidden),0), get_chars(n))

Now we can define the backward step. We use a loop to go through every element of the sequence. The derivatives are applying the chain rule to each step, and accumulating the gradients across the sequence.

In [222]:
# "Columnify" a vector
def col(x): return x[:,newaxis]

def one_bkwd(args, n):
    global w_x,w_y,w_h

    i=inp[n]  # 8x86
    o=outp[n] # 8x86
    d_pre_hidden = np.zeros(n_hidden) # 256
    for p in reversed(range(len(i))):
        totloss, pre_hidden, pre_pred, hidden, ypred = args[p]
        x=i[p] # 86
        y=o[p] # 86
        d_pre_pred = softmax_d(pre_pred).dot(loss_d(ypred,y))  # 86
        d_pre_hidden = (np.dot(d_pre_hidden, w_h.T) 
                        + np.dot(d_pre_pred,w_y.T)) * act_d(pre_hidden) # 256

        # d(loss)/d(w_y) = d(loss)/d(pre_pred) * d(pre_pred)/d(w_y)
        w_y -= col(hidden) * d_pre_pred * alpha
        # d(loss)/d(w_h) = d(loss)/d(pre_hidden[p-1]) * d(pre_hidden[p-1])/d(w_h)
        if (p>0): w_h -= args[p-1][3].dot(d_pre_hidden) * alpha
        w_x -= col(x)*d_pre_hidden * alpha
    return d_pre_hidden

Now we can set up our initial weight matrices. Note that we're not using bias at all in this example, in order to keep things simpler.

In [223]:
scale=math.sqrt(2./n_input)
w_x = normal(scale=scale, size=(n_input,n_hidden))
w_y = normal(scale=scale, size=(n_hidden, n_output))
w_h = np.eye(n_hidden, dtype=np.float32)

Our loop looks much like the theano loop in the previous section, except that we have to call the backwards step ourselves.

In [224]:
overallError=0
alpha=0.0001
for n in range(10000):
    res = one_fwd(n)
    overallError+=res[-1][0]
    deriv = one_bkwd(res, n)
    if(n % 1000 == 999):
        print ("Error:{:.4f}; Gradient:{:.5f}".format(
                overallError/1000, np.linalg.norm(deriv)))
        overallError=0

Error:35.9912; Gradient:1.72601
Error:35.7669; Gradient:1.68163
Error:35.6316; Gradient:2.00085
Error:35.4998; Gradient:1.70812
Error:35.3777; Gradient:1.59752
Error:35.1609; Gradient:2.27451
Error:33.2246; Gradient:3.68605
Error:31.0654; Gradient:3.75729
Error:29.7459; Gradient:3.95957
Error:29.2273; Gradient:3.85943


## Keras GRU

Identical to the last keras rnn, but a GRU!

In [225]:
# We are using one-hot sequence inputs (not using Embedding)

model=Sequential([
        GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

# cf. Keras RNN version
#model=Sequential([
#        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
#                  activation='relu', inner_init='identity'),
#        TimeDistributed(Dense(vocab_size, activation='softmax')),
#    ])

In [229]:
# oh_x_rnn: (75110, 8, 86)
# oh_y_rnn: (75110, 8, 86)

model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fe24b06c6d0>

In [231]:
get_nexts_oh(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 's', ' ', 'c', 's', ' ']

## Theano GRU

### Separate weights

The theano GRU looks just like the simple theano RNN, except for the use of the reset and update gates. Each of these gates requires its own hidden and input weights, so we add those to our weight matrices.

In [232]:
W_h = id_and_bias(n_hidden)
W_x = init_wgts(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
rW_h = init_wgts(n_hidden, n_hidden)
rW_x = wgts_and_bias(n_input, n_hidden)
uW_h = init_wgts(n_hidden, n_hidden)
uW_x = wgts_and_bias(n_input, n_hidden)
w_all = list(chain.from_iterable([W_h, W_y, uW_x, rW_x]))
w_all.extend([W_x, uW_h, rW_h])

Here's the definition of a gate - it's just a sigmoid applied to the addition of the dot products of the input vectors.

In [233]:
def gate(x, h, W_h, W_x, b_x):
    return nnet.sigmoid(T.dot(x, W_x) + b_x + T.dot(h, W_h))

Our step is nearly identical to before, except that we multiply our hidden state by our reset gate, and we update our hidden state based on the update gate.

In [234]:
def step(x, h, W_h, b_h, W_y, b_y, uW_x, ub_x, rW_x, rb_x, W_x, uW_h, rW_h):
    reset = gate(x, h, rW_h, rW_x, rb_x)
    update = gate(x, h, uW_h, uW_x, ub_x)
    h_new = gate(x, h * reset, W_h, W_x, b_h)
    h = update*h + (1-update)*h_new
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    return h, T.flatten(y, 1)

Everything from here on is identical to our simple RNN in theano.

In [235]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                            outputs_info=[t_h0, None], non_sequences=w_all)

In [236]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

In [237]:
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [238]:
err=0.0; l_rate=0.1
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        l_rate *= 0.95
        print ("Error:{:.2f}".format(err/1000))
        err=0.0

Error:27.14
Error:22.74
Error:22.22
Error:21.26
Error:20.33
Error:20.68
Error:20.35
Error:19.75
Error:19.51
Error:19.77
Error:19.01
Error:19.11
Error:19.84
Error:18.91
Error:18.35
Error:19.52
Error:19.29
Error:18.94
Error:18.26
Error:18.12
Error:17.85
Error:17.86
Error:18.36
Error:17.85
Error:18.13
Error:17.95
Error:17.71
Error:17.77
Error:17.78
Error:17.98
Error:18.31
Error:17.88
Error:18.15
Error:17.77
Error:17.57
Error:18.26
Error:17.48
Error:18.04
Error:17.55
Error:17.70
Error:17.04
Error:17.45
Error:17.32
Error:17.65
Error:17.60
Error:17.70
Error:17.46
Error:18.44
Error:17.44
Error:17.72
Error:17.04
Error:17.43
Error:16.74
Error:16.91
Error:17.59
Error:17.36
Error:16.98
Error:17.39
Error:17.30
Error:17.17
Error:16.89
Error:17.35
Error:17.13
Error:17.16
Error:16.86
Error:16.91
Error:16.81
Error:16.73
Error:17.34
Error:16.71
Error:17.29
Error:16.80
Error:16.61
Error:16.56
Error:16.49


### Combined weights

We can make the previous section simpler and faster by concatenating the hidden and input matrices and inputs together. We're not going to step through this cell by cell - you'll see it's identical to the previous section except for this concatenation.

In [239]:
W = (shared(np.concatenate([np.eye(n_hidden), normal(size=(n_input, n_hidden))])
            .astype(np.float32)), init_bias(n_hidden))

rW = wgts_and_bias(n_input+n_hidden, n_hidden)
uW = wgts_and_bias(n_input+n_hidden, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W, W_y, uW, rW]))

In [240]:
def gate(m, W, b): return nnet.sigmoid(T.dot(m, W) + b)

In [241]:
def step(x, h, W, b, W_y, b_y, uW, ub, rW, rb):
    m = T.concatenate([h, x])
    reset = gate(m, rW, rb)
    update = gate(m, uW, ub)
    m = T.concatenate([h*reset, x])
    h_new = gate(m, W, b)
    h = update*h + (1-update)*h_new
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    return h, T.flatten(y, 1)

In [242]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                            outputs_info=[t_h0, None], non_sequences=w_all)

In [243]:
def upd_dict(wgts, grads, lr): 
    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})

In [244]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

In [245]:
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [246]:
err=0.0; l_rate=0.01
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        print ("Error:{:.2f}".format(err/1000))
        err=0.0

Error:24.79
Error:22.18
Error:22.02
Error:21.27
Error:20.51
Error:21.01
Error:20.73
Error:20.18
Error:19.98
Error:20.30
Error:19.57
Error:19.68
Error:20.32
Error:19.52
Error:19.00
Error:19.95
Error:19.71
Error:19.62
Error:18.97
Error:18.82
Error:18.48
Error:18.51
Error:19.07
Error:18.48
Error:18.75
Error:18.54
Error:18.32
Error:18.32
Error:18.32
Error:18.45
Error:18.76
Error:18.35
Error:18.60
Error:18.28
Error:18.01
Error:18.55
Error:17.86
Error:18.39
Error:17.93
Error:18.01
Error:17.35
Error:17.79
Error:17.62
Error:17.93
Error:17.79
Error:17.84
Error:17.64
Error:17.77
Error:17.67
Error:17.75
Error:17.12
Error:17.33
Error:16.73
Error:16.79
Error:17.43
Error:17.22
Error:16.74
Error:17.30
Error:17.01
Error:16.88
Error:16.67
Error:17.08
Error:16.90
Error:16.73
Error:16.54
Error:16.57
Error:16.29
Error:16.49
Error:17.00
Error:16.47
Error:16.82
Error:16.35
Error:16.18
Error:16.14
Error:16.11


### End