In [1]:
%matplotlib inline

import utils_ted
from utils_ted import *

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
batch_size = 64

In [3]:
from keras.layers import TimeDistributed, Activation
from numpy.random import choice

[Keras 2.0 release notes](https://github.com/fchollet/keras/wiki/Keras-2.0-release-notes)

```
Recurrent layers
    output_dim -> units
    init -> kernel_initializer
    inner_init -> recurrent_initializer
    added argument bias_initializer
    W_regularizer -> kernel_regularizer
    b_regularizer -> bias_regularizer
    added arguments kernel_constraint, recurrent_constraint, bias_constraint
    dropout_W -> dropout
    dropout_U -> recurrent_dropout
    consume_less -> implementation. String values have been replaced with integers: implementation 0 (default), 1 or 2.
    LSTM only: the argument forget_bias_init has been removed. Instead there is a boolean argument unit_forget_bias, defaulting to True.
```

## Setup

We haven't really looked into the detail of how this works yet - so this is provided for self-study for those who are interested. We'll look at it closely next week.

In [4]:
path=get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path, encoding='utf8').read().lower()
#text = open(path, encoding='utf8').read()

In [5]:
print('corpus length:', len(text))

corpus length: 600893


In [6]:
!tail {path} -n25

are thinkers who believe in the saints.


144

It stands to reason that this sketch of the saint, made upon the model
of the whole species, can be confronted with many opposing sketches that
would create a more agreeable impression. There are certain exceptions
among the species who distinguish themselves either by especial
gentleness or especial humanity, and perhaps by the strength of their
own personality. Others are in the highest degree fascinating because
certain of their delusions shed a particular glow over their whole
being, as is the case with the founder of christianity who took himself
for the only begotten son of God and hence felt himself sinless; so that
through his imagination--that should not be too harshly judged since the
whole of antiquity swarmed with sons of god--he attained the same goal,
the sense of complete sinlessness, complete irresponsibility, that can
now be attained by every individual through science.--In the same manner
I have viewed t

In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1

In [8]:
print("total chars : %s" % vocab_size)

total chars : 58


In [9]:
chars.insert(0, '/n')

In [10]:
"".join(chars[1:-5])

'\n !"\'(),-.0123456789:;=?[]_abcdefghijklmnopqrstuvwxy'

In [11]:
char_indices = {c:i for i, c in enumerate(chars)}
indices_char = {i:c for i, c in enumerate(chars)}

In [12]:
text_idxs = [char_indices[c] for c in text]

In [13]:
print(text_idxs[:10])

[43, 45, 32, 33, 28, 30, 32, 1, 1, 1]


In [14]:
''.join(indices_char[idx] for idx in text_idxs[:70])

'preface\n\n\nsupposing that truth is a woman--what then? is there not gro'

## 3 char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

In [15]:
cs = 3
c1_data = [text_idxs[i] for i in range(0, len(text_idxs) - (cs+1), cs)]
c2_data = [text_idxs[i+1] for i in range(0, len(text_idxs) - (cs+1), cs)]
c3_data = [text_idxs[i+2] for i in range(0, len(text_idxs) - (cs+1), cs)]
c4_data = [text_idxs[i+3] for i in range(0, len(text_idxs) - (cs+1), cs)]

Our inputs

In [16]:
x1 = np.array(c1_data[:-2])
x2 = np.array(c2_data[:-2])
x3 = np.array(c3_data[:-2])

Our output

In [17]:
y = np.array(c4_data[:-2])

The first 4 inputs and outputs

In [18]:
x1[:4], x2[:4], x3[:4], y[:4]

(array([43, 33, 32,  1]),
 array([45, 28,  1, 46]),
 array([32, 30,  1, 48]),
 array([33, 32,  1, 43]))

In [19]:
x1.shape, y.shape

((200295,), (200295,))

The number of latent factors to create (i.e. the size of the embedding matrix)

In [20]:
n_fac = 42

Create inputs and embedding outputs for each of our 3 character inputs

In [21]:
def embedding_input(name, n_in, n_out):
    inp = Input((1, ), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)

In [22]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)

### Create and train model

Pick a size for our hidden state

In [23]:
n_hidden = 256

This is the 'green arrow' from our diagram - the layer operation from input to hidden.

In [24]:
dense_in = Dense(n_hidden, activation='relu')

Our first hidden activation is simply this function applied to the result of the embedding of the first character.

In [25]:
c1_hidden = dense_in(c1)

This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.

In [26]:
dense_hidden = Dense(n_hidden, activation='tanh')

Our second and third hidden activations sum up the previous hidden state (after applying dense_hidden) to the new input state.

In [27]:
from keras.layers import Add

In [28]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = Add()([c2_dense, hidden_2])

In [29]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = Add()([c3_dense, hidden_3])

This is the 'blue arrow' from our diagram - the layer operation from hidden to output.

In [30]:
dense_out = Dense(vocab_size, activation='softmax')

The third hidden state is the input to our output layer.

In [31]:
c4_out = dense_out(c3_hidden)

In [32]:
model = Model([c1_in, c2_in, c3_in], c4_out)

In [33]:
model.compile(Adam(), loss='sparse_categorical_crossentropy')

In [34]:
model.optimizer.lr = 1e-6

In [35]:
model.fit([x1, x2, x3], y, batch_size=batch_size, epochs=4, verbose=2)

Epoch 1/4
 - 19s - loss: 4.0575
Epoch 2/4
 - 17s - loss: 4.0513
Epoch 3/4
 - 16s - loss: 4.0450
Epoch 4/4
 - 16s - loss: 4.0386


<keras.callbacks.History at 0x7f44af54c080>

In [36]:
model.optimizer.lr = 0.01

In [37]:
model.fit([x1, x2, x3], y, batch_size=batch_size, epochs=4, verbose=2)

Epoch 1/4
 - 16s - loss: 4.0320
Epoch 2/4
 - 16s - loss: 4.0253
Epoch 3/4
 - 16s - loss: 4.0182
Epoch 4/4
 - 16s - loss: 4.0108


<keras.callbacks.History at 0x7f44af5a9898>

In [38]:
model.optimizer.lr = 1e-6

In [39]:
model.fit([x1, x2, x3], y, batch_size=batch_size, epochs=4, verbose=2)

Epoch 1/4
 - 16s - loss: 4.0031
Epoch 2/4
 - 16s - loss: 3.9949
Epoch 3/4
 - 16s - loss: 3.9862
Epoch 4/4
 - 16s - loss: 3.9769


<keras.callbacks.History at 0x7f44af12e048>

In [252]:
model.optimizer.lr = 0.01

In [253]:
model.fit([x1, x2, x3], y, batch_size=batch_size, epochs=4, verbose=2)

Epoch 1/4
 - 16s - loss: 3.9519
Epoch 2/4
 - 17s - loss: 3.9397
Epoch 3/4
 - 16s - loss: 3.9268
Epoch 4/4
 - 16s - loss: 3.9129


<keras.callbacks.History at 0x7f2c74e972e8>

### Test model

In [40]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    #arrs = [np.array(i).reshape(1,) for i in idxs] # to fit in the Input() input shape
    arrs = [np.array(i)[np.newaxis] for i in idxs] 
    preds = model.predict(arrs)
    preds_idxs = np.argmax(preds)
    return chars[preds_idxs]

In [41]:
get_next('zzz')

' '

In [42]:
get_next(' th')

' '

In [43]:
get_next(' an')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [81]:
cs = 8 

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to out model.

In [82]:
c_in_data = [[text_idxs[i+n] for i in range(0, len(text_idxs) - (cs+1), cs)] for n in range(cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [83]:
c_out_data = [text_idxs[i+cs] for i in range(0, len(text_idxs) - (cs+1), cs)]

In [84]:
xs = [np.array(c[:-2]) for c in c_in_data]

In [85]:
len(xs), xs[0].shape

(8, (75109,))

In [86]:
y = np.array(c_out_data[:-2])

So each column below is one series of 8 characters from the text.

In [87]:
[xs[n][:cs] for n in range(cs)]

[array([43,  1, 36,  2, 46, 41, 47,  2]),
 array([45,  1, 41, 47,  2,  9, 35, 47]),
 array([32, 46, 34, 45, 28,  9, 32, 35]),
 array([33, 48,  2, 48,  2, 50, 41, 32]),
 array([28, 43, 47, 47, 50, 35, 24, 45]),
 array([30, 43, 35, 35, 42, 28,  2, 32]),
 array([32, 42, 28,  2, 40, 47, 36,  2]),
 array([ 1, 46, 47, 36, 28,  2, 46, 41])]

...and this is the next character after each sequence.

In [88]:
y[:cs]

array([ 1, 36,  2, 46, 41, 47,  2, 42])

In [89]:
n_fac = 42

### Create and train model

In [90]:
def embedding_input(name, n_in, n_out):
    inp = Input((1, ), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)

In [91]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]

In [92]:
n_hidden = 256

In [93]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', kernel_initializer='identity')
dense_out = Dense(vocab_size, activation='softmax')

The first character of each sequence goes through dense_in(), to create our first hidden activations.

In [94]:
hidden = dense_in(c_ins[0][1])

Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.

In [95]:
for i in range(1, cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = Add()([c_dense, hidden])

Putting the final hidden state through dense_out() gives us our output.

In [96]:
c_out = dense_out(hidden)

So now we can create our model.

In [97]:
model = Model([c[0] for c in c_ins], c_out)

In [98]:
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy')

In [62]:
model.fit(xs, y, batch_size=batch_size, epochs=12, verbose=2)

Epoch 1/12
 - 11s - loss: 2.4520
Epoch 2/12
 - 11s - loss: 2.2031
Epoch 3/12
 - 11s - loss: 2.1120
Epoch 4/12
 - 11s - loss: 2.0458
Epoch 5/12
 - 11s - loss: 1.9956
Epoch 6/12
 - 11s - loss: 1.9506
Epoch 7/12
 - 11s - loss: 1.9137
Epoch 8/12
 - 11s - loss: 1.8822
Epoch 9/12
 - 11s - loss: 1.8507
Epoch 10/12
 - 11s - loss: 1.8261
Epoch 11/12
 - 11s - loss: 1.8016
Epoch 12/12
 - 11s - loss: 1.7812


<keras.callbacks.History at 0x7f44aeb00b70>

### Test model

In [63]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    #arrs = [np.array(i).reshape(1,) for i in idxs] # to fit in the Input() input shape
    arrs = [np.array(i)[np.newaxis] for i in idxs] 
    preds = model.predict(arrs)
    preds_idxs = np.argmax(preds)
    return chars[preds_idxs]

In [64]:
get_next('for thos')

' '

In [65]:
get_next('part of ')

't'

In [66]:
get_next('queens a')

'n'

## Our first RNN with keras!

In [99]:
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 58)

This is nearly exactly equivalent to the RNN we built ourselves in the previous section.

In [100]:
from keras.layers import SimpleRNN

In [101]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    SimpleRNN(n_hidden, activation='relu', recurrent_initializer='identity'),
    Dense(vocab_size, activation='softmax')
])

In [102]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 8, 42)             2436      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 256)               76544     
_________________________________________________________________
dense_11 (Dense)             (None, 58)                14906     
Total params: 93,886
Trainable params: 93,886
Non-trainable params: 0
_________________________________________________________________


In [71]:
model.compile(Adam(), loss='sparse_categorical_crossentropy')

In [72]:
model.fit(np.concatenate(xs, axis=1), y, batch_size=batch_size, epochs=8, verbose=2)

Epoch 1/8
 - 8s - loss: 2.6707
Epoch 2/8
 - 8s - loss: 2.2110
Epoch 3/8
 - 8s - loss: 2.0270
Epoch 4/8
 - 8s - loss: 1.8919
Epoch 5/8
 - 8s - loss: 1.7947
Epoch 6/8
 - 8s - loss: 1.7163
Epoch 7/8
 - 8s - loss: 1.6542
Epoch 8/8
 - 8s - loss: 1.6029


<keras.callbacks.History at 0x7f44ade06be0>

In [73]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    #arrs = [np.array(i).reshape(1,) for i in idxs] # to fit in the Input() input shape
    arrs = np.array(idxs)[np.newaxis,:] 
    preds = model.predict(arrs)[0]
    preds_idxs = np.argmax(preds)
    return chars[preds_idxs]

In [74]:
get_next_keras('this is ')

't'

In [75]:
get_next_keras('part of ')

't'

In [76]:
get_next_keras('queens a')

'n'

## Returning sequences

### Create inputs

To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)

Here, c_out_dat is identical to c_in_dat, but moved across 1 character.

In [109]:
#c_in_data = [[text_idxs[i+n] for i in range(0, len(text_idxs) - (cs+1), cs)] for n in range(cs)]

# c_out_data = [text_idxs[i+cs] for i in range(0, len(text_idxs) - (cs+1), cs)]
c_out_data = [[text_idxs[i+n] for i in range(1, len(text_idxs) - cs, cs)] for n in range(cs)] 

In [110]:
ys = [np.array(c[:-2]) for c in c_out_data]

Reading down each column shows one set of inputs and outputs.

In [111]:
[xs[n][:cs] for n in range(cs)]

[array([43,  1, 36,  2, 46, 41, 47,  2]),
 array([45,  1, 41, 47,  2,  9, 35, 47]),
 array([32, 46, 34, 45, 28,  9, 32, 35]),
 array([33, 48,  2, 48,  2, 50, 41, 32]),
 array([28, 43, 47, 47, 50, 35, 24, 45]),
 array([30, 43, 35, 35, 42, 28,  2, 32]),
 array([32, 42, 28,  2, 40, 47, 36,  2]),
 array([ 1, 46, 47, 36, 28,  2, 46, 41])]

In [112]:
[ys[n][:cs] for n in range(cs)]

[array([45,  1, 41, 47,  2,  9, 35, 47]),
 array([32, 46, 34, 45, 28,  9, 32, 35]),
 array([33, 48,  2, 48,  2, 50, 41, 32]),
 array([28, 43, 47, 47, 50, 35, 24, 45]),
 array([30, 43, 35, 35, 42, 28,  2, 32]),
 array([32, 42, 28,  2, 40, 47, 36,  2]),
 array([ 1, 46, 47, 36, 28,  2, 46, 41]),
 array([ 1, 36,  2, 46, 41, 47,  2, 42])]

### Create and train model

In [116]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', kernel_initializer='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')

We're going to pass a vector of all zeros as our starting point - here's our input layers for that:

In [117]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [118]:
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = Add()([c_dense, hidden])
    # every layer now has an output
    outs.append(dense_out(hidden))

In [119]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(Adam(), loss='sparse_categorical_crossentropy')

In [124]:
#zeros = np.tile(np.zeros(n_fac), (len(xs[0]), 1))
zeros = np.zeros((len(xs[0]), n_fac))
zeros.shape

(75109, 42)

In [126]:
model.fit([zeros]+xs, ys, batch_size=batch_size, epochs=12, verbose=2)

Epoch 1/12
 - 20s - loss: 16.1867 - output_loss_1: 2.4389 - output_loss_2: 2.2892 - output_loss_3: 2.0801 - output_loss_4: 1.9468 - output_loss_5: 1.8776 - output_loss_6: 1.8557 - output_loss_7: 1.8599 - output_loss_8: 1.8385
Epoch 2/12
 - 20s - loss: 16.0694 - output_loss_1: 2.4367 - output_loss_2: 2.2869 - output_loss_3: 2.0738 - output_loss_4: 1.9339 - output_loss_5: 1.8593 - output_loss_6: 1.8316 - output_loss_7: 1.8338 - output_loss_8: 1.8134
Epoch 3/12
 - 20s - loss: 15.9717 - output_loss_1: 2.4362 - output_loss_2: 2.2858 - output_loss_3: 2.0680 - output_loss_4: 1.9212 - output_loss_5: 1.8424 - output_loss_6: 1.8138 - output_loss_7: 1.8130 - output_loss_8: 1.7914
Epoch 4/12
 - 20s - loss: 15.8948 - output_loss_1: 2.4350 - output_loss_2: 2.2839 - output_loss_3: 2.0676 - output_loss_4: 1.9113 - output_loss_5: 1.8277 - output_loss_6: 1.7971 - output_loss_7: 1.7978 - output_loss_8: 1.7744
Epoch 5/12
 - 20s - loss: 15.8254 - output_loss_1: 2.4346 - output_loss_2: 2.2829 - output_loss_

<keras.callbacks.History at 0x7f44acc4dfd0>

### Test model

In [127]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    preds = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(p)] for p in preds]

In [128]:
get_nexts(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 't', ' ', 's', 'n', ' ']

In [129]:
get_nexts(' part of')

[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']


['t', 'o', 'r', 'n', ' ', 'o', 'f', ' ']

### Sequence model with keras

In [130]:
n_hidden, n_fac, cs, vocab_size

(256, 42, 8, 58)

To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer.

In [134]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    SimpleRNN(n_hidden, return_sequences=True, activation='relu', recurrent_initializer='identity'),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

In [135]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 8, 42)             2436      
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 8, 256)            76544     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 8, 58)             14906     
Total params: 93,886
Trainable params: 93,886
Non-trainable params: 0
_________________________________________________________________


In [136]:
model.compile(Adam(), loss='sparse_categorical_crossentropy')

In [142]:
xs[0].shape, np.squeeze(xs).shape

((75109,), (8, 75109))

In [150]:
x_rnn = np.stack(np.squeeze(xs), axis=1)
y_rnn = np.atleast_3d(np.stack(np.squeeze(ys), axis=1))

In [151]:
x_rnn.shape, y_rnn.shape

((75109, 8), (75109, 8, 1))

In [152]:
model.fit(x_rnn, y_rnn, batch_size=batch_size, epochs=8, verbose=2)

Epoch 1/8
 - 9s - loss: 2.3254
Epoch 2/8
 - 8s - loss: 1.9355
Epoch 3/8
 - 8s - loss: 1.8328
Epoch 4/8
 - 8s - loss: 1.7782
Epoch 5/8
 - 8s - loss: 1.7437
Epoch 6/8
 - 8s - loss: 1.7200
Epoch 7/8
 - 8s - loss: 1.7011
Epoch 8/8
 - 8s - loss: 1.6865


<keras.callbacks.History at 0x7f4479c6ba58>

In [160]:
def get_nexts_keras(inp):
    idxs = [char_indices[c] for c in inp]
    #arrs = [np.array(i).reshape(1,) for i in idxs] # to fit in the Input() input shape
    arrs = np.array(idxs)[np.newaxis,:] 
    preds = model.predict(arrs)[0]
    print(list(inp))
    return [chars[np.argmax(p)] for p in preds]

In [161]:
get_nexts_keras(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 's', ' ', 'c', 'n', ' ']