In [1]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Read input file and prints the length of the text
input_file = "dinos.txt"
with io.open(input_file, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

corpus length: 19910


In [3]:
chars = sorted(list(set(text)))
data_size, vocab_size = len(text), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 19910 total characters and 27 unique characters in your data.


In [4]:
names = text.split('\n')

In [5]:
np.random.shuffle(names)

In [6]:
max_char = len(max(names, key=len)) + 1

In [7]:
# Creates a character mapper (character to index - index to character mappings in memory)
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [8]:
X = np.zeros((len(names), max_char, vocab_size))
Y = np.zeros((len(names), max_char, vocab_size))

In [9]:
for n in range(len(names)):
   i = 0
   for c in range(len(names[n]) + 1):
      # always make the first element an non character because it's how we want to train the model.
      if c > 0:
        X[n, c, char_indices[names[n][c-1]]] = 1
        Y[n, c-1, char_indices[names[n][c-1]]] = 1
   Y[n, len(names[n]), char_indices['\n']] = 1
   

In [10]:
print(X.shape)
print(Y.shape)

(1537, 27, 27)
(1537, 27, 27)


In [11]:
# build the model network architecture: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(max_char, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

Build model...


In [12]:
# Here I am using RMSprop as the optimizer, but it could be changed for something else.
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [13]:
def sample():
    stop = False
    cur_index = 0
    word = []
    x_pred = np.zeros((1, max_char, vocab_size))
    while not stop:
        probs = list(model.predict(x_pred)[0, cur_index])
        probs = probs / np.sum(probs)
        char_index = np.random.choice(range(vocab_size), p=probs)
        pred_char = indices_char[char_index]
        if pred_char == "\n" or cur_index == max_char -1:
            stop = True
            break
        word.append(pred_char)
        cur_index += 1
        x_pred[0, cur_index, char_index] = 1
    print("".join(word))

In [14]:
def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print('----- Generating text after Epoch: %d' % epoch)
    sample()        
        


In [15]:
# Training the model, then predict the next 400 characters for every opoch
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
print(model.summary(90))

model.fit(X, Y,
          batch_size=64,
          epochs=100,
          callbacks=[print_callback])

__________________________________________________________________________________________
Layer (type)                            Output Shape                        Param #       
lstm_1 (LSTM)                           (None, 27, 64)                      23552         
__________________________________________________________________________________________
dense_1 (Dense)                         (None, 27, 27)                      1755          
Total params: 25,307
Trainable params: 25,307
Non-trainable params: 0
__________________________________________________________________________________________
None
Epoch 1/100
----- Generating text after Epoch: 0
kiatonon
Epoch 2/100
----- Generating text after Epoch: 1
xianamsalaus
Epoch 3/100
----- Generating text after Epoch: 2
megxanasaurus
Epoch 4/100
----- Generating text after Epoch: 3
xewukosaria
Epoch 5/100
----- Generating text after Epoch: 4
archadodonthus
Epoch 6/100
----- Generating text after Epoch: 5
bracothtikus
Epoch 7/1

----- Generating text after Epoch: 54
compcodon
Epoch 56/100
----- Generating text after Epoch: 55
vinabusaurus
Epoch 57/100
----- Generating text after Epoch: 56
lansasaurus
Epoch 58/100
----- Generating text after Epoch: 57
ephadan
Epoch 59/100
----- Generating text after Epoch: 58
austropelta
Epoch 60/100
----- Generating text after Epoch: 59
leytasaurus
Epoch 61/100
----- Generating text after Epoch: 60
suypan
Epoch 62/100
----- Generating text after Epoch: 61
stretinisaurus
Epoch 63/100
----- Generating text after Epoch: 62
leanassus
Epoch 64/100
----- Generating text after Epoch: 63
eiminosaurus
Epoch 65/100
----- Generating text after Epoch: 64
camarsysaurus
Epoch 66/100
----- Generating text after Epoch: 65
boharisaurus
Epoch 67/100
----- Generating text after Epoch: 66
simivomimus
Epoch 68/100
----- Generating text after Epoch: 67
sterrholophus
Epoch 69/100
----- Generating text after Epoch: 68
triasaurus
Epoch 70/100
----- Generating text after Epoch: 69
jeohenglongosaurus
Ep

<keras.callbacks.History at 0x7fa3ec7cca90>

In [16]:
print(indices_char)

{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
