The training text has been downloaded from Project Gutenberg and saved as `textfile.txt` in the working directory. It is available as a plain-text (UTF-8) file at the following link: https://www.gutenberg.org/files/11/11-0.txt

In [17]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import sys

In [2]:
filename = 'textfile.txt'
# Read the whole corpus and lower-case it to shrink the vocabulary.
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps decoding independent of the platform default
# (the Gutenberg "-0" plain-text files are UTF-8).
with open(filename, encoding='utf-8') as text_file:
    raw_text = text_file.read().lower()

In [3]:
# build the vocabulary: every distinct character in the text, in sorted order,
# and a lookup from each character to its position in that ordering
chars = sorted(set(raw_text))
char_to_int = {symbol: code for code, symbol in enumerate(chars)}

In [4]:
# corpus length (in characters) and vocabulary size
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  163817
Total Vocab:  60


In [5]:
# Build the training pairs: each input is a window of seq_length consecutive
# characters (as integers) and the matching output is the character that
# immediately follows that window.
seq_length = 100
encoded_text = [char_to_int[symbol] for symbol in raw_text]
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[begin:begin + seq_length] for begin in window_starts]
dataY = [encoded_text[begin + seq_length] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  163717


In [6]:
# Each entry of dataX is one window of seq_length (100) integer-encoded
# characters; the character that follows each window is stored at the same
# position in dataY.
# NOTE(review): this displays the entire list - consider showing a slice.
dataX

[[45,
  47,
  44,
  39,
  34,
  32,
  49,
  1,
  36,
  50,
  49,
  34,
  43,
  31,
  34,
  47,
  36,
  57,
  48,
  1,
  30,
  41,
  38,
  32,
  34,
  57,
  48,
  1,
  30,
  33,
  51,
  34,
  43,
  49,
  50,
  47,
  34,
  48,
  1,
  38,
  43,
  1,
  52,
  44,
  43,
  33,
  34,
  47,
  41,
  30,
  43,
  33,
  9,
  1,
  31,
  54,
  1,
  41,
  34,
  52,
  38,
  48,
  1,
  32,
  30,
  47,
  47,
  44,
  41,
  41,
  0,
  0,
  49,
  37,
  38,
  48,
  1,
  34,
  31,
  44,
  44,
  40,
  1,
  38,
  48,
  1,
  35,
  44,
  47,
  1,
  49,
  37,
  34,
  1,
  50,
  48,
  34,
  1,
  44,
  35],
 [47,
  44,
  39,
  34,
  32,
  49,
  1,
  36,
  50,
  49,
  34,
  43,
  31,
  34,
  47,
  36,
  57,
  48,
  1,
  30,
  41,
  38,
  32,
  34,
  57,
  48,
  1,
  30,
  33,
  51,
  34,
  43,
  49,
  50,
  47,
  34,
  48,
  1,
  38,
  43,
  1,
  52,
  44,
  43,
  33,
  34,
  47,
  41,
  30,
  43,
  33,
  9,
  1,
  31,
  54,
  1,
  41,
  34,
  52,
  38,
  48,
  1,
  32,
  30,
  47,
  47,
  44,
  41,
  41,
  0,
  0,
 

In [7]:
# dataY[k] is the integer code of the character that comes right after the
# 100-character window stored in dataX[k] (i.e. the 101st character).
# NOTE(review): this displays the entire list - consider showing a slice.
dataY

[1,
 30,
 43,
 54,
 44,
 43,
 34,
 1,
 30,
 43,
 54,
 52,
 37,
 34,
 47,
 34,
 1,
 30,
 49,
 1,
 43,
 44,
 1,
 32,
 44,
 48,
 49,
 1,
 30,
 43,
 33,
 1,
 52,
 38,
 49,
 37,
 0,
 30,
 41,
 42,
 44,
 48,
 49,
 1,
 43,
 44,
 1,
 47,
 34,
 48,
 49,
 47,
 38,
 32,
 49,
 38,
 44,
 43,
 48,
 1,
 52,
 37,
 30,
 49,
 48,
 44,
 34,
 51,
 34,
 47,
 11,
 1,
 1,
 54,
 44,
 50,
 1,
 42,
 30,
 54,
 1,
 32,
 44,
 45,
 54,
 1,
 38,
 49,
 9,
 1,
 36,
 38,
 51,
 34,
 1,
 38,
 49,
 1,
 30,
 52,
 30,
 54,
 1,
 44,
 47,
 0,
 47,
 34,
 10,
 50,
 48,
 34,
 1,
 38,
 49,
 1,
 50,
 43,
 33,
 34,
 47,
 1,
 49,
 37,
 34,
 1,
 49,
 34,
 47,
 42,
 48,
 1,
 44,
 35,
 1,
 49,
 37,
 34,
 1,
 45,
 47,
 44,
 39,
 34,
 32,
 49,
 1,
 36,
 50,
 49,
 34,
 43,
 31,
 34,
 47,
 36,
 1,
 41,
 38,
 32,
 34,
 43,
 48,
 34,
 1,
 38,
 43,
 32,
 41,
 50,
 33,
 34,
 33,
 0,
 52,
 38,
 49,
 37,
 1,
 49,
 37,
 38,
 48,
 1,
 34,
 31,
 44,
 44,
 40,
 1,
 44,
 47,
 1,
 44,
 43,
 41,
 38,
 43,
 34,
 1,
 30,
 49,
 1,
 52,
 52,
 52,
 11,
 36,

In [8]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes by dividing by the vocabulary size
X = X / float(n_vocab)
# One-hot encode the output variable. num_classes is pinned to the vocabulary
# size: left to its default, to_categorical infers the width from
# max(dataY) + 1, which would be too narrow if the highest-indexed character
# never occurs as a target, leaving the model output out of step with n_vocab.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [9]:
# Inspect the one-hot encoded targets produced by to_categorical(dataY).
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
# Inspect the model input: dataX reshaped to (n_patterns, seq_length, 1) and
# divided by n_vocab.
X

array([[[0.75      ],
        [0.78333333],
        [0.73333333],
        ...,
        [0.01666667],
        [0.73333333],
        [0.58333333]],

       [[0.78333333],
        [0.73333333],
        [0.65      ],
        ...,
        [0.73333333],
        [0.58333333],
        [0.01666667]],

       [[0.73333333],
        [0.65      ],
        [0.56666667],
        ...,
        [0.58333333],
        [0.01666667],
        [0.5       ]],

       ...,

       [[0.73333333],
        [0.01666667],
        [0.61666667],
        ...,
        [0.73333333],
        [0.66666667],
        [0.8       ]],

       [[0.01666667],
        [0.61666667],
        [0.56666667],
        ...,
        [0.66666667],
        [0.8       ],
        [0.18333333]],

       [[0.61666667],
        [0.56666667],
        [0.68333333],
        ...,
        [0.8       ],
        [0.18333333],
        [0.        ]]])

In [11]:
# Single-layer LSTM followed by dropout and a softmax over the output classes.
# The input shape is (time steps, features), taken from X without its
# leading samples axis.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [12]:
# Checkpoint the model after every epoch in which the training loss reaches
# a new minimum; the epoch number and loss are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
# train for 20 epochs in mini-batches of 256 patterns
model.fit(X, y, batch_size=256, epochs=20, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 3.05392, saving model to weights-improvement-01-3.0539.hdf5
Epoch 2/20

Epoch 00002: loss improved from 3.05392 to 2.88478, saving model to weights-improvement-02-2.8848.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.88478 to 2.81685, saving model to weights-improvement-03-2.8169.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.81685 to 2.75813, saving model to weights-improvement-04-2.7581.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.75813 to 2.70490, saving model to weights-improvement-05-2.7049.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.70490 to 2.65745, saving model to weights-improvement-06-2.6574.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.65745 to 2.61544, saving model to weights-improvement-07-2.6154.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.61544 to 2.57255, saving model to weights-improvement-08-2.5726.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.57255 to 2.53410, saving model to weig

<keras.callbacks.History at 0x13c16eb38>

In [18]:
# reverse lookup: integer code -> character (inverse of char_to_int)
int_to_char = dict(enumerate(chars))

In [19]:
# Pick a random seed pattern. numpy.random.randint excludes its upper bound,
# so len(dataX) (not len(dataX) - 1) makes every pattern selectable.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: appending generated characters to dataX[start] itself would
# silently grow that training pattern and break a later re-run of the
# reshape cell.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# Generate 1000 characters, one at a time: predict the most likely next
# character for the current window, emit it, then slide the window forward.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    # same normalization as applied to the training input
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy decoding: take the class with the highest predicted probability
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # slide the window: append the new character and drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" ping so close to her: first, because the
duchess was very ugly; and secondly, because she was exactl "
y an alr to tee the pabte  and the wai io the wait on ti the woile saster an the was oo the tast oi the tooee tart oe the tooee tar an alr ho the tooee tar an alr aarir an alr ho the tabd to the tas oo the tart 
and the was aolin to tee toet an an aeren an alrc and the was oo the tast 
and the was aolin to the wooee tas an anl aeriri to the whil wo the whi sooe th the tooee tar an alr ho the tooee tar an alr the tabd to the whel wo the whi sooe th the tooee th the wooee tas an all the tabd to the whil wo the whe sabt to tea ant aoo aoo an an anrleisg toee  ‘he wou de wou dane in the woile ’hu  the mast an a lote of the toae-’ 
‘ie doust sai ae a lort of the toit,’ said alice. 
‘ie courtes then ’ said the marte rabtit. 
‘io whu aan no toen i sai ’o tea ’hu ’hu ’he fan en a loer tu tee the soiee to the whi woil  and the toite oe the woile  and the soiee to the sooer oa thin h se