In [1]:
#import dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [2]:
# Read the whole corpus into memory; the context manager closes the file handle as soon as the read finishes
with open("frankenstein-2.txt", encoding="utf8") as corpus_file:
    file = corpus_file.read()

In [3]:
#tokenize the text
def tokenize_words(input):
    """Lower-case the text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to clean. The name shadows the builtin ``input``; it is
            kept unchanged so existing callers keep working.

    Returns:
        A single string holding the remaining tokens separated by one space each.
    """
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Fetch the stop-word list once and keep it as a set, instead of calling
    # stopwords.words() again for every token and scanning a list each time
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)
# Cleaned corpus as one lower-cased, space-separated string with stop words removed
processed_inputs = tokenize_words(file)

In [4]:
# Build the character vocabulary: every distinct character of the cleaned text,
# sorted, mapped to its position in that sorted order
chars = sorted(set(processed_inputs))
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [5]:
# Report the corpus size in characters and the number of distinct characters
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269995
Total vocab: 43


In [6]:
# Number of characters in each input window
seq_length = 100
# Containers for the encoded input windows and for the character that follows each window
x_data, y_data = [], []

In [7]:
#creating sequences
# Encode the corpus once, then slide a window of seq_length characters over it.
# Both lists are rebuilt from scratch here, so running this cell again cannot
# append a second copy of every pattern to lists left over from an earlier run.
encoded_inputs = [char_to_num[char] for char in processed_inputs]

# x_data[i] is the window starting at position i; y_data[i] is the character right after it
x_data = [encoded_inputs[i:i + seq_length] for i in range(input_len - seq_length)]
y_data = encoded_inputs[seq_length:input_len]

n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 269895


In [8]:
# Stack the encoded windows into an array of shape (n_patterns, seq_length, 1)
# and divide by the vocabulary size so every code lands in [0, 1)
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [9]:
# One-hot encode the next-character targets; y.shape[1] later sets the width of the softmax output layer
# NOTE(review): num_classes is not passed, so the class count is inferred from y_data —
# presumably equal to vocab_len; confirm
y = np_utils.to_categorical(y_data)

In [10]:
# Three stacked LSTM layers, each followed by dropout to limit overfitting,
# feeding a softmax layer with one unit per one-hot target class
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [11]:
# Compile the model with categorical cross-entropy loss (targets are one-hot) and the Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [12]:
# Training takes a long time, so write a checkpoint to disk each time
# the training loss reaches a new minimum
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
desired_callbacks = [checkpoint]

## Training for 4 epochs with a batch size of 256

In [13]:
# Train for 4 epochs in batches of 256; the callbacks list carries the checkpoint that saves on loss improvement
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.91398, saving model to model_weights_saved.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.91398 to 2.65575, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.65575 to 2.52454, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 00004: loss improved from 2.52454 to 2.40894, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x22d9fd0b0c8>

In [14]:
# Load the checkpointed weights back from disk, then compile the model again
# NOTE(review): the model was already compiled in an earlier cell; compiling again presumably
# starts from a fresh optimizer — confirm that is intended before training continues
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [15]:
# Reverse lookup: turn a predicted class index back into its character
num_to_char = dict(enumerate(chars))

In [16]:
# Pick a random training window to seed the generator.
# numpy.random.randint excludes its upper bound, so len(x_data) makes every window eligible
start = numpy.random.randint(0, len(x_data))
# Work on a copy: generation extends the seed, and the row stored in x_data must stay unchanged
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" es much tranquillise mind steady purpose point soul may fix intellectual eye expedition favourite dr "


In [17]:
#generate the text
# Greedy decoding: 1000 times, predict the most probable next character,
# print it, and slide the seed window forward by one position
for i in range(1000):
    # Shape as (1, window length, 1) and scale by vocab_len, the same way X was prepared for training
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new list instead of calling append(): pattern may still be the very list
    # stored in x_data, and appending would lengthen that training row in place
    pattern = pattern[1:] + [index]

eated seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare sear

## Training for 60 more epochs with a batch size of 256

In [18]:
# Train the same model object for a further 60 epochs in batches of 256, reusing the checkpoint callback
model.fit(X, y, epochs=60, batch_size=256, callbacks=desired_callbacks)

Epoch 1/60
Epoch 00001: loss improved from 2.40894 to 2.31170, saving model to model_weights_saved.hdf5
Epoch 2/60
Epoch 00002: loss improved from 2.31170 to 2.23569, saving model to model_weights_saved.hdf5
Epoch 3/60
Epoch 00003: loss improved from 2.23569 to 2.17204, saving model to model_weights_saved.hdf5
Epoch 4/60
Epoch 00004: loss improved from 2.17204 to 2.11460, saving model to model_weights_saved.hdf5
Epoch 5/60
Epoch 00005: loss improved from 2.11460 to 2.06350, saving model to model_weights_saved.hdf5
Epoch 6/60
Epoch 00006: loss improved from 2.06350 to 2.02359, saving model to model_weights_saved.hdf5
Epoch 7/60
Epoch 00007: loss improved from 2.02359 to 1.98524, saving model to model_weights_saved.hdf5
Epoch 8/60
Epoch 00008: loss improved from 1.98524 to 1.95415, saving model to model_weights_saved.hdf5
Epoch 9/60
Epoch 00009: loss improved from 1.95415 to 1.92704, saving model to model_weights_saved.hdf5
Epoch 10/60
Epoch 00010: loss improved from 1.92704 to 1.90228, 

Epoch 00034: loss improved from 1.65259 to 1.64417, saving model to model_weights_saved.hdf5
Epoch 35/60
Epoch 00035: loss improved from 1.64417 to 1.64037, saving model to model_weights_saved.hdf5
Epoch 36/60
Epoch 00036: loss improved from 1.64037 to 1.63535, saving model to model_weights_saved.hdf5
Epoch 37/60
Epoch 00037: loss improved from 1.63535 to 1.63110, saving model to model_weights_saved.hdf5
Epoch 38/60
Epoch 00038: loss improved from 1.63110 to 1.62621, saving model to model_weights_saved.hdf5
Epoch 39/60
Epoch 00039: loss improved from 1.62621 to 1.62205, saving model to model_weights_saved.hdf5
Epoch 40/60
Epoch 00040: loss improved from 1.62205 to 1.61611, saving model to model_weights_saved.hdf5
Epoch 41/60
Epoch 00041: loss improved from 1.61611 to 1.61325, saving model to model_weights_saved.hdf5
Epoch 42/60
Epoch 00042: loss improved from 1.61325 to 1.60995, saving model to model_weights_saved.hdf5
Epoch 43/60
Epoch 00043: loss improved from 1.60995 to 1.60541, sav

<tensorflow.python.keras.callbacks.History at 0x22de8c65948>

In [19]:
# Load the checkpointed weights back from disk, then compile the model again
# NOTE(review): the model is already compiled at this point; compiling again presumably
# only matters if training continues afterwards — confirm whether this call is needed
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
# Reverse lookup: turn a predicted class index back into its character
num_to_char = dict(enumerate(chars))

In [21]:
# Pick a random training window to seed the generator.
# numpy.random.randint excludes its upper bound, so len(x_data) makes every window eligible
start = numpy.random.randint(0, len(x_data))
# Work on a copy: generation extends the seed, and the row stored in x_data must stay unchanged
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" ife one turnkeys countenance expressed bad qualities often characterise class lines face hard rude l "


In [23]:
#generate the text
# Greedy decoding: 1000 times, predict the most probable next character,
# print it, and slide the seed window forward by one position
for i in range(1000):
    # Shape as (1, window length, 1) and scale by vocab_len, the same way X was prepared for training
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new list instead of calling append(): pattern may still be the very list
    # stored in x_data, and appending would lengthen that training row in place
    pattern = pattern[1:] + [index]

 see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see