In [1]:
# import dependencies 
import numpy as np 
import sys

In [2]:
# load data 
file = open("frankenstein.txt").read()

In [3]:
# tokenize the words from the test file 
# standardizatoin 
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

def tokenize_words(input):
    # making the input word a lower case 
    input = input.lower()
    
    # making tokens using tokenizer 
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    
    # filtering the stop words from the tokens 
    filtered = filter(lambda token : token not in stopwords.words('english'), tokens)
    return "".join(filtered)

processed_inputs = tokenize_words(file)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# sorting the char and converting char to num 
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i, c in enumerate(chars))

In [5]:
# determining the length of the variabes
input_len = len(processed_inputs)
vocab_len = len(chars)
print('total number of characters : ', input_len)
print('total number of vocabs : ', vocab_len)

total number of characters :  232972
total number of vocabs :  37


In [6]:
# generating out dataset

seq_length = 100
X_data = []
y_data = []

In [7]:
# creating input sequence and output sequence 

for i in range(0, input_len - seq_length):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    # converting the sequences in numbers and adding it to our dataset
    X_data.append([char_to_num[char] for char in in_seq])
    y_data.append([char_to_num[out_seq]])

# note : try printin an inividual in_seq and out_seq for better understanding 

In [8]:
# number of patters we have 
n_patterns = len(X_data)
print('total patterns : ', n_patterns)

total patterns :  232872


In [9]:
# converting the input sequence to numpy array 
X = np.reshape(X_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [10]:
# one_hot encoding 
from keras.utils import np_utils 
y = np_utils.to_categorical(y_data)

Using TensorFlow backend.


In [11]:
# creating a sequencial model 
from keras import Sequential
from keras.layers import Dense, Dropout, LSTM
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

W0526 06:05:25.700256 140433450608448 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0526 06:05:25.857807 140433450608448 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0526 06:05:25.884356 140433450608448 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0526 06:05:26.753751 140433450608448 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.

In [12]:
# compiling the model 
model.compile(loss='categorical_crossentropy', optimizer='adam')

W0526 06:05:31.991774 140433450608448 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0526 06:05:32.017323 140433450608448 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [13]:
# saving weights
from keras.callbacks import ModelCheckpoint
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [14]:
# fit the model and let it train 
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

W0526 06:05:53.630858 140433450608448 deprecation.py:323] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/4

Epoch 00001: loss improved from inf to 2.92889, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.92889 to 2.90009, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.90009 to 2.84531, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.84531 to 2.76040, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7fb89c52a5f8>

In [15]:
# recompiling the model with the saved weights 
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [16]:
# output of the model back into the characters 
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [20]:
# generating sequence of characters using the random seed

start = np.random.randint(0, len(X_data) - 1)
pattern = X_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" vouredvainrestorelifeappearedhandsomeyoungmanfivetwentyyearsageapparentlystrangledsignviolenceexcept "


In [23]:
# generating the text 

for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

rerererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererertedrererert