In [4]:
import re
import numpy as np
from IPython.display import clear_output

from tensorflow.keras.layers import Dense, LSTM, Input, Embedding, Dropout
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import LambdaCallback



In [5]:
load_save_model = False
train_model = True

In [6]:
token_type = 'word'

In [13]:
seq_length = 20

filename = "./data/aesop/data.txt"

with open(filename, encoding='utf-8-sig') as f:
    text = f.read()
    
    
#removing text before and after the main stories
start = text.find("THE FOX AND THE GRAPES\n\n\n")
end = text.find("ILLUSTRATIONS\n\n\n[")
text = text[start:end]

18358


In [12]:
start_story = '| ' * seq_length
print(start_story)
text = start_story + text
text = text.lower()
text = text.replace('\n\n\n\n\n', start_story)
text = text.replace('\n', ' ')
text = re.sub('  +', '. ', text).strip()
text = text.replace('..', '.')

text = re.sub('([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
text = re.sub('\s{2,}', ' ', text)

| | | | | | | | | | | | | | | | | | | | 


In [9]:
len(text)

213716

In [10]:
text

 of omen which foretold the future , and was accordingly held in great respect by them . she was very anxious to get the same sort of reputation herself ; and , one day , seeing some travellers approaching , she flew on to a branch of a tree at the roadside and cawed as loud as she could . the travellers were in some dismay at the sound , for they feared it might be a bad omen ; till one of them , spying the crow , said to his companions , " it\'s all right , my friends , we can go on without fear , for it\'s only a crow and that means nothing . " . those who pretend to be something they are not only make . themselves ridiculous . | | | | | | | | | | | | | | | | | | | | the witch . a witch professed to be able to avert the anger of the gods by means of charms , of which she alone possessed the secret ; and she drove a brisk trade , and made a fat livelihood out of it . but certain persons accused her of black magic and carried her before the judges , and demanded that she should be put

In [14]:
if token_type == 'word':
    tokenizer = Tokenizer(char_level = False, filters = '')
else:
    tokenizer = Tokenizer(char_level = True, filters = '', lower = False)
    
    
tokenizer.fit_on_texts([text])

total_words = len(tokenizer.word_index) + 1

token_list = tokenizer.texts_to_sequences([text])[0]

In [15]:
total_words

8315

In [16]:
print(tokenizer.word_index)
print(token_list)

4, 259, 7594, 177, 93, 4, 446, 5, 2124, 7595, 3, 37, 7596, 82, 8, 7597, 445, 7598, 2, 1, 492, 110, 7599, 15, 1, 925, 7600, 7601, 97, 7602, 3, 1, 1293, 62, 1, 60, 195, 697, 2216, 2, 776, 38, 9, 275, 173, 5, 1, 7603, 1335, 7604, 158, 465, 1064, 7605, 12, 294, 465, 1625, 7606, 1386, 2, 7, 7607, 209, 33, 7608, 572, 20, 4, 2306, 2176, 9, 1, 7609, 140, 1, 2306, 146, 8, 126, 11, 4, 537, 26, 5, 7610, 29, 4, 7611, 2, 2212, 7612, 16, 191, 1, 1368, 7613, 16, 75, 34, 27, 7614, 21, 1, 1386, 126, 2188, 6, 7615, 572, 20, 503, 2, 1308, 38, 15, 233, 1387, 7616, 12, 1, 7617, 24, 288, 16, 28, 6, 380, 67, 112, 2, 1, 26, 30, 24, 67, 7618, 6, 380, 288, 161, 1, 1386, 448, 383, 18, 33, 7619, 12, 6, 80, 51, 98, 158, 1282, 53, 15, 39, 7620, 12, 7621, 1282, 1, 2297, 2, 18, 251, 95, 10, 93, 25, 7622, 7623, 291, 7624, 3, 7625, 2, 7626, 8, 69, 865, 147, 4, 820, 284, 21, 6, 81, 342, 900, 444, 9, 1205, 5, 14, 55, 520, 3, 27, 49, 7627, 2, 19, 7628, 6, 2289, 52, 16, 15, 7, 7629, 3, 7, 1632, 7630, 131, 938, 16, 7631, 9,

In [18]:
def generate_sequence(token_list, step):
    X = []
    y = []

    for i in range(0, len(token_list) - seq_length,step):
        X.append(token_list[i:i + seq_length])
        y.append(token_list[i + seq_length])

    y = to_categorical(y, num_classes= total_words)

    num_seq = len(X)
    print('Number of sequences:', num_seq,"\n")

    return X,y,num_seq 

step = 1
seq_length = 20
X,y,num_seq = generate_sequence(token_list,step)

X = np.array(X)
y = np.array(y)

Number of sequences: 34904 



In [19]:
X.shape

(34904, 20)

In [20]:
y.shape

(34904, 8315)

In [23]:
if load_save_model:
    # model = load_model('./saved_models/lstm_aesop_1.h5')
    model = load_model('./saved_models/aesop_dropout_100.h5')

else:

    n_units = 256
    embedding_size = 100

    text_in = Input(shape = (None,))
    embedding = Embedding(total_words, embedding_size)
    x = embedding(text_in)
    x = LSTM(n_units)(x)
    # x = Dropout(0.2)(x)
    text_out = Dense(total_words, activation = 'softmax')(x)

    model = Model(text_in, text_out)

    opti = RMSprop(lr = 0.001)
    model.compile(loss='categorical_crossentropy', optimizer=opti)

In [24]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         831500    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 8315)              2136955   
Total params: 3,334,023
Trainable params: 3,334,023
Non-trainable params: 0
_________________________________________________________________
