<a href="https://colab.research.google.com/github/vinodgaitonde/ProjIdeas/blob/main/WordPredict_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Required packages

In [13]:
import random
import sys

import numpy as np
from keras.layers import Dense, Dropout, Input, LSTM
from keras.models import Sequential
from keras.optimizers import RMSprop

Load Text

In [14]:
# Read the whole corpus into memory as a single lowercase string.
filename = "./sample_data/1661-split.txt"
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until garbage collection.
with open(filename, 'r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
# NOTE(review): the printed sample appears to begin with a byte-order mark;
# encoding='utf-8-sig' would strip it but would also change the character
# counts below -- confirm before switching.
raw_text = raw_text.lower()
print('corpus length:', len(raw_text))
# Show the first 100 characters as a quick sanity check of the load.
print(raw_text[0:100])

corpus length: 282300
﻿
project gutenberg's the adventures of sherlock holmes, by arthur conan doyle

this ebook is for th


# Data Preparation - Clean text

In [17]:
# Drop every digit character from the corpus; all other characters are kept in order.
raw_text = ''.join(filter(lambda ch: not ch.isdigit(), raw_text))

In [18]:
# Build the vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(raw_text))
print('total chars:', len(chars))

total chars: 56


In [19]:
# Encode characters as integers: each distinct character is mapped to its
# position in the sorted `chars` list.
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [20]:
# Inverse lookup (integer id -> character) so predictions can be printed as text.
int_to_char = dict(enumerate(chars))

Summarize the data

In [21]:
# Record the corpus length and the vocabulary size for the cells that follow.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters in the text; corpus length: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters in the text; corpus length:  282109
Total Vocab:  56


Create input/output sequences for training

In [22]:
# Slice the corpus into fixed-length windows. Each window is one training
# input (X) and the character immediately after it is the target (Y).
seq_length = 60  # number of characters in each input window
step = 10        # stride between window starts; 1 would shift by a single character
window_starts = range(0, n_chars - seq_length, step)
sentences = [raw_text[start: start + seq_length] for start in window_starts]  # X values
next_chars = [raw_text[start + seq_length] for start in window_starts]        # Y values
n_patterns = len(sentences)
print('Number of sequences:', n_patterns)

Number of sequences: 28205


In [25]:
# One-hot encode the windows and their targets as boolean arrays:
#   x[sequence, position, char_id] marks the character at each window position
#   y[sequence, char_id]           marks the character that follows the window
n_sequences = len(sentences)
x = np.zeros((n_sequences, seq_length, n_vocab), dtype=bool)
y = np.zeros((n_sequences, n_vocab), dtype=bool)
for row, (sentence, target) in enumerate(zip(sentences, next_chars)):
    for pos, ch in enumerate(sentence):
        x[row, pos, char_to_int[ch]] = True
    y[row, char_to_int[target]] = True

print(x.shape)
print(y.shape)

# Peek at the first ten target rows.
print(y[0:10])

(28205, 60, 56)
(28205, 56)
[[False False False False False False False False False False False False
  False False False False False False  True False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False  True False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False  True False False False False False False False 

# Basic model with one LSTM

In [27]:
# Single-LSTM model: one 128-unit LSTM over the one-hot character window,
# followed by a softmax over the vocabulary for the next character.
model = Sequential()
# Declare the input shape with an explicit Input layer. Passing
# `input_shape=` to the first layer is deprecated in Keras 3 and emits a
# UserWarning when the layer is constructed.
model.add(Input(shape=(seq_length, n_vocab)))
model.add(LSTM(128))
model.add(Dense(n_vocab, activation='softmax'))

# Categorical cross-entropy matches the one-hot targets built above.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

  super().__init__(**kwargs)


# Define the checkpoint

In [29]:
from keras.callbacks import ModelCheckpoint

# File name template for checkpoints; the epoch number and training loss
# are filled into the placeholders.
filepath = "saved_weights/saved_weights-{epoch:02d}-{loss:.4f}.keras"
# Monitor the training loss and write a checkpoint only when it reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

callbacks_list = [checkpoint]

# Fit the model

In [30]:
# Train on the one-hot windows, checkpointing through `callbacks_list`;
# the returned History object is kept in `history`.
history = model.fit(
    x,
    y,
    batch_size=128,
    epochs=50,
    callbacks=callbacks_list,
)

# Save the model as it stands after the last epoch.
model.save('my_saved_weights_book_50epochs.keras')

Epoch 1/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - loss: 2.8678
Epoch 1: loss improved from inf to 2.58342, saving model to saved_weights/saved_weights-01-2.5834.keras
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 151ms/step - loss: 2.8666
Epoch 2/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - loss: 2.1825
Epoch 2: loss improved from 2.58342 to 2.13250, saving model to saved_weights/saved_weights-02-2.1325.keras
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 148ms/step - loss: 2.1823
Epoch 3/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - loss: 1.9929
Epoch 3: loss improved from 2.13250 to 1.95701, saving model to saved_weights/saved_weights-03-1.9570.keras
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 154ms/step - loss: 1.9927
Epoch 4/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - lo

