In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import sys

In [3]:
raw_text = open(file='Book.txt', mode= 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

In [4]:
print(f"Total Characters In The Book Is: {len(raw_text)}")

Total Characters In The Book Is: 163918


In [5]:
chars_to_remove = {"\n", "!", "#", "$", "%", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", "ù", "•", "™", "\ufeff"}

def clean_text(text, chars_to_remove):
  return ''.join(c for c in text if c not in chars_to_remove)

raw_text = clean_text(raw_text, chars_to_remove)

print(f"Total Characters In The Book Is: {len(raw_text)}")

Total Characters In The Book Is: 154374


In [6]:
chars = sorted(list(set(raw_text)))
print(f"Total No. Of Unique Characters Are: {len(chars)}")

Total No. Of Unique Characters Are: 42


In [7]:
char_to_int = dict((c, i) for i, c in enumerate(chars))
print("Characters With Their Integer Values:", char_to_int)

Characters With Their Integer Values: {' ': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15, 'f': 16, 'g': 17, 'h': 18, 'i': 19, 'j': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34, 'y': 35, 'z': 36, '—': 37, '‘': 38, '’': 39, '“': 40, '”': 41}


In [8]:
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

Total Characters:  154374
Total Vocab:  42


In [9]:
seq_length = 100

dataX = []
dataY = []

In [10]:
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i : i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append([char_to_int[seq_out]])


print(f"Total Patterns: {len(dataX)}")

Total Patterns: 154274


In [11]:
X = np.reshape(dataX, [len(dataX), seq_length, 1])

# Normalisation
X = X / float(n_vocab)


y = to_categorical(dataY)

In [12]:
print(f"The X Shape is: {X.shape}")
print(f"The y Shape is: {y.shape}")

The X Shape is: (154274, 100, 1)
The y Shape is: (154274, 42)


In [13]:
model = Sequential()
model.add(Input(shape=(X.shape[1], X.shape[2])))
model.add(LSTM(units=256, return_sequences= True))
model.add(Dropout(0.2))
model.add(LSTM(units=256))
model.add(Dropout(0.2))
model.add(Dense(units=y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [14]:
filepath="Checkpoints/weights-improvement-{epoch:02d}-{loss:.4f}.keras"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [15]:
model.fit(X, y, epochs=20, batch_size=64, callbacks=callbacks_list)

Epoch 1/20
Epoch 1: loss improved from inf to 2.75248, saving model to Checkpoints/weights-improvement-01-2.7525.keras
Epoch 2/20
Epoch 2: loss improved from 2.75248 to 2.42028, saving model to Checkpoints/weights-improvement-02-2.4203.keras
Epoch 3/20
Epoch 3: loss improved from 2.42028 to 2.21483, saving model to Checkpoints/weights-improvement-03-2.2148.keras
Epoch 4/20
Epoch 4: loss improved from 2.21483 to 2.07757, saving model to Checkpoints/weights-improvement-04-2.0776.keras
Epoch 5/20
Epoch 5: loss improved from 2.07757 to 1.97574, saving model to Checkpoints/weights-improvement-05-1.9757.keras
Epoch 6/20
Epoch 6: loss improved from 1.97574 to 1.90366, saving model to Checkpoints/weights-improvement-06-1.9037.keras
Epoch 7/20
Epoch 7: loss improved from 1.90366 to 1.84313, saving model to Checkpoints/weights-improvement-07-1.8431.keras
Epoch 8/20
Epoch 8: loss improved from 1.84313 to 1.79085, saving model to Checkpoints/weights-improvement-08-1.7909.keras
Epoch 9/20
Epoch 9: 

<keras.src.callbacks.History at 0x780a96e0eb00>

In [16]:
!zip -r /content/file.zip /content/Checkpoints

  adding: content/Checkpoints/ (stored 0%)
  adding: content/Checkpoints/weights-improvement-17-1.5149.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-01-2.7525.keras (deflated 7%)
  adding: content/Checkpoints/weights-improvement-11-1.6678.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-18-1.4978.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-09-1.7462.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-08-1.7909.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-15-1.5615.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-19-1.4804.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-12-1.6360.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-03-2.2148.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-02-2.4203.keras (deflated 6%)
  adding: content/Checkpoints/weights-improvement-13-1.6074.keras (deflated 6

In [17]:
# load the network weights
filename = "Checkpoints/weights-improvement-20-1.4678.keras"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [18]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [21]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(200):
 x = np.reshape(pattern, (1, len(pattern), 1))
 x = x / float(n_vocab)
 prediction = model.predict(x, verbose=0)
 index = np.argmax(prediction)
 result = int_to_char[index]
 seq_in = [int_to_char[value] for value in pattern]
 sys.stdout.write(result)
 pattern.append(index)
 pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
"  wwwgutenbergorgdonatesection 5 general information about project gutenberg electronic worksprofesso "
r crmien in the project gutenberg license and the project gutenberg license and the project gutenberg license and the project gutenberg license and the project gutenberg license and the project gutenb
Done.
