In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

#### Get data and decode it from UTF-8

In [None]:
# Location of the plain-text Shakespeare corpus.
url = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
# get_file fetches `origin` under the name `fname`; its return value is passed to open() in the next cell.
dataset = tf.keras.utils.get_file(fname="shakespeare.txt", origin=url)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read the raw bytes and decode them explicitly as UTF-8.
# The context manager guarantees the file handle is closed once the read finishes.
with open(dataset, "rb") as corpus_file:
  text = corpus_file.read().decode("utf-8")

# Quick sanity check: total length and the first 300 characters.
print(f"Num of characters:{len(text)}")
print("-" * 50)
print(text[:300])

Num of characters:1115394
--------------------------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us


#### Create a dictionary mapping chars to indices

In [None]:
# Vocabulary: every distinct character of the lower-cased corpus, in sorted order.
vocab = sorted({char for char in text.lower()})
# Array form, so an integer id can be turned back into its character by indexing.
ids_to_char = np.array(vocab)
# Inverse table: character -> integer id (its position in `ids_to_char`).
char_to_ids = dict(zip(ids_to_char, range(len(ids_to_char))))

Convert each character in the text to its integer value in the vocabulary

In [None]:
# Encode the lower-cased corpus as an array of vocabulary ids, one id per character.
text_ids = np.array(list(map(char_to_ids.__getitem__, text.lower())))
# Show the first 200 ids as a sanity check.
print(text_ids[:200])

[18 21 30 31 32  1 15 21 32 21 38 17 26 10  0 14 17 18 27 30 17  1 35 17
  1 28 30 27 15 17 17 16  1 13 26 37  1 18 33 30 32 20 17 30  6  1 20 17
 13 30  1 25 17  1 31 28 17 13 23  8  0  0 13 24 24 10  0 31 28 17 13 23
  6  1 31 28 17 13 23  8  0  0 18 21 30 31 32  1 15 21 32 21 38 17 26 10
  0 37 27 33  1 13 30 17  1 13 24 24  1 30 17 31 27 24 34 17 16  1 30 13
 32 20 17 30  1 32 27  1 16 21 17  1 32 20 13 26  1 32 27  1 18 13 25 21
 31 20 12  0  0 13 24 24 10  0 30 17 31 27 24 34 17 16  8  1 30 17 31 27
 24 34 17 16  8  0  0 18 21 30 31 32  1 15 21 32 21 38 17 26 10  0 18 21
 30 31 32  6  1 37 27 33]


#### Split data into chunks - creating sequences of certain length

The sequence is the input. The label is the same sequence shifted one character ahead, so the target at each position is the next character.

In [None]:
def split_data(chunk):
  """Turn one chunk into an (input, label) pair for next-character prediction.

  The input is the chunk without its last element and the label is the chunk
  without its first element, so label[i] is the element that follows input[i].
  """
  return chunk[:-1], chunk[1:]

In [None]:
SEQUENCE_LEN = 64 # characters per sequence; each chunk holds one extra so input and label both have this length
BUFFER_SIZE = 10000

# reshuffle_each_iteration=False pins a single shuffle order for `samples`.
# With the default (True) every new pass over the dataset is reshuffled, so
# take()/skip() applied to it would draw from different orderings and the
# resulting training and test sets could share sequences.
samples = (
    tf.data.Dataset.from_tensor_slices(text_ids)
    .batch(SEQUENCE_LEN + 1, drop_remainder=True)  # fixed-size chunks; drop the short tail
    .map(split_data)                               # chunk -> (input, label)
    .shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)
)

Split into training and test sets

In [None]:
BATCH_SIZE = 64

test_size = 0.2  # fraction of sequences held out for evaluation
# Count the sequences by iterating once over the dataset.
num_samples = sum(1 for _ in samples)
num_training_samples = int(num_samples * (1 - test_size))

# First `num_training_samples` sequences train the model; the rest are held out.
train_ds = samples.take(num_training_samples)
test_ds = samples.skip(num_training_samples)

# Cache BEFORE shuffling: cache() stores the elements once, and the shuffle that
# follows it then produces a fresh order every epoch. Shuffling before cache()
# would freeze one order and replay identical batches each epoch.
train_ds = train_ds.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
# Evaluation metrics do not depend on element order, so the test set is not shuffled.
test_ds = test_ds.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Display both pipelines so their element specs (shapes and dtypes) can be checked.
train_ds, test_ds

(<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>,
 <_PrefetchDataset element_spec=(TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>)

#### Model Architecture

In [None]:
from keras import layers, models
from keras import losses, optimizers, metrics

In [None]:
EMBED_DIM = 64 # output_dim of the Embedding layer: length of the vector per character id
VOCAB_SIZE = len(vocab) # number of distinct characters; used as Embedding input_dim and Dense output width

In [None]:
# Next-character model: the output at position t is a distribution over the
# character at position t + 1.
#
# The recurrent layer must be causal (forward-only). A Bidirectional wrapper adds
# a backward pass that has already read positions t + 1 and later, i.e. the very
# characters used as labels. That leaks the targets during training and leaves
# the model without that signal when generating text, where no future exists.
# 64 units keeps the feature width the Dense layer receives at 64.
model = models.Sequential([
    layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM),
    layers.LSTM(64, return_sequences=True),
    layers.Dense(VOCAB_SIZE, activation='softmax')
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 64)          2496      
                                                                 
 bidirectional_4 (Bidirecti  (None, None, 64)          24832     
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, None, 39)          2535      
                                                                 
Total params: 29863 (116.65 KB)
Trainable params: 29863 (116.65 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# The Dense layer already applies softmax, so the loss keeps its default of
# expecting probabilities rather than logits.
# `metrics` is passed as a list, which is the documented form; newer Keras
# releases reject a bare metric object here.
model.compile(loss=losses.SparseCategoricalCrossentropy(),
              optimizer=optimizers.RMSprop(),
              metrics=[metrics.SparseCategoricalAccuracy()])
# Train for 10 epochs, reporting loss/accuracy on the held-out set after each one.
model.fit(train_ds,
          epochs=10,
          validation_data=test_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b1d80309a80>

#### Predictions

DRAFT - Work in progress

Couldn't get the model to output properly; maybe it's the shape of the predictions?

In [None]:
from keras.utils import pad_sequences

In [None]:
def generate_chars(seed_text, next_chars, model, sequence_len):
  """Extend `seed_text` one character at a time with greedy (argmax) decoding.

  Args:
    seed_text: Starting text. It is lower-cased only for the vocabulary lookup;
      the returned string keeps the seed's original casing.
    next_chars: Number of characters to append.
    model: Object with a `predict(ids, verbose=0)` method. The result is indexed
      as `[-1][-1]` (last batch entry, last time step) and must yield one score
      per vocabulary id.
    sequence_len: Maximum number of trailing characters fed to the model at each
      step. A falsy value disables the window and feeds the whole text.

  Returns:
    `seed_text` followed by the generated characters.

  Raises:
    ValueError: If characters are requested from an empty seed.
    KeyError: If the context contains a character missing from `char_to_ids`.
  """
  if next_chars > 0 and not seed_text:
    raise ValueError("seed_text must contain at least one character")

  # Build the id -> character table once instead of scanning the dict every step.
  id_to_char = {idx: char for char, idx in char_to_ids.items()}

  for _ in range(next_chars):
    # Only the most recent `sequence_len` characters are used as context, which
    # keeps each step's cost bounded as the text grows.
    context = seed_text[-sequence_len:] if sequence_len else seed_text
    text_to_index = np.array([[char_to_ids[char] for char in context.lower()]])

    # No padding: the final time step must correspond to the last real character.
    scores = model.predict(text_to_index, verbose=0)
    prediction = int(np.argmax(scores[-1][-1], axis=-1))

    # An id without a character appends nothing.
    seed_text += id_to_char.get(prediction, "")
  return seed_text

In [None]:
# Ask the model for 20 more characters after the prompt.
generate_chars("The course of true love never did run s", 20, model, SEQUENCE_LEN)

'The course of true love never did run sou the the the the t'