In [1]:
import os
import traceback

import numpy as np
import random as  rnd

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input



In [2]:
#%pip install numpy==1.26.3

In [15]:
# Load the corpus: keep every non-blank line with its surrounding whitespace
# stripped (case is left untouched).
dirname = 'data/'
filename = 'shakespeare_data.txt'

# NOTE(review): `counter` is not read by any cell visible here; kept in case a
# later cell depends on it.
counter = 0

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding; the downstream tokenizer also splits the text as UTF-8.
with open(os.path.join(dirname, filename), encoding='utf-8') as corpus_file:
    lines = [stripped for stripped in (raw.strip() for raw in corpus_file) if stripped]

n_lines = len(lines)
print(f"Number of lines: {n_lines}")

Number of lines: 125097


In [16]:
# Rebuild the corpus as one newline-separated string.
text = "\n".join(lines)

# Vocabulary layout: index 0 is the marker for unknown characters, index 1 is
# the empty string used for padding, followed by every distinct corpus
# character in sorted order.
vocab = ["[UNK]", "", *sorted(set(text))]

print(f'{len(vocab)} unique characters')
print(" ".join(vocab))

82 unique characters
[UNK]  	 
   ! $ & ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z |


In [17]:
def line_to_tensor(line, vocab):
    """Encode a string as a tensor of vocabulary indices, one per character.

    Args:
        line: Text to encode; it is split into UTF-8 characters.
        vocab: Iterable of tokens used as the lookup vocabulary.

    Returns:
        The result of applying a `StringLookup` layer built from `vocab`
        to the split characters.
    """
    # A new lookup layer is built on every call.
    char_to_id = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
    return char_to_id(tf.strings.unicode_split(line, input_encoding='UTF-8'))

In [18]:
def text_from_ids(ids, vocab):
    """Decode vocabulary indices back into text.

    Args:
        ids: Tensor (or tensor-like) of vocabulary indices.
        vocab: Vocabulary passed to the inverted `StringLookup` layer.

    Returns:
        A string tensor in which the looked-up characters have been joined
        along the last axis.
    """
    id_to_char = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True, mask_token=None)
    decoded_chars = id_to_char(ids)
    return tf.strings.reduce_join(decoded_chars, axis=-1)

In [19]:
def split_input_target(sequence):
    """Pair a sequence with a copy of itself shifted one step ahead.

    Args:
        sequence: Any sliceable sequence.

    Returns:
        A tuple `(input, target)`: `input` is everything except the last
        element and `target` is everything except the first.
    """
    return sequence[:-1], sequence[1:]

In [20]:
def create_batch_dataset(lines, vocab, seq_length=100, batch_size=64, buffer_size=10000):
    """Build a shuffled, batched dataset of (input, target) id sequences.

    Args:
        lines: Iterable of strings; they are joined with newlines into one text.
        vocab: Vocabulary forwarded to `line_to_tensor`.
        seq_length: Length of each input and target sequence. The id stream is
            cut into chunks of `seq_length + 1` ids so that both halves
            produced by `split_input_target` have this length.
        batch_size: Number of (input, target) pairs per batch.
        buffer_size: Size of the shuffle buffer, counted in (input, target)
            pairs.

    Returns:
        A `tf.data.Dataset` of `(input_ids, target_ids)` batches. Incomplete
        trailing chunks and incomplete trailing batches are dropped.
    """
    single_line_data = "\n".join(lines)
    all_ids = line_to_tensor(single_line_data, vocab)
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

    # One extra id per chunk: the target is the input shifted by one position.
    data_generator = ids_dataset.batch(seq_length + 1, drop_remainder=True)
    dataset_xy = data_generator.map(split_input_target)

    dataset = (
        dataset_xy
        # Shuffle with the dedicated buffer size; previously the sequence
        # length was passed here and the buffer-size constant was unused.
        .shuffle(buffer_size)
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )

    return dataset

In [21]:
# Hold out the final 1000 lines for validation; everything before them is
# used for training.
train_lines, eval_lines = lines[:-1000], lines[-1000:]

print(f"Number of training lines: {len(train_lines)}")
print(f"Number of validation lines: {len(eval_lines)}")

Number of training lines: 124097
Number of validation lines: 1000


In [22]:
# Number of (input, target) sequence pairs per training batch.
BATCH_SIZE = 64

# Training pipeline built from the training lines with 250-id sequences.
dataset = create_batch_dataset(
    train_lines,
    vocab,
    seq_length=250,
    batch_size=BATCH_SIZE,
)

In [85]:
class GRULM(tf.keras.Model):
    """Language model: an embedding, a stack of GRU layers, and a dense
    projection producing one logit per vocabulary entry."""

    def __init__(self, vocab_size=256, embedding_dim=256, rnn_units=128, num_layers=2):
        """Create the layers.

        Args:
            vocab_size: Embedding input dimension and width of the output layer.
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of units in every GRU layer.
            num_layers: How many GRU layers to stack.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

        # Every GRU returns the full output sequence plus its final state.
        gru_stack = []
        for _ in range(num_layers):
            gru_stack.append(
                tf.keras.layers.GRU(units=rnn_units, return_sequences=True, return_state=True)
            )
        self.gru_layers = gru_stack

        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the forward pass.

        Args:
            inputs: Input passed to the embedding layer.
            states: Optional indexable collection holding one initial state
                per GRU layer; when None each GRU receives `initial_state=None`.
            return_state: If true, also return the list of final GRU states.
            training: Forwarded to every layer call.

        Returns:
            The logits, or `(logits, states)` when `return_state` is true,
            where `states` is a list with one final state per GRU layer.
        """
        features = self.embedding(inputs, training=training)

        final_states = []
        for layer_idx, gru_layer in enumerate(self.gru_layers):
            if states is None:
                initial = None
            else:
                initial = states[layer_idx]
            features, layer_state = gru_layer(features, initial_state=initial, training=training)
            final_states.append(layer_state)

        logits = self.dense(features, training=training)
        return (logits, final_states) if return_state else logits

In [98]:
def compile_model(model):
    """Compile `model`, build it with a dummy batch, and print its summary.

    The model is compiled with sparse categorical cross-entropy computed from
    logits and an Adam optimizer (learning rate 0.00125). It is then called
    once on a placeholder batch before `model.summary()` is printed.

    Relies on the module-level `BATCH_SIZE` for the placeholder batch size.

    Args:
        model: A Keras model that accepts a 2-D batch of integer token ids.

    Returns:
        The same `model` instance, compiled and called once.
    """
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    opt = tf.keras.optimizers.Adam(learning_rate=0.00125)
    model.compile(optimizer=opt, loss=loss)

    # Placeholder batch of integer token ids (all zeros) with sequence length
    # 100. Integer ids are what an embedding lookup takes; the previous
    # float-valued uniform noise in [0, 1) only worked through an implicit cast.
    sample_input = tf.zeros((BATCH_SIZE, 100), dtype=tf.int32)
    model(sample_input)
    model.summary()
    return model

In [99]:
# Single-layer GRU language model whose vocabulary size matches `vocab`.
model = GRULM(vocab_size=len(vocab), embedding_dim=256, rnn_units=512, num_layers=1)

In [None]:
EPOCHS = 10

# Compile the model, then train it on the batched dataset.
model = compile_model(model)
history = model.fit(dataset, epochs=EPOCHS)

# Fix TensorFlow's global random seed before generating text.
tf.random.set_seed(272)
# NOTE(review): GenerativeModel is not defined in any cell above this one;
# presumably it lives elsewhere in the notebook. Confirm this cell runs on a
# fresh kernel.
gen = GenerativeModel(model, vocab, temperature=0.5)

# Generate 32 characters from each seed prompt, separated by a ruler line.
for prompt in (" ", "Dear", "KING"):
    print(gen.generate_n_chars(32, prompt), '\n\n' + '_'*80)

Epoch 1/10
[1m317/317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 895ms/step - loss: 2.5139
Epoch 2/10
[1m317/317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 901ms/step - loss: 1.6665
Epoch 3/10
[1m317/317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m698s[0m 2s/step - loss: 1.5025
Epoch 4/10
[1m286/317[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m49s[0m 2s/step - loss: 1.4257