In [1]:
import os
import traceback

import numpy as np
import random as  rnd

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from termcolor import colored

# Seed every random number generator the notebook can draw from so that runs
# are reproducible: Python's `random`, NumPy, and TensorFlow (whose global seed
# governs Dataset.shuffle when no op-level seed is given).
SEED = 32
rnd.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Loading the Data

In [2]:
dirname = '/content/'
filename = 'shakespeare.txt'
lines = [] # storing all the non-empty lines of the corpus.

# NOTE(review): `counter` is not used anywhere in this cell; kept in case a
# later cell relies on it.
counter = 0

# Read the corpus with an explicit encoding so the result does not depend on
# the platform's default locale (the rest of the notebook decodes as UTF-8).
with open(os.path.join(dirname, filename), encoding="utf-8") as text_file:
    for line in text_file:
        # remove leading and trailing whitespace (letter case is preserved)
        pure_line = line.strip()

        # keep the line only if something is left after stripping
        if pure_line:
            lines.append(pure_line)

n_lines = len(lines)
print(f"Number of lines: {n_lines}")

Number of lines: 6673


In [3]:
# Show a small window of the corpus, one line per row.
print(*lines[506:514], sep="\n")

We'll chide this Dauphin at his father's door.
Therefore let every man now task his thought,
That this fair action may on foot be brought.
Is it for fear to wet a widow's eye,
That thou consum'st thy self in single life?
Ah, if thou issueless shalt hap to die,
The world will wail thee like a makeless wife,
The world will be thy widow and still weep,


### Create the vocabulary

### Create the list of unique characters

In [8]:
text = "\n".join(lines)

# Special tokens come first: "[UNK]" for any unknown character, then the empty
# string used for padding. The sorted unique characters of the corpus follow.
vocab = ["[UNK]", ""] + sorted(set(text))

print(f"{len(vocab)} unique characters")
print(" ".join(vocab))

81 unique characters
[UNK]  
   ! " ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; < > ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z


### Convert a line to tensor

In [9]:
# Split a sample string into its individual UTF-8 characters with
# tf.strings.unicode_split.
line = "Hello world!"
chars = tf.strings.unicode_split(line, "UTF-8")
print(chars)

tf.Tensor([b'H' b'e' b'l' b'l' b'o' b' ' b'w' b'o' b'r' b'l' b'd' b'!'], shape=(12,), dtype=string)


In [11]:
# Print the vocabulary position of a few sample characters
# (the vowels, a space, and two digits).
for sample_char in ("a", "e", "i", "o", "u", " ", "2", "3"):
  print(vocab.index(sample_char))

55
59
63
69
75
3
14
15


In [12]:
# tf.keras.layers.StringLookup maps a whole tensor of characters to their
# vocabulary IDs in one call; the result is a tf.Tensor. Below it is applied to
# the characters of "Hello world!".
#
# mask_token names a token to treat as a special mask (typically padding);
# passing None means no mask token is reserved.
char_lookup = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
ids = char_lookup(chars)
print(ids)

tf.Tensor([34 59 66 66 69  3 77 69 72 66 58  4], shape=(12,), dtype=int64)


### line_to_tensor

In [14]:
def line_to_tensors(line, vocab):
  """Convert a line of text into a tensor of vocabulary IDs.

  The line is split into UTF-8 characters and each character is mapped to an
  integer ID through a StringLookup layer built from `vocab`.

  Args:
    line: The text to encode.
    vocab: Iterable of vocabulary tokens; it is copied into a list before
      being handed to the lookup layer.

  Returns:
    The tensor of integer IDs produced by the lookup layer, one per character.
  """
  split_chars = tf.strings.unicode_split(line, "UTF-8")
  return tf.keras.layers.StringLookup(mask_token=None, vocabulary=list(vocab))(split_chars)

### Function that produces text from a numeric tensor

In [16]:
def text_from_ids(ids, vocab):
  """Decode a tensor of vocabulary IDs back into text.

  Args:
    ids: Integer IDs to decode.
    vocab: Vocabulary used to build the inverse lookup layer.

  Returns:
    A string tensor obtained by joining the decoded characters along the
    last axis.
  """
  # invert=True turns the lookup around: integer IDs in, characters out.
  id_to_char = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None, invert=True)
  decoded_chars = id_to_char(ids)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

In [17]:
# Round-trip check: decode the IDs computed for "Hello world!" back into text
# and display the result as a Python bytes object.
text_from_ids(ids, vocab).numpy()

b'Hello world!'

### Prepare data for training and testing

In [19]:
# Hold out the last 1000 lines for evaluation; everything before them is used
# for training.
train_lines, eval_lines = lines[:-1000], lines[-1000:]

print(f"Number of training lines : {len(train_lines)}")
print(f"Number of validation lines : {len(eval_lines)}")

Number of training lines : 5673
Number of validation lines : 1000


### TensorFlow dataset

In [22]:
# Encode two short sample lines, joined by a newline, as a single ID tensor.
all_ids = line_to_tensors("\n".join(("Hello world!", "Generative AI")), vocab)

all_ids

<tf.Tensor: shape=(26,), dtype=int64, numpy=
array([34, 59, 66, 66, 69,  3, 77, 69, 72, 66, 58,  4,  2, 33, 59, 68, 59,
       72, 55, 74, 63, 76, 59,  3, 27, 35])>

In [26]:
# Wrap the ID tensor in a tf.data.Dataset that yields one character ID per
# element, then decode the first ten elements to check the content.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
first_chars = [text_from_ids([char_id], vocab).numpy() for char_id in ids_dataset.take(10)]
print(first_chars)

[b'H', b'e', b'l', b'l', b'o', b' ', b'w', b'o', b'r', b'l']


In [27]:
# Configure the dataset to emit chunks of identical size: seq_length + 1 IDs
# each. drop_remainder=True discards a trailing chunk that would be shorter.
seq_length = 10
data_generator = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [28]:
# Inspect the first two chunks produced by data_generator as raw ID tensors.
for seq in data_generator.take(2):
  print(seq)

tf.Tensor([34 59 66 66 69  3 77 69 72 66 58], shape=(11,), dtype=int64)
tf.Tensor([ 4  2 33 59 68 59 72 55 74 63 76], shape=(11,), dtype=int64)


In [29]:
# Decode the first two chunks back to text, numbering them from 1.
# enumerate replaces the hand-maintained counter; the printed output is the same.
for i, seq in enumerate(data_generator.take(2), start=1):
  print(f"{i}. {text_from_ids(seq, vocab).numpy()}")

1. b'Hello world'
2. b'!\nGenerativ'


### Create the input and the output for the model

In [30]:
def split_input_target(sequence):
  """Split a sequence into an (input, target) pair shifted by one position.

  Given a sequence of length seq_length + 1, the input is its first
  seq_length elements and the target is its last seq_length elements.
  For example, ['H', 'e', 'l', 'l', 'o'] becomes
  (['H', 'e', 'l', 'l'], ['e', 'l', 'l', 'o']).

  Args:
    sequence: Any object that supports slicing.

  Returns:
    A tuple (input_text, target_text): the sequence without its last element
    and the sequence without its first element.
  """
  return sequence[:-1], sequence[1:]

In [31]:
# Example: split the characters of "Tensorflow" into an input and a target
# list, each one element shorter than the original and offset by one.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

### Create data_generator

In [34]:
def create_batch_dataset(lines, vocab, seq_length, batch_size = 64):
  """Build a shuffled, batched dataset of (input, target) ID sequences.

  The lines are joined into one newline-separated string, encoded to
  vocabulary IDs with line_to_tensors, cut into chunks of seq_length + 1 IDs,
  and each chunk is split into an input and a target of seq_length IDs with
  split_input_target.

  Args:
    lines: Iterable of text lines to train or evaluate on.
    vocab: Vocabulary passed to line_to_tensors for the character lookup.
    seq_length: Number of IDs in each input sequence and each target sequence.
    batch_size: Number of (input, target) pairs per batch. Defaults to 64.

  Returns:
    A tf.data.Dataset yielding (input_ids, target_ids) batches. Because
    incomplete chunks and incomplete batches are dropped, both tensors have
    shape (batch_size, seq_length).
  """
  # Buffer size to shuffle the dataset
  BUFFER_SIZE = 10000

  # For simplicity, join all lines into a single line
  single_line_data = "\n".join(lines)

  # Convert data into tensor using the given vocab
  all_ids = line_to_tensors(single_line_data, vocab)

  # Create a Tensorflow dataset from the data tensor
  ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

  # Group the IDs into chunks of seq_length + 1, dropping a short final chunk
  data_generator = ids_dataset.batch(seq_length + 1, drop_remainder = True)

  # Map each chunk to an (input, target) pair
  dataset_xy = data_generator.map(split_input_target)

  # Dataset.batch takes `drop_remainder`; it has no `remainder` keyword, so
  # the previous spelling raised a TypeError.
  dataset = (
      dataset_xy
      .shuffle(BUFFER_SIZE)
      .batch(batch_size, drop_remainder = True)
      .prefetch(tf.data.AUTOTUNE)
  )

  return dataset