In [1]:
import os
import traceback

import numpy as np
import random as  rnd

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from termcolor import colored

# Seed every random number generator this notebook can draw from, not only
# Python's `random`: NumPy's global generator and TensorFlow's global seed
# (used for weight initialisation and for tf.data shuffling) are set too.
SEED = 32
rnd.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Loading the Data

In [2]:
dirname = '/content/'
filename = 'shakespeare.txt'

counter = 0  # NOTE(review): not referenced by any visible cell — confirm before removing

# Read the corpus, stripping leading and trailing whitespace from every line
# and skipping lines that are empty after stripping.
# The encoding is stated explicitly because the text is later split into
# characters as UTF-8; relying on the platform default could decode it differently.
with open(os.path.join(dirname, filename), encoding = "utf-8") as corpus_file:
    stripped_lines = (raw_line.strip() for raw_line in corpus_file)
    lines = [pure_line for pure_line in stripped_lines if pure_line]

n_lines = len(lines)
print(f"Number of lines: {n_lines}")

Number of lines: 6673


In [3]:
# Peek at a small slice of the cleaned corpus
print("\n".join(lines[506:514]))

We'll chide this Dauphin at his father's door.
Therefore let every man now task his thought,
That this fair action may on foot be brought.
Is it for fear to wet a widow's eye,
That thou consum'st thy self in single life?
Ah, if thou issueless shalt hap to die,
The world will wail thee like a makeless wife,
The world will be thy widow and still weep,


### Create the vocabulary

### Collect the unique characters

In [4]:
# Treat the whole corpus as one string, with a newline between consecutive lines
text = "\n".join(lines)

# Vocabulary layout: index 0 is the token for unknown characters, index 1 is
# the empty string used for padding, and the distinct characters of the corpus
# follow in sorted order.
vocab = ["[UNK]", ""] + sorted(set(text))

print(f"{len(vocab)} unique characters")
print(" ".join(vocab))

81 unique characters
[UNK]  
   ! " ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; < > ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z


### Convert a line to tensor

In [5]:
# Use tf.strings.unicode_split to break a string into its individual UTF-8 characters.
line = "Hello world!"
chars = tf.strings.unicode_split(line, input_encoding = "UTF-8")
print(chars)

tf.Tensor([b'H' b'e' b'l' b'l' b'o' b' ' b'w' b'o' b'r' b'l' b'd' b'!'], shape=(12,), dtype=string)


In [6]:
# Print the vocabulary index of a few sample characters, one per line
print(*map(vocab.index, "aeiou 23"), sep = "\n")

55
59
63
69
75
3
14
15


In [7]:
# tf.keras.layers.StringLookup maps each character to its integer ID in the vocabulary.
# The result is a tf.Tensor; here the layer is applied to the characters of
# "Hello world!" produced in the previous cell.

# mask_token names a token that should be treated as a special mask token, usually
# used to indicate padding; None is passed here, so no mask token is configured.
ids = tf.keras.layers.StringLookup(vocabulary = list(vocab), mask_token = None)(chars)
print(ids)

tf.Tensor([34 59 66 66 69  3 77 69 72 66 58  4], shape=(12,), dtype=int64)


### line_to_tensor

In [8]:
def line_to_tensors(line, vocab):
  """Convert a line of text into a tensor of vocabulary IDs.

  The line is split into UTF-8 characters and every character is looked up in
  `vocab` with a StringLookup layer that has no mask token.

  Args:
    line: The text to encode.
    vocab: Iterable of vocabulary tokens; it is copied into a list for the layer.

  Returns:
    The output of the StringLookup layer for the split characters (an integer
    ID per character).
  """
  char_to_id = tf.keras.layers.StringLookup(vocabulary = list(vocab), mask_token = None)
  return char_to_id(tf.strings.unicode_split(line, input_encoding = "UTF-8"))

### Function that produces text from a numeric tensor

In [9]:
def text_from_ids(ids, vocab):
  """Decode vocabulary IDs back into text.

  Args:
    ids: IDs to decode (a tensor or nested list of integers).
    vocab: Vocabulary used to build the inverse lookup.

  Returns:
    A string tensor in which the characters along the last axis are joined.
  """
  # An inverted StringLookup maps integer IDs back to their characters
  id_to_char = tf.keras.layers.StringLookup(
      vocabulary = vocab,
      mask_token = None,
      invert = True
  )
  decoded_chars = id_to_char(ids)

  # Concatenate the characters along the last axis into readable text
  return tf.strings.reduce_join(decoded_chars, axis = -1)

In [10]:
# Round-trip check: decode the IDs of "Hello world!" back into text
text_from_ids(ids, vocab).numpy()

b'Hello world!'

### Prepare data for training and testing

In [11]:
# Hold out the last N_EVAL_LINES lines for evaluation and train on everything
# before them. The count is named once so both slices always stay in step.
N_EVAL_LINES = 1000
train_lines = lines[:-N_EVAL_LINES]
eval_lines = lines[-N_EVAL_LINES:]

print(f"Number of training lines : {len(train_lines)}")
print(f"Number of validation lines : {len(eval_lines)}")

Number of training lines : 5673
Number of validation lines : 1000


### TensorFlow dataset

In [12]:
# Encode two sample lines, joined by a newline, as a single tensor of IDs
all_ids = line_to_tensors(
    "\n".join(["Hello world!", "Generative AI"]),
    vocab
)

all_ids

<tf.Tensor: shape=(26,), dtype=int64, numpy=
array([34, 59, 66, 66, 69,  3, 77, 69, 72, 66, 58,  4,  2, 33, 59, 68, 59,
       72, 55, 74, 63, 76, 59,  3, 27, 35])>

In [13]:
# Wrap the ID tensor in a dataset that yields one character ID per element,
# then decode the first ten elements to check them.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
print([text_from_ids([char_id], vocab).numpy() for char_id in ids_dataset.take(10)])

[b'H', b'e', b'l', b'l', b'o', b' ', b'w', b'o', b'r', b'l']


In [14]:
# Configure this dataset to produce chunks of the same size each time:
# seq_length + 1 IDs per chunk, with drop_remainder = True discarding a
# final chunk that would be shorter.
seq_length = 10
data_generator = ids_dataset.batch(seq_length + 1, drop_remainder = True)

In [15]:
# Show the first two chunks as raw IDs
for seq in data_generator.take(2):
  print(seq)

tf.Tensor([34 59 66 66 69  3 77 69 72 66 58], shape=(11,), dtype=int64)
tf.Tensor([ 4  2 33 59 68 59 72 55 74 63 76], shape=(11,), dtype=int64)


In [16]:
# Decode the first two chunks back into text, numbering them from 1
for i, seq in enumerate(data_generator.take(2), start = 1):
  print(f"{i}. {text_from_ids(seq, vocab).numpy()}")

1. b'Hello world'
2. b'!\nGenerativ'


### Create the input and the output for the model

In [17]:
def split_input_target(sequence):
  """Split a sequence into an (input, target) pair shifted by one position.

  From a sequence of length seq_length + 1 this yields two sequences of length
  seq_length: the input holds everything except the last element, the target
  holds everything except the first. For example ['H', 'e', 'l', 'l', 'o']
  becomes (['H', 'e', 'l', 'l'], ['e', 'l', 'l', 'o']).

  Args:
    sequence: Any object that supports slicing (list, string, tensor, ...).

  Returns:
    A tuple (input_text, target_text).
  """
  return sequence[:-1], sequence[1:]

In [18]:
# list("Tensorflow") -> ['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w']
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

### Create data_generator

In [24]:
def create_batch_dataset(lines, vocab, seq_length, batch_size = 64, buffer_size = 10000):
  """Build a shuffled, batched dataset of (input, target) ID sequences.

  Args:
    lines: List of text lines; they are joined with newlines into one string.
    vocab: Vocabulary passed to line_to_tensors to encode the characters.
    seq_length: Length of every input and target sequence. The text is cut
      into chunks of seq_length + 1 IDs, and each chunk is split into two
      sequences offset by one position.
    batch_size: Number of (input, target) pairs per batch.
    buffer_size: Size of the shuffle buffer. Defaults to 10000, the value that
      used to be hard-coded.

  Returns:
    A tf.data.Dataset of (input, target) batches. Incomplete chunks and
    incomplete batches are dropped, so a corpus that is too short for one full
    batch produces an empty dataset.
  """
  # For simplicity, join all lines into a single string
  single_line_data = "\n".join(lines)

  # Encode the string as a tensor of IDs using the given vocab
  all_ids = line_to_tensors(single_line_data, vocab)

  # One dataset element per character ID
  ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

  # Group consecutive IDs into chunks of seq_length + 1
  sequences = ids_dataset.batch(seq_length + 1, drop_remainder = True)

  return (
      sequences
      # Turn every chunk into an (input, target) pair
      .map(split_input_target)
      .shuffle(buffer_size)
      .batch(batch_size, drop_remainder = True)
      .prefetch(tf.data.AUTOTUNE)
  )

### Create the training dataset

In [25]:
# Number of (input, target) sequence pairs per training batch
BATCH_SIZE = 64
# Every training sequence is 100 characters long
dataset = create_batch_dataset(train_lines, vocab, seq_length = 100, batch_size = BATCH_SIZE)

### Defining the GRU Language Model

In [60]:
class GRULM(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense with log-softmax.

  Args:
    vocab_size: Number of distinct token IDs; sets the embedding input size and
      the width of the output layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size=256, embedding_dim=256, rnn_units=128):
      super().__init__()

      # Create an embedding layer to map token indices to embedding vectors
      self.embedding = tf.keras.layers.Embedding(
          vocab_size, embedding_dim
      )

      # Define a GRU
      self.gru = tf.keras.layers.GRU(
          units = rnn_units,
          # return_sequences=True: return the output for every time step rather
          # than only the last one, because a next-character prediction is
          # needed at each position.
          return_sequences = True,
          # return_state=True: also return the final hidden state, so a caller
          # can pass it back in through `states` on the next call.
          return_state = True
      )

      # Output layer: one log-probability per vocabulary entry
      self.dense = tf.keras.layers.Dense(
          units = vocab_size,
          activation = tf.nn.log_softmax
      )

  def call(self, inputs, states = None, return_state = False, training = False):
      """Run the model on a batch of ID sequences.

      Args:
        inputs: Batch of token-ID sequences.
        states: Initial GRU state, or None to start from the layer's default
          zero state.
        return_state: When True, also return the GRU state after the sequence.
        training: Forwarded to every layer.

      Returns:
        The log-softmax outputs, or a tuple (outputs, states) when
        `return_state` is True.
      """
      x = self.embedding(inputs, training = training)

      # `states` is forwarded as-is. When it is None the GRU layer builds its
      # own zero initial state, so get_initial_state() is not called here; its
      # expected argument differs between Keras versions (inputs tensor vs.
      # batch size).
      x, states = self.gru(x, initial_state = states, training = training)

      # Predict the next tokens and apply log-softmax activation
      x = self.dense(x, training = training)

      if return_state:
        return x, states
      return x

In [62]:
# Size of the embedding table and of the model's output layer.
# NOTE(review): the recorded output above reports 81 vocabulary entries, and the
# StringLookup IDs shown there match vocab.index(...), so 82 appears to leave
# one unused class — TODO confirm against StringLookup(...).vocabulary_size().
vocab_size = 82

# The embedding dimension
embedding_dim = 256

# Number of units in the GRU layer
rnn_units = 512

model = GRULM(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units
)

model.summary()

In [63]:
# Run the model on the first sequence of one training batch to check the
# shape of its output; a batch axis of size 1 is added in front of the sequence.
for input_example_batch, target_example_batch in dataset.take(1):
  print("Input : ", input_example_batch[0].numpy())
  example_batch_predictions = model(tf.expand_dims(input_example_batch[0], axis = 0))
  print("\n",example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

Input :  [66 58 73 74  3 57 69 68 76 59 72 74 22  2 41 72  3 59 66 73 59  3 69 60
  3 74 62 59 59  3 74 62 63 73  3 35  3 70 72 69 61 68 69 73 74 63 57 55
 74 59  9  2 46 62 79  3 59 68 58  3 63 73  3 74 72 75 74 62  6 73  3 55
 68 58  3 56 59 55 75 74 79  6 73  3 58 69 69 67  3 55 68 58  3 58 55 74
 59 11  2 13]

 (1, 100, 82) # (batch_size, sequence_length, vocab_size)


In [64]:
# Log-probabilities the (still untrained) model assigns at the last position (index 99)
example_batch_predictions[0][99].numpy()

array([-4.405271 , -4.3934526, -4.392767 , -4.418864 , -4.433049 ,
       -4.3956614, -4.410755 , -4.414542 , -4.4187446, -4.400231 ,
       -4.4112573, -4.416686 , -4.4081264, -4.407482 , -4.3992887,
       -4.4051733, -4.387538 , -4.416606 , -4.42149  , -4.3951254,
       -4.3954253, -4.410253 , -4.39858  , -4.4115577, -4.4029994,
       -4.411835 , -4.3993073, -4.4315295, -4.4048543, -4.402895 ,
       -4.4145036, -4.4188666, -4.416685 , -4.3992705, -4.403578 ,
       -4.4018993, -4.40185  , -4.41815  , -4.4098935, -4.404907 ,
       -4.3953876, -4.3965945, -4.4110174, -4.392286 , -4.404081 ,
       -4.4139047, -4.4209924, -4.4046006, -4.4133706, -4.4039445,
       -4.4066133, -4.4069643, -4.402458 , -4.4143076, -4.4103475,
       -4.401334 , -4.392287 , -4.413392 , -4.397278 , -4.4090652,
       -4.406566 , -4.403904 , -4.404248 , -4.4080486, -4.420021 ,
       -4.394411 , -4.410979 , -4.41105  , -4.406652 , -4.4070473,
       -4.3968554, -4.412951 , -4.4020777, -4.402106 , -4.3893

In [65]:
# ID with the highest log-probability at the last position
last_character = tf.math.argmax(
    example_batch_predictions[0][99]
)
print(last_character.numpy())

16


In [66]:
# ID with the highest log-probability at every position. Despite the name,
# this is a greedy argmax over the vocabulary axis, not random sampling.
sampled_indices = tf.math.argmax(
    example_batch_predictions[0], axis = 1
)
print(sampled_indices.numpy())

[33  9 49 26 37 39 33 15 15 74 40  5 60 79  3  3 37  3 66 66 66 66 15 39
 39 41 67 66 66 66 41 67 33 66 37 46 39 34  3 15 33 34 15 15 41 33 33  6
 19 66  0 62  2 50  2 37 66 34 13 70 30 76 39 19  3 69 69 63 64 37 37 37
 59 36 39 17 81 36 81 69 62 64 76 37 13 15 15 15 39 36 36 36 39 36 36 53
 81 74 13 16]


In [67]:
# Decode the input sequence and the per-position predictions back into text
print("Input : \n", text_from_ids(input_example_batch[0], vocab))
print()
print("Next char predictions : \n", text_from_ids(sampled_indices, vocab))

Input : 
 tf.Tensor(b"ldst convert:\nOr else of thee this I prognosticate,\nThy end is truth's and beauty's doom and date.\n1", shape=(), dtype=string)

Next char predictions : 
 tf.Tensor(b'G,W?KMG33tN"fy  K llll3MMOmlllOmGlKTMH 3GH33OGG\'7l[UNK]h\nX\nKlH1pDvM7 ooijKKKeJM5[UNK]J[UNK]ohjvK1333MJJJMJJ[[UNK]t14', shape=(), dtype=string)


### Training

### Train model

In [68]:
def compile_model(model):
  """Attach the training loss and optimizer to `model` and return it.

  Args:
    model: A Keras model; it is compiled in place.

  Returns:
    The same model object, now compiled with sparse categorical cross-entropy
    (from_logits = True) and an Adam optimizer with learning rate 0.00125.
  """
  model.compile(
      optimizer = tf.keras.optimizers.Adam(learning_rate = 0.00125),
      loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True)
  )

  return model

In [69]:
# Number of passes over the training dataset
EPOCHS = 10

# Compile the model
model = compile_model(model)
# Fit the model; `history` holds the object returned by fit()
history = model.fit(dataset, epochs = EPOCHS)

Epoch 1/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - loss: 3.5239
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - loss: 2.4496
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 2s/step - loss: 2.2230
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - loss: 2.0610
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1s/step - loss: 1.9286
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1s/step - loss: 1.8238
Epoch 7/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2s/step - loss: 1.7313
Epoch 8/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 2s/step - loss: 1.6714
Epoch 9/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 2s/step - loss: 1.6029
Epoch 10/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 2s/step - loss: 1.5579


### Evaluation

### log_perplexity

In [74]:
def log_perplexity(preds, target, padding_id = 1):
  """Compute the mean negative log-probability per non-padding target token.

  For every sequence, the log-probabilities assigned to its target tokens are
  averaged over the non-padding positions; the result is the negated mean of
  those per-sequence averages.

  Args:
    preds: Array-like of log-probabilities whose last axis indexes token IDs,
      e.g. shape (batch, seq_len, vocab_size).
    target: Integer array-like of target token IDs, shaped like `preds`
      without its last axis. IDs must be valid indices into the last axis.
    padding_id: ID that marks padding positions, which are excluded from the
      average. Defaults to 1, the value that used to be hard-coded.

  Returns:
    The log perplexity as a float. Sequences made up entirely of padding are
    left out of the mean instead of dividing by zero; if no sequence has a
    non-padding token the result is NaN.
  """
  log_probs = np.asarray(preds)
  target_ids = np.asarray(target)

  # Gather the log-probability of each target token directly; this selects
  # the same values as multiplying by a one-hot encoding and summing, without
  # building the dense one-hot tensor.
  token_log_p = np.take_along_axis(
      log_probs, target_ids[..., np.newaxis], axis = -1
  )[..., 0].astype(np.float64)

  # Identify non-padding elements in the target
  non_pad = target_ids != padding_id

  # Zero out padding positions. np.where is used rather than a multiplication
  # so that a -inf log-probability at a padding position cannot become NaN.
  seq_log_p = np.sum(np.where(non_pad, token_log_p, 0.0), axis = -1)
  token_counts = np.sum(non_pad, axis = -1)

  # Only sequences with at least one real token have a defined average
  has_tokens = token_counts > 0
  if not np.any(has_tokens):
    return float("nan")

  # Per-sequence average log-probability, then the negated mean over sequences
  return -1 * np.mean(seq_log_p[has_tokens] / token_counts[has_tokens])


In [None]:
# Evaluate on the held-out lines, treated as one long character sequence
eval_text = "\n".join(eval_lines)

# Passing the plain string (instead of a one-element list) yields a 1-D tensor
# of IDs directly, so no squeeze is needed afterwards
eval_ids = line_to_tensors(eval_text, vocab)
input_ids, target_ids = split_input_target(eval_ids)

# The model expects a batch axis, so a batch of size 1 is added here
pred, status = model(tf.expand_dims(input_ids, 0), training = False, states = None, return_state = True)

# Get the log perplexity
log_ppx = log_perplexity(pred, tf.expand_dims(target_ids, 0))
print(f"The log perplexity and perplexity of the model are {log_ppx} and {np.exp(log_ppx)} respectively")