<a href="https://colab.research.google.com/github/yiwenwangANU/Machine_Learning/blob/main/Text_Generation_with_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#loading data

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers

In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
with open(path_to_file, 'rb') as f:
  raw_data = f.read().decode('utf-8')

In [4]:
raw_data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

#Preprocess the data

In [5]:
vocab = set(raw_data)
print(vocab)

{"'", 'd', 'c', 'e', 'j', '-', 'w', 'n', '3', 'f', 'O', 'E', 'p', 'Q', 'm', 'v', ':', '\n', 'S', 'R', 't', 'Y', 'k', ',', 'x', 'i', 'B', 'K', '$', 'b', 'G', 's', '&', 'W', ';', '?', 'X', 'z', 'h', 'J', 'u', 'C', 'M', '!', 'U', 'D', ' ', 'V', 'F', 'l', 'I', 'N', 'H', 'g', 'r', 'q', 'y', 'A', 'P', 'a', 'Z', 'o', 'T', '.', 'L'}


In [6]:
all_chars = tf.strings.unicode_split(raw_data, input_encoding='UTF-8') # 'abc' -> tensor ['a', 'b', 'c']
all_chars

<tf.Tensor: shape=(1115394,), dtype=string, numpy=array([b'F', b'i', b'r', ..., b'g', b'.', b'\n'], dtype=object)>

In [7]:
encoder = layers.StringLookup(vocabulary=list(vocab)) # tensor ['a', 'b', 'c'] -> tensor [1, 2, 3]

In [8]:
decoder = layers.StringLookup(vocabulary=encoder.get_vocabulary(), invert=True)

In [9]:
print(encoder.get_vocabulary())

['[UNK]', "'", 'd', 'c', 'e', 'j', '-', 'w', 'n', '3', 'f', 'O', 'E', 'p', 'Q', 'm', 'v', ':', '\n', 'S', 'R', 't', 'Y', 'k', ',', 'x', 'i', 'B', 'K', '$', 'b', 'G', 's', '&', 'W', ';', '?', 'X', 'z', 'h', 'J', 'u', 'C', 'M', '!', 'U', 'D', ' ', 'V', 'F', 'l', 'I', 'N', 'H', 'g', 'r', 'q', 'y', 'A', 'P', 'a', 'Z', 'o', 'T', '.', 'L']


In [10]:
all_ids = encoder(all_chars)
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([49, 26, 55, ..., 54, 64, 18])>

In [11]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
ids_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [12]:
seq_length = 100
batched_dataset = ids_dataset.batch(seq_length+1, drop_remainder=True)
batched_dataset

<BatchDataset element_spec=TensorSpec(shape=(101,), dtype=tf.int64, name=None)>

In [13]:
def input_target_split(sequence):
  return sequence[:-1], sequence[1:]

input_target_split('12345')

('1234', '2345')

In [14]:
dataset = batched_dataset.map(input_target_split)
dataset

<MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int64, name=None), TensorSpec(shape=(100,), dtype=tf.int64, name=None))>

In [15]:
for inputs, target in dataset.take(1):
  print(f'Input: {tf.strings.reduce_join(decoder(inputs))}')
  print(f'Target: {tf.strings.reduce_join(decoder(target))}')

Input: b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [16]:
dataset = dataset.batch(64).prefetch(tf.data.AUTOTUNE)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None, 100), dtype=tf.int64, name=None))>

In [17]:
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None, 100), dtype=tf.int64, name=None))>

#Bulid the model

In [18]:
vocab_size = len(encoder.get_vocabulary())
embedding_dim = 256
rnn_units = 1024

In [19]:
class Model_0(tf.keras.Model):

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super(Model_0, self).__init__()
        self.embed = layers.Embedding(input_dim=vocab_size,
                                      output_dim=embedding_dim)
        self.gru = layers.GRU(units=rnn_units, 
                              return_sequences=True,
                              return_state=True)
        self.dense = layers.Dense(vocab_size)

    def call(self, inputs, return_state=False, state=None):
        x = self.embed(inputs)
        if(state==None):
          state=self.gru.get_initial_state(x)
        
        x, state = self.gru(x, initial_state=state)
        output = self.dense(x)

        if(return_state==True): 
          return output, state
        else:
          return output

In [20]:
model = Model_0(vocab_size, embedding_dim, rnn_units)

In [21]:
for input_sample, _ in dataset.take(1):
  print(tf.strings.reduce_join(decoder(input_sample[0])))
  sample = tf.squeeze(tf.random.categorical(model(input_sample)[0], 1))
  print(tf.strings.reduce_join(decoder(sample)))

tf.Tensor(b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou', shape=(), dtype=string)
tf.Tensor(b"AalXa-sA-o lG?k$EZVtge.o!MKQ-mJWflV[UNK]-oOadpM.pn;\n[UNK]aB,tMeVc-R,LTOMlwmhgI'c:VN!z,s\n[UNK]mNlLqTWGhu:E'FX-x[UNK]U", shape=(), dtype=string)


In [22]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam())

In [None]:
history=model.fit(dataset,
                  epochs=20)

In [92]:
def predict_one_step(model=model, inputs='ROMEO:', state=None):
  inputs = tf.expand_dims(  #(1, seq_len)
      encoder(  #(seq_len)
          tf.strings.unicode_split(inputs, input_encoding='UTF-8')), #(seq_len,)
                          axis=0)

  pred, state = model(inputs, return_state=True, state=state) #(1, seq_len, vocab)
  pred = tf.random.categorical( #(seq_len, 1)
      tf.squeeze(pred, axis=[0]), #(seq_len, vocab)
      1) 
  pred = tf.squeeze( #()
      decoder(  #(1)
              tf.squeeze(pred, axis=[-1])[-1:])) #(seq_len) (1)
  return pred, state

predict_one_step(inputs=':')

(<tf.Tensor: shape=(), dtype=string, numpy=b'\n'>,
 <tf.Tensor: shape=(1, 1024), dtype=float32, numpy=
 array([[ 0.10679512,  0.6995838 , -0.72328156, ..., -0.37781325,
          0.36539927,  0.9411402 ]], dtype=float32)>)

In [95]:
def make_prediction(steps=1000, ininial_inputs='ROMEO:'):
  state=None
  next_one = ininial_inputs
  output = [next_one]
  for i in range(steps):
    next_one, state = predict_one_step(model=model, inputs=next_one, state=state)
    output.append(next_one)
  return tf.strings.reduce_join(output)

preds = make_prediction()

In [97]:
print(preds.numpy().decode('utf-8'))

ROMEO:
Know, sir, 'this meanure's tail and old Citizen:
It was the duke's nurs? Your father hath heard
As he bears himself to make her tears, and such as I swear.

MENENIUS:
Romeo sly?

BAPTISTA:
Marry, sir: she you misthrift or Towards him;
For that I might it yourself; for, surely,
Trade immortafts, believe it,
And herer Vorting. O, he prayed before, father, ho!

KATHARINA:
For thy love dast: here, sir; heck,
But such a stumbling backful man.

PROSPERO:
Thus fly elesced sly.

GREMIO:
Farewell.

BIONDELLO:
What shame? and how my hearts! cheerly?

wRONH:
Peace, tirm! here let him sleeping withal.
Tell me, modest mine unholy watches.

PROSPERO:
She's even see her forth to pierce itself.
Help Sister and his love unto it.

KATHARINA:
The seay of man.

RUKE VINCENTIO:
Thy son is this Pidisch'd it cleak--
The vessen begin of misery;'s shame,--go,
Nor the best friend, it hath sent
forth of change purpose.

PROSPERO:
Thou art a villain. When, Camillo, Most nobles: he is myself: but you have s