### Create a character level LM

- The content might not make sense
- It's just a code template 


reference: 
https://www.tensorflow.org/text/tutorials/text_generation

In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
### Download dataset

# get_file returns the local path of the downloaded file; the read cell opens it.
SHAKESPEARE_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file('shakespeare.txt', SHAKESPEARE_URL)

In [3]:
### Read dataset

# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read is done instead of leaving it open.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# The length of the text is the number of characters in it.
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [4]:
### Preview the first 250 characters of the corpus

PREVIEW_CHARS = 250
print(text[:PREVIEW_CHARS])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
### Build the vocabulary: every distinct character of the corpus, sorted

unique_chars = set(text)
vocab = sorted(unique_chars)
print(f'{len(vocab)} unique characters')

65 unique characters


### Text Preprocessing

- vectorize text: convert string to numerical representation
- The tf.keras.layers.StringLookup layer can convert each character into a numeric ID. It just needs the text to be split into tokens first.

In [6]:
### Split each example string into its individual characters
### (the next cell converts these characters to numeric IDs)

example_texts = ['abcdefg', 'xyz']
chars = tf.strings.unicode_split(
    example_texts,
    input_encoding='UTF-8',
)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [7]:
# Lookup layer that maps each character to an integer ID.
# mask_token=None: no mask token is configured for this layer.
char_vocabulary = list(vocab)
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=char_vocabulary,
    mask_token=None,
)

### Check the layer on the example characters

ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>

### Convert numeric back to char

- invert:  invert this representation and recover human-readable strings
- Pass the output of the `get_vocabulary()` method of the first `tf.keras.layers.StringLookup` layer as the vocabulary of the inverted layer, so that the `[UNK]` token is set the same way in both layers


- tf.strings.reduce_join: join char back to string


In [8]:
# Inverse lookup layer (ID -> character). It reuses the vocabulary reported by
# ids_from_chars so both layers agree on the ID assigned to every token.
lookup_vocabulary = ids_from_chars.get_vocabulary()
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=lookup_vocabulary,
    invert=True,
    mask_token=None,
)

# Round-trip the example IDs back to characters.
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [9]:
### Join the characters of each row back into one string per row

joined = tf.strings.reduce_join(chars, axis=1)
joined.numpy()



array([b'abcdefg', b'xyz'], dtype=object)

In [10]:
def text_from_ids(ids):
    """Turn token IDs back into text.

    The IDs are mapped to characters with `chars_from_ids`, and the characters
    are then joined along the last axis.

    Args:
        ids: Token IDs accepted by `chars_from_ids`.

    Returns:
        A string tensor with the characters along the last axis joined.
    """
    id_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(id_chars, axis=-1)


### Create training data

- next character prediction (this is a character-level model)
- the target sequence is the input sequence shifted by one character

- Use batch method with tf.data.Dataset.from_tensor_slices to create sequences of desired size

In [11]:
# Encode the whole corpus as one flat tensor of character IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

### Convert the ID vector into a stream of single-character elements.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# The loop variable is a notebook global, so it is named `char_id` rather than
# `ids` to avoid overwriting the example `ids` tensor built in an earlier cell.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [24]:
seq_length = 100

## Group the ID stream into chunks of length seq_length + 1; a final chunk
## shorter than that is dropped (drop_remainder).
chunk_length = seq_length + 1
sequences = ids_dataset.batch(chunk_length, drop_remainder=True)

# Show the characters of the first chunk.
for seq in sequences.take(1):
    first_chunk_chars = chars_from_ids(seq)
    print(first_chunk_chars)

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [13]:
## Joining the tokens back into strings makes the chunking easier to see

for seq in sequences.take(5):
    joined_text = text_from_ids(seq)
    print(joined_text.numpy())

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


### Training pairs (start from here)
- For training you'll need a dataset of (input, label) pairs, where input and label are both sequences. At each time step the input is the current character and the label is the next character.

In [14]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    Args:
        sequence: Any sliceable sequence (list, string, tensor, ...).

    Returns:
        A tuple `(input_text, target_text)`: the sequence without its last
        element, and the sequence without its first element.
    """
    return sequence[:-1], sequence[1:]

# Demo on a plain Python list: the input drops the last character and the
# target drops the first one.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [25]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Print the first three pairs as text.
for input_example, target_example in dataset.take(3):
    input_text = text_from_ids(input_example).numpy()
    target_text = text_from_ids(target_example).numpy()
    print("Input :", input_text)
    print("Target:", target_text)
    
    

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
Input : b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
Target: b're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
Input : b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us k"
Target: b"ow Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"


### Create training batches

In [27]:
## Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` instead of re-wrapping `dataset`, so
# that re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    # tf.data.AUTOTUNE is the non-experimental name of the same constant.
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None, 100), dtype=tf.int64, name=None))>

### Build Model

##### keras.Model subclass reference: https://www.tensorflow.org/guide/keras/custom_layers_and_models 

- Embedding: The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions;
- GRU: A type of RNN with size units=rnn_units (You can also use an LSTM layer here.)
- Dense: The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihood of each character according to the model

In [31]:
# Size of the vocabulary held by the StringLookup layer
lookup_vocabulary = ids_from_chars.get_vocabulary()
vocab_size = len(lookup_vocabulary)

# Width of each character embedding vector
embedding_dim = 256

# Number of units in the recurrent layer
rnn_units = 1024

In [32]:
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of token IDs; used as the embedding input size and
            as the number of output logits per time step.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of units in the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # Do not pass `self` explicitly: zero-argument super() already binds
        # it, and the stray positional argument is rejected by Keras versions
        # whose base constructor accepts keyword arguments only.
        super().__init__()

        ### Define layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: emit an output for every time step.
        # return_state: also hand back the final state so that generation can
        # carry it over to the next call.
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)

        # One logit per vocabulary entry.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Compute next-character logits for a batch of ID sequences.

        Args:
            inputs: Token IDs fed to the embedding layer.
            states: Optional initial GRU state. When None, the GRU's own
                initial state is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The logits, or a `(logits, states)` tuple when `return_state` is
            True.
        """
        x = inputs
        x = self.embedding(x, training=training)

        if states is None:
            states = self.gru.get_initial_state(x)

        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [33]:
# Instantiate the model from the hyperparameters chosen in the previous cells.
model_config = dict(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)
model = MyModel(**model_config)

### Try the model

In [34]:
### Run one batch through the untrained model and check the output shape

for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    prediction_shape = example_batch_predictions.shape
    print(prediction_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [36]:
# Show each layer of the model together with its parameter count.
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


#### Note: It is important to sample from this distribution as taking the argmax of the distribution can easily get the model stuck in a loop.

In [39]:
# Sample one character ID per time step from the logits of the first example
# in the batch (sampling rather than argmax, as noted above).
first_example_logits = example_batch_predictions[0]
sampled_column = tf.random.categorical(first_example_logits, num_samples=1)

# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_column, axis=-1).numpy()

sampled_indices

array([53, 11, 46,  4, 45, 47,  4, 24, 26, 47, 48, 34,  1,  4, 28,  4, 54,
       11, 32, 33, 38, 59,  7, 41, 10, 38, 50, 29,  4, 11,  0, 54, 43,  0,
       27, 18, 52, 41,  5, 18, 45, 48, 65, 43, 27, 34, 49, 20, 58, 28, 19,
       55, 25, 54, 23, 36, 30, 50, 27, 53, 21, 29, 39, 58,  4, 18, 31, 25,
        2,  3, 61,  3,  9,  4, 45, 54, 55, 55, 20,  4, 47, 21, 31, 59, 50,
       25, 46, 39, 20,  6, 17, 54, 36, 63, 53,  8, 52, 49, 44, 10],
      dtype=int64)

In [46]:
# Decode the first input of the batch and the sampled IDs back to text.
input_text = text_from_ids(input_example_batch[0]).numpy()
predicted_text = text_from_ids(sampled_indices).numpy()

print("Input:\n", input_text)
print()
print("Next Char Prediction:\n", predicted_text)

Input:
 b'aying he would make his son\nHeir to the crown; meaning indeed his house,\nWhich, by the sign thereof '

Next Char Prediction:
 b"n:g$fh$KMhiU\n$O$o:STYt,b3YkP$:[UNK]od[UNK]NEmb&EfizdNUjGsOFpLoJWQkNnHPZs$ERL !v!.$foppG$hHRtkLgZG'DoWxn-mje3"


### Training

- A newly initialized model shouldn't be too sure of itself: the output logits should all have similar magnitudes. To confirm this, you can check that the exponential of the mean loss is approximately equal to the vocabulary size. A much higher loss means the model is sure of its wrong answers and is badly initialized:

In [47]:
## The model returns logits, so the loss needs the from_logits flag set.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

## Loss of the untrained model on the example batch
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)

prediction_shape = example_batch_predictions.shape
print("Prediction shape:", prediction_shape, "(batch_size,sequence_length,vocab_size)")
print("Mean loss:       ", example_batch_mean_loss)

Prediction shape: (64, 100, 66) (batch_size,sequence_length,vocab_size)
Mean loss:        tf.Tensor(4.189708, shape=(), dtype=float32)


In [48]:
# Exponential of the mean loss; for a freshly initialized model this should be
# approximately equal to the vocabulary size.
tf.exp(example_batch_mean_loss).numpy()

66.00353

In [49]:
# Configure training: Adam optimizer and the from_logits cross-entropy `loss`.
model.compile(optimizer = 'adam',loss = loss)

#### Configure checkpoints
- Use a tf.keras.callbacks.ModelCheckpoint to ensure that checkpoints are saved during training:

In [50]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'

# File name prefix; the callback fills in {epoch} for each saved checkpoint.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights, not the whole model.
checkpoint_options = dict(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [51]:
### Execute the training

EPOCHS = 20

# The checkpoint callback is passed to fit() so checkpoints are written during training.
training_callbacks = [checkpoint_callback]
history = model.fit(dataset, epochs=EPOCHS, callbacks=training_callbacks)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Generate text

- tf.random.categorical：https://blog.csdn.net/menghuanshen/article/details/105356239

In [55]:
class OneStep(tf.keras.Model):
    """Wraps a trained model to generate text one character at a time.

    Args:
        model: Model called as `model(inputs=..., states=..., return_state=True)`
            that returns `(logits, states)`.
        chars_from_ids: Layer mapping token IDs to characters.
        ids_from_chars: Layer mapping characters to token IDs.
        temperature: Divisor applied to the logits before sampling.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build an additive mask over the vocabulary that holds -inf at the
        # "[UNK]" ID, so that "[UNK]" can never be sampled.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        neg_inf_values = [-float('inf')] * len(unk_ids)
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=neg_inf_values,
            # Match the shape to the vocabulary.
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: Batch of strings to continue.
            states: State passed through to the wrapped model (None on the
                first call).

        Returns:
            A `(predicted_chars, states)` tuple: the sampled characters and the
            state returned by the wrapped model.
        """
        # Strings -> characters -> dense tensor of token IDs.
        split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        token_ids = self.ids_from_chars(split_chars).to_tensor()

        # all_logits.shape is [batch, char, next_char_logits]
        all_logits, states = self.model(inputs=token_ids, states=states,
                                        return_state=True)

        # Keep only the prediction for the last position, scale it by the
        # temperature, then add the mask so "[UNK]" cannot be generated.
        last_logits = all_logits[:, -1, :]
        scaled_logits = last_logits / self.temperature
        masked_logits = scaled_logits + self.prediction_mask

        # Sample token IDs from the logits.
        # logits: tensor of shape [batch_size, num_classes]; each slice [i, :]
        #   holds the unnormalized log-probabilities of all classes.
        # num_samples: scalar, number of independent samples drawn per row.
        sampled = tf.random.categorical(masked_logits, num_samples=1)
        predicted_ids = tf.squeeze(sampled, axis=-1)

        # Token IDs -> characters, returned together with the model state.
        return self.chars_from_ids(predicted_ids), states

In [56]:
# Wrap the trained model and both lookup layers for single-step generation
# (temperature is left at its default).
one_step_model = OneStep(model,chars_from_ids,ids_from_chars)

### Test result

In [61]:
start = time.time()

# Start from a prompt; no recurrent state exists before the first step.
next_char = tf.constant(['ROMEO:'])
states = None
result = [next_char]

for n in range(1000):
    # Feed the previous output and state back in to get the next character.
    step_output = one_step_model.generate_one_step(next_char, states=states)
    next_char, states = step_output
    result.append(next_char)

# Concatenate the prompt and all generated characters element-wise.
result = tf.strings.join(result)
end = time.time()

generated_text = result[0].numpy().decode('utf-8')
separator = '\n\n' + '_'*80
print(generated_text, separator)
print('\nRun time:', end - start)

ROMEO:
Why, vouchy is most government?
If I did fleezan, that the king
Shall be my queen, and let him all apart.

BIANCA:
Ay, marry, goes with me.

Justice:
By the hollivest man, should seem again to my soul.
You'll marry us to old right? What art thou
That will not think the heart that there die, what blows?

CLARENCE:
Thou hast provoked to hang them nothing but a Coriolanus
Had propice my honest more rich than any man divired his else
But doubt not that the King of Halp's--
May not appear, and gall people,
To save your loving trimingly ripes in gentle, betrause
Hurts you of roop and married; else, my father, in my birth,
And the most predrops of them, for there
it is now reply. But, sup out with the direst devier
Thou hast possession that wanter down, her eye
Than a little office with an our state
And liberty be thou disposes of fear.

TRANIO:
Shalt thou be made, hear me in my throat,
boys: beholder, his friends: if you whit end o'erthem
And hell encounter air usur sin.

AUTOLYCUS:
O

### Save the generator for later use

In [64]:
## Signature: tf.saved_model.save(obj, export_dir, signatures=None, options=None)

# Export the single-step generator, then load it back from the same directory.
export_dir = 'one_step'
tf.saved_model.save(one_step_model, export_dir)
one_step_reloaded = tf.saved_model.load(export_dir)





INFO:tensorflow:Assets written to: one_step\assets


INFO:tensorflow:Assets written to: one_step\assets


In [65]:
# Same generation loop as before, this time driven by the reloaded SavedModel.
next_char = tf.constant(['ROMEO:'])
states = None
result = [next_char]

for n in range(100):
    step_output = one_step_reloaded.generate_one_step(next_char, states=states)
    next_char, states = step_output
    result.append(next_char)

generated = tf.strings.join(result)
print(generated[0].numpy().decode("utf-8"))

ROMEO:
My lord perchio.

PETRUCHIO:
Well, go with me this is my son. Who should say the circumness
We shal
