In [31]:
import tensorflow as tf
import numpy as np
import os
import time

In [32]:
# Fetch the Shakespeare corpus and keep the local path of the downloaded file.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [33]:
# Read the file in binary mode and decode the bytes as UTF-8.
# The context manager closes the file handle as soon as the read completes
# instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as f:
    text = f.read().decode(encoding='utf-8')

print(f'Len of text: {len(text)} chars')

Len of text: 1115394 chars


In [34]:
# Preview the first 250 characters of the corpus.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [35]:
# Sorted list of the distinct characters that occur in the corpus.
vocab = list(set(text))
vocab.sort()
print(f'The unique characters in the file : {len(vocab)}')

The unique characters in the file : 65


In [36]:
# StringLookup -> preprocessing layer that maps string features to integer indices.
# mask_token -> token that represents masked input. Per the original note it is only
# part of the vocabulary when the output mode is int, where it takes index 0.
# It is disabled here (mask_token=None).
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
)

# Inverse lookup (ids -> characters), built from the forward layer's vocabulary
# so both directions agree on the index assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    mask_token=None,
    invert=True,
)

In [37]:
# Two short sample strings, split into their individual characters.
example_texts = ['practice', 'rnn']
# NOTE (original author's observation, unverified here): writing the encoding
# name in lowercase ('utf-8') raised an error, hence the upper-case 'UTF-8'.
chars = tf.strings.unicode_split(
    example_texts,
    input_encoding='UTF-8',
)

In [38]:
# Display the ragged tensor of per-character byte strings.
chars

<tf.RaggedTensor [[b'p', b'r', b'a', b'c', b't', b'i', b'c', b'e'], [b'r', b'n', b'n']]>

In [39]:
# Map every character to its integer id with the lookup layer.
ids = ids_from_chars(chars)

In [40]:
# Display the resulting ragged tensor of character ids.
ids

<tf.RaggedTensor [[55, 57, 40, 42, 59, 48, 42, 44], [57, 53, 53]]>

In [41]:
# Join the characters of each row back into a single string.
tf.strings.reduce_join(chars, axis=1).numpy()

array([b'practice', b'rnn'], dtype=object)

In [42]:
def text_from_ids(ids):
    """Convert character ids back into text.

    The ids are mapped to characters with `chars_from_ids`, and the characters
    are then concatenated along the last axis.

    Args:
        ids: Integer ids accepted by the `chars_from_ids` lookup layer.

    Returns:
        A string tensor with the last axis of `ids` joined into one string.
    """
    decoded_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(decoded_chars, axis=-1)

In [43]:
# Prediction task
# Build the training examples and targets: start by converting the whole
# corpus into one vector of character ids.
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding='UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1], dtype=int64)>

In [44]:
# Convert the text vector into a stream of character indices
# (one id per dataset element).
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [45]:
# Peek at the first 10 elements of the stream, decoded back to characters.
# The loop variable is deliberately not called `ids`, so the `ids` tensor
# created in an earlier cell is not overwritten by this preview.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [46]:
# Batch the id stream into fixed-length chunks.
# Each chunk holds seq_length + 1 ids so that it can later be split into an
# input and a target of seq_length ids each; the incomplete tail is dropped.
seq_length = 100
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first 5 chunks, both as individual characters and as joined text.
for seq in sequences.take(5):
    print(chars_from_ids(seq))
    print(text_from_ids(seq).numpy())


tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[b'a' b'r' b'e' b' ' b'a' b'l' b'l' b' ' b'r' b'e' b's' b'o' b'l' b'v'
 b'e' b'd' b' ' b'r' b'a' b't' b'h' b'e' b'r' b' ' b't' b'o' b' ' b'd'
 b'i' b'e' b' ' b't' b'h' b'a' b'n' b' ' b't' b'o' b' ' b'f' b'a' b'm'
 b'i' b's' b'h' b'?' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'R' b'e' b's'
 b'o' b'l' b'v' b'e' b'd' b'.

In [47]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    Args:
        sequence: Any sliceable sequence (list, string, tensor, ...).

    Returns:
        A tuple `(input_text, target_text)`: the sequence without its last
        element, and the sequence without its first element.
    """
    return sequence[:-1], sequence[1:]

In [48]:
# Quick check on a plain Python list: the target is the input shifted by one.
split_input_target(list('Tensorflow'))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [49]:
# Apply the split to every chunk, producing (input, target) pairs.
dataset = sequences.map(split_input_target)

In [50]:
# Print the first (input, target) pair as text to confirm the one-step offset.
for input_example, target_example in dataset.take(1):
    print('Input : ', text_from_ids(input_example).numpy())
    print('Target : ', text_from_ids(target_example).numpy())

Input :  b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target :  b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [51]:
# Create the training batches.
# tf.data was used above to turn the text into manageable sequences;
# here the data is shuffled and batched before being fed to the model.

# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than from `dataset` itself, so that
# re-running this cell does not stack a second shuffle/batch on top of an
# already batched dataset. `tf.data.AUTOTUNE` replaces the deprecated
# `tf.data.experimental.AUTOTUNE` alias.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [52]:
# Build the model
# The model consists of three layers:
# 1. tf.keras.layers.Embedding : The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions;
# 2. tf.keras.layers.GRU : the recurrent (GRU) layer with rnn_units units
# 3. tf.keras.layers.Dense : The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihood of each character according to the model.

# Length of the vocabulary in StringLookup layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [53]:
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The embedding layer maps each character id to a vector, the GRU processes
    the sequence (returning both the per-step outputs and its final state), and
    the dense layer produces one logit per vocabulary entry at every step.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: Number of entries in the vocabulary; used as both the
                embedding input size and the number of output logits.
            embedding_dim: Size of each character embedding vector.
            rnn_units: Number of units in the GRU layer.
        """
        # `super()` already binds the instance, so `self` must not be passed
        # again: the original `super().__init__(self)` handed the model to the
        # base initializer as a stray positional argument, which TF 2.x ignores
        # but stricter Keras versions are expected to reject.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of character ids.

        Args:
            inputs: Batch of character-id sequences.
            states: Optional initial GRU state. When None, the GRU's default
                initial state for this input is used.
            return_state: When True, also return the final GRU state so the
                caller can feed it back in on the next call.
            training: Forwarded to every layer.

        Returns:
            The logits, or `(logits, states)` when `return_state` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x


In [54]:
# Instantiate the model with the hyperparameters defined above.
model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

![Diagram of the text generation model during training](./text_generation_training.png)

In [55]:
# Try the model
# Run the untrained model on a single batch to check the output shape.

for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')

(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [56]:
# Print the layer-by-layer parameter summary of the model.
model.summary()

Model: "my_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  16896     
                                                                 
 gru_1 (GRU)                 multiple                  3938304   
                                                                 
 dense_1 (Dense)             multiple                  67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Sample one character id per timestep from the logits of the first example
# in the batch, then drop the trailing sample axis and convert to NumPy.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [58]:
# Display the sampled character ids (one per timestep).
sampled_indices

array([53, 43, 39, 56, 63, 17, 56, 19, 17, 57, 48, 58, 35, 45, 34, 22,  8,
       55, 57, 42, 43, 30, 56,  6, 59, 50, 18, 51, 53, 30, 44, 38, 20, 42,
       31, 57, 30, 55, 55, 32, 44,  2, 20, 20, 45, 28, 41, 13,  6, 38,  3,
       65, 12, 54, 45,  8,  8, 52, 40, 26, 48, 46, 33, 11, 57, 44, 26,  0,
       45, 18, 27, 28, 45, 61, 23, 42, 21, 52, 25,  3, 31, 56, 27, 13, 26,
       15, 40, 33, 40, 48, 41, 44, 37, 65, 17, 46, 35, 43, 31,  6],
      dtype=int64)

In [59]:
# Decode
# Show the input text next to the text sampled from the untrained model.
print('Input:\n', text_from_ids(input_example_batch[0]).numpy())
print()
print('Next Char Predictions:\n', text_from_ids(sampled_indices).numpy())

Input:
 b'S:\nI know you are now, sir, a gentleman born.\n\nClown:\nAy, and have been so any time these four hours'

Next Char Predictions:
 b"ndZqxDqFDrisVfUI-prcdQq'tkElnQeYGcRrQppSe GGfOb?'Y!z;of--maMigT:reM[UNK]fENOfvJcHmL!RqN?MBaTaibeXzDgVdR'"


In [60]:
# Train
# Attach an optimizer and a loss function.

# Loss function: the model returns logits, so from_logits is set to True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [61]:
# Evaluate the loss of the untrained model on the example batch.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print('Prediction shape: ', example_batch_predictions.shape, ' # (batch_size, sequence_length, vocab_size)')
print('Mean loss:        ', example_batch_mean_loss)

Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.189608, shape=(), dtype=float32)


In [62]:
# A freshly initialized model should not be overly confident:
# its output logits should all have similar magnitudes.
# To verify this, check that the exponential of the mean loss
# is approximately equal to the vocabulary size.
# A much higher loss would mean the model is confident in wrong answers, i.e. it is badly initialized.
tf.exp(example_batch_mean_loss).numpy()

65.99692

In [63]:
# Configure training with the Adam optimizer and the loss defined above.
model.compile(optimizer='adam', loss=loss)

In [64]:
# Checkpoint configuration

# Directory where the checkpoints are saved
checkpoint_dir = './training_checkpoints'

# Checkpoint file name; "{epoch}" is a placeholder in the file name
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights to the path above
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [65]:
# Run training

# Number of training epochs
EPOCHS = 20

In [67]:
# Train for EPOCHS epochs with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Generate text

# The simplest way to generate text with this model is to run it in a loop,
# keeping track of the model's internal state from one step to the next.

![Diagram of the text generation sampling loop](./text_generation_sampling.png)

In [75]:
class OneStep(tf.keras.Model):
    """Single-step text sampler built around a trained character model.

    Each call to `generate_one_step` converts the input strings to ids, runs
    the wrapped model, samples the next character from the final timestep's
    logits and returns it together with the updated model state.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the collaborators and precompute the "[UNK]" logit mask.

        Args:
            model: Model called with `inputs`, `states` and `return_state=True`
                that returns `(logits, states)`.
            chars_from_ids: Lookup layer mapping ids to characters.
            ids_from_chars: Lookup layer mapping characters to ids.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build an additive mask that keeps "[UNK]" from being generated:
        # -inf at every index to skip; the remaining entries are left at the
        # dense conversion's default fill value.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        blocked_values = [-float('inf')] * len(unk_ids)
        self.prediction_mask = tf.sparse.to_dense(
            tf.SparseTensor(
                indices=unk_ids,
                values=blocked_values,
                dense_shape=[vocab_len]))

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in `inputs`.

        Args:
            inputs: Batch of strings to continue.
            states: Model state from the previous step, or None on the first step.

        Returns:
            A tuple `(predicted_chars, states)` with the sampled characters and
            the model state to pass into the next call.
        """
        # Strings -> characters -> dense tensor of token ids.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

        # Run the model; the logits have shape [batch, char, next_char_logits].
        logits, states = self.model(
            inputs=token_ids, states=states, return_state=True)

        # Use only the last timestep, scale by the temperature, then add the
        # mask so that "[UNK]" cannot be sampled.
        last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

        # Draw one token id per batch row and drop the sample axis.
        sampled_ids = tf.squeeze(
            tf.random.categorical(last_logits, num_samples=1), axis=-1)

        # Convert the ids back to characters; return them with the model state.
        return self.chars_from_ids(sampled_ids), states

In [76]:
# Wrap the trained model and both lookup layers in the single-step sampler.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [77]:
# Run the generation loop: start from a seed string and repeatedly feed the
# last sampled character and the model state back into the sampler.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Generate 1000 characters, one per step.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch row.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_' * 80)
print('\nRun time:', end - start)

ROMEO:
My tell-work, sirs, let's see these burchers of my sweet tongue
And fear on thee, for I should knock
Sound soul: my harm it is. There is no brother; so long
I--rugune of generus pity to my simplishment:
Or I my love, among your love pronount by
their beadseres. What doth he married the command
And set thy choice be cured sons,
But yet my mind that Richmond in their grave.

LADY GREY:
'Tis but your lord cheek to me; for I throw at a
piece of invented man it is so much swears,
Whomily they shall proceed.

ISABELLA:
So sleep these tedious,
As if thou slew her brothers and how cannot
Be for determinent?

CATESBY:
My lord?

PAULINA:
Answer it, most ownards?
I will tell her as the found music stabs,
Who was whether 'twas done canter in thee,
Half-yard, wither'd rust, and lovers' territories.
Now, soft! say'st thou forth ma-king colours!
With old correction caves a brave death of
Bealina.

GRUMIO:
I am a merran be,
Do lean-the skate, of what is my presence,
must so his unjust;
Had thy 

In [78]:
# Same generation loop, but with a batch of five identical seeds so that five
# independent samples are produced in parallel.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'] * 5)
result = [next_char]

for n in range(1000):
    next_char, states = one_step_model.generate_one_step(
        next_char, states=states)
    result.append(next_char)

# Concatenate the seed and the sampled characters row by row.
result = tf.strings.join(result)
end = time.time()

print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)


tf.Tensor(
[b"ROMEO:\nThere is not made.\n\nPETRUCHIO:\nVirtue it,--\n\nPAULINA:\nAnd, save your son\nWill this mis-dastly title's vengeance?\n\nBAPTISTA:\nFinest thou, or drum no more forthwith some Hastings, you\nshall prosper-heart me now; something throat,\nAnd vick for your cousin Ruture? what compla?\n\nBIOH:\nHad I the truth, I say.\n\nGLOUCESTER:\nI would say that is mine own to myself.\nI come to you and hereaford spinks we forege\nMy saken's old hate mummer'd. Did you laugh\nHis living woe; when he stabe't?\nOpen thy best mach! is this well? perchance, be?\n\nFirst Servingman:\nWhy, very well! the duke.\n\nLEONTES:\nOn yonder count!\n\nFirst Lord:\nWhy shall return for colscears, Queen.\n\nKING EDWARD IV:\nBut, being alone, something goodly benefit\nWhich hence to take this wrankling sun.\n\nPAULI:\nThou'rt took my daughter: I am rone, I must alone with him.\n\nHORTENSIO:\nMarcius, here, that I mean not on thy words appears:\nANd now, dy presume, beshipp at the queen's;\nO, b