# 16.1 Char-RNN을 사용해 셰익스피어 문체 생성하기

## 16.1.1 훈련 데이터셋 만들기

In [None]:
# 책에 없지만 정상 동작을 위해 코드 추가
import numpy as np
import tensorflow as tf
from tensorflow import keras

np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Fetch the Shakespeare corpus via Keras and read the whole file into one string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [None]:
# Character-level tokenizer: with char_level=True each character gets its own id.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [None]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [None]:
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count # total number of characters

In [None]:
# Encode the full text as a single id sequence and shift every id down by one
# (the decoding cells further down add the 1 back before calling the tokenizer).
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1

## 16.1.2 순차 데이터셋을 나누는 방법

In [None]:
# Keep the first 90% of the encoded characters as the training portion.
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

## 16.1.3 순차 데이터를 윈도 여러 개로 자르기

In [None]:
n_steps = 100
# One extra step per window: the target is the input shifted one character ahead.
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [None]:
# Each window is itself a dataset; batching it by window_length turns it into one tensor.
dataset = dataset.flat_map(lambda w: w.batch(window_length))

In [None]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
# Inputs drop the last step of every window, targets drop the first step.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [None]:
# One-hot encode the inputs only; the targets stay as integer ids.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

In [None]:
# Prefetch with a buffer size of 1.
dataset = dataset.prefetch(1)

## 16.1.4 Char-RNN 모델 만들고 훈련하기

In [None]:
# Char-RNN: two stacked GRU layers that both return full sequences, followed by
# a softmax over the max_id characters applied at every time step.
model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    ),
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [None]:
history = model.fit(dataset, epochs = 1) # epochs changed from 20 to 1 (about 15,500 seconds per epoch)



## 16.1.5 Char-RNN 모델 사용하기

In [None]:
def preprocess(texts):
    """Convert raw strings into one-hot character tensors for the Char-RNN.

    The fitted ``tokenizer`` maps every text to a sequence of character ids,
    the ids are shifted down by one, and the result is one-hot encoded with
    depth ``max_id``.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(char_ids, max_id)

In [None]:
X_new = preprocess(["How are yo"])
# `predict_classes` is deprecated (see the warning recorded for this cell);
# for a softmax output the replacement is the argmax over the class axis.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
# Shift the ids back up by one before decoding, then show the last character
# of the first (only) sentence.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'u'

In [None]:
def next_char(text, temperature=1):
    """Sample one character to follow ``text`` from the model's prediction.

    The probabilities predicted for the last time step are turned into log
    values, divided by ``temperature`` and used to draw a single sample. The
    sampled id is shifted up by one and decoded with the ``tokenizer``.
    """
    encoded_text = preprocess([text])
    # Keep the last time step as a length-1 slice so the result stays 2-D.
    last_step_proba = model.predict(encoded_text)[0, -1:, :]

    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(scaled_logits, num_samples=1)
    char_id = sampled + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [None]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` characters, sampling them one at a time.

    Every new character comes from ``next_char`` called on the text built so
    far with the given ``temperature``; the extended text is returned.
    """
    for _ in range(n_chars):
        text = text + next_char(text, temperature)
    return text

In [None]:
print(complete_text("t", temperature = 0.2))

the time in the words.

grumio:
what, sir, i shall 


In [None]:
print(complete_text("t", temperature = 1))

t i came by girl! why, sir; all he is the our end m


In [None]:
print(complete_text("t", temperature = 2))

t-two, ho,? soft this vert,
you swentmed sbobbitati


## 16.1.7 상태가 있는 RNN

In [None]:
# Windows shifted by n_steps (instead of 1), grouped into batches of a single window.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda w: w.batch(window_length))
    .batch(1)
)

In [None]:
# Code added so that the following cells run correctly.
# Split the training text into batch_size parts, window each part on its own,
# then zip the per-part datasets and stack one window from each into a batch.
batch_size = 32
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
    dataset = (
        tf.data.Dataset.from_tensor_slices(encoded_part)
        .window(window_length, shift=n_steps, drop_remainder=True)
        .flat_map(lambda w: w.batch(window_length))
    )
    datasets.append(dataset)
dataset = tf.data.Dataset.zip(tuple(datasets)).map(
    lambda *part_windows: tf.stack(part_windows)
)

In [None]:
# Inputs drop the last step of every window, targets drop the first step;
# only the inputs are one-hot encoded.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)
dataset = dataset.prefetch(1)

In [None]:
# Stateful variant of the Char-RNN: both GRU layers use stateful=True, and the
# first layer fixes the batch dimension through batch_input_shape.
model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
        recurrent_dropout=0.2,
        batch_input_shape=[batch_size, None, max_id],
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    ),
])

In [None]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Keras callback that resets the model's states at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the attached model's states.

        ``logs`` defaults to ``None`` to match the signature of
        ``keras.callbacks.Callback.on_epoch_begin``; neither argument is used.
        """
        self.model.reset_states()

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# Not in the book; added so that the code runs correctly.
steps_per_epoch = 5
# Modified to run on Colab: epochs reduced from 50 to 10.
model.fit(
    dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    callbacks=[ResetStatesCallback()],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8891355b38>

# 16.2 감성 분석

In [None]:
# load_data() returns (train inputs, train labels), (test inputs, test labels).
# The original unpacking bound `y_test` twice, which threw the training labels away.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
X_valid = X_test  # keeps the name the original cell bound to the second split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [None]:
# Build the id -> word mapping; the word index ids are offset by 3 because
# ids 0, 1 and 2 are assigned to the special tokens below.
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}

for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token

# Decode the first ten ids of the first training review.
" ".join(id_to_word[id_] for id_ in X_train[0][:10])

'<sos> this film was just brilliant casting location scenery story'

In [None]:
import tensorflow_datasets as tfds

# Load the raw-text IMDb reviews as (text, label) pairs together with dataset metadata.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteKYXL7S/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteKYXL7S/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteKYXL7S/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
def preprocess(X_batch, y_batch):
    """Clean and tokenize a batch of review strings; labels pass through unchanged.

    Steps, in order: keep the first 300 units of each string, replace ``<br>``
    style tags with a space, replace every character other than letters and
    apostrophes with a space, split on whitespace, and pad the ragged result
    to a dense tensor using ``b"<pad>"``.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>"), y_batch

In [None]:
from collections import Counter

# Count how often each preprocessed token (padding included) occurs in the
# training reviews, going through the data in batches of 32.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [None]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [None]:
# Keep only the vocab_size most frequent tokens, most frequent first.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
# Lookup table mapping each kept word to its rank (0 = most frequent);
# words outside the vocabulary fall into one of num_oov_buckets extra ids.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [None]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [None]:
def encode_words(X_batch, y_batch):
    """Replace every word in ``X_batch`` by its id from ``table``; labels pass through."""
    encoded_batch = table.lookup(X_batch)
    return encoded_batch, y_batch


# Training pipeline: batch, clean/tokenize, map words to ids, prefetch.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
# Sentiment classifier: embedding -> two GRU layers -> one sigmoid output unit.
# The embedding has one row per vocabulary word plus one per OOV bucket.
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(
        vocab_size + num_oov_buckets, embed_size, input_shape=[None]
    ),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Takes about 130 seconds each.
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## 16.2.1 마스킹 

In [None]:
# Functional-API version of the sentiment model with an explicit mask:
# positions whose id equals 0 are masked out in both GRU layers.
K = keras.backend
inputs = keras.layers.Input(shape=[None])
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)

z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)

outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.models.Model(inputs=[inputs], outputs=[outputs])

## 16.2.2 사전 훈련된 임베딩 재사용하기

In [None]:
import tensorflow_hub as hub

# Classifier on top of a TF Hub module that takes raw strings and outputs
# 50-dimensional vectors (per the input/output shapes declared below).
model = keras.Sequential([
    hub.KerasLayer(
        "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
        dtype=tf.string,
        input_shape=[],
        output_shape=[50],
    ),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Reload the raw-text reviews; this model takes strings, so batching and
# prefetching are the only pipeline steps.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples
batch_size = 32
train_set = datasets["train"].batch(batch_size).prefetch(1)
# Takes about 60 seconds each.
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# 16.3 신경망 기계 번역을 위한 인코더-디코더 네트워크

In [None]:
# Not in the book; added so that the following code runs correctly.
vocab_size = 100
embed_size = 10

In [None]:
import tensorflow_addons as tfa

# Three integer inputs: encoder token ids, decoder token ids, and one
# sequence length per example.
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# One embedding layer shared by the encoder and the decoder.
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# Encoder: its final hidden and cell states become the decoder's initial state.
encoder = keras.layers.LSTM(32, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

# Decoder: an LSTM cell driven by the sampler, projected to vocab_size outputs.
decoder_cell = keras.layers.LSTMCell(32)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell, sampler, output_layer=output_layer
)

final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
    sequence_length=sequence_lengths,
)

Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba],
)

## 16.3.1 양방향 RNN

In [None]:
keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences = True))

<tensorflow.python.keras.layers.wrappers.Bidirectional at 0x7f35f2484cf8>

## 16.3.2 빔 검색

In [None]:
# Beam-search decoder reusing the decoder cell and output layer defined above;
# the encoder state is tiled beam_width times to serve as its initial state.
beam_width = 10
decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
    cell=decoder_cell,
    beam_width=beam_width,
    output_layer=output_layer,
)

decoder_initial_state = tfa.seq2seq.beam_search_decoder.tile_batch(
    encoder_state, multiplier=beam_width
)

In [None]:
# NOTE(review): the recorded output of this cell is "ValueError: ignored", so this
# call fails as written. Presumably the first argument should be the embedding
# weights (or an embedding callable) rather than the already-embedded decoder
# inputs — confirm against the TensorFlow Addons BeamSearchDecoder documentation.
outputs, _, _ = decoder(
    decoder_embeddings, start_tokens = [0], end_token = 0, # changed embeddings_decoder -> decoder_embeddings so the code runs
    initial_state = decoder_initial_state
)

ValueError: ignored

# 16.4 어텐션 메커니즘

In [None]:
# Not in the book; placeholder values added so that the next cell runs.
units = 128
encoder_state = None
encoder_sequence_length = None

decoder_cell = keras.layers.LSTMCell(units)
attention_mechanism = 32 # NOTE(review): placeholder; reassigned to a LuongAttention instance in the next cell
n_units = None

In [None]:
# Luong attention over `encoder_state`, then the decoder cell wrapped so that
# it uses that attention mechanism.
attention_mechanism = tfa.seq2seq.attention_wrapper.LuongAttention(
    units,
    encoder_state,
    memory_sequence_length=encoder_sequence_length,
)

attention_decoder_cell = tfa.seq2seq.attention_wrapper.AttentionWrapper(
    decoder_cell,
    attention_mechanism,
    attention_layer_size=n_units,
)

## 16.4.2 트랜스포머 구조: 어텐션이 필요한 전부다

### 위치 인코딩

In [None]:
class PositionalEncoding(keras.layers.Layer):
    """Layer that adds fixed sinusoidal positional encodings to its input.

    The encoding table is computed once in ``__init__`` for ``max_steps``
    positions and ``max_dims`` dimensions: sines fill the even dimensions and
    cosines the odd ones. ``call`` adds the slice of the table that matches
    the last two dimensions of the input.
    """

    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        """Precompute the encoding table.

        Args:
            max_steps: number of positions to precompute.
            max_dims: number of encoding dimensions; an odd value is rounded
                up to the next even number.
            dtype: dtype of the layer and of the stored table.
            **kwargs: forwarded to ``keras.layers.Layer``.
        """
        super().__init__(dtype=dtype, **kwargs)

        if max_dims % 2 == 1:
            max_dims += 1  # max_dims must be even for the sin/cos interleaving

        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
        angles = p / 10000 ** (2 * i / max_dims)

        # Fixed: the original wrote `max_stpes` here, which raised NameError.
        pos_emb = np.empty((1, max_steps, max_dims))
        pos_emb[0, :, ::2] = np.sin(angles).T
        pos_emb[0, :, 1::2] = np.cos(angles).T
        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))

    def call(self, inputs):
        """Return ``inputs`` plus the matching slice of the encoding table."""
        shape = tf.shape(inputs)
        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]

In [None]:
embed_size = 512
max_steps = 500
vocab_size = 10000

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

# One embedding layer shared by the encoder and the decoder inputs.
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# The same positional-encoding layer is applied to both embedded sequences.
positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)
encoder_in = positional_encoding(encoder_embeddings)
decoder_in = positional_encoding(decoder_embeddings)

### 멀티-헤드 어텐션

In [None]:
# Encoder side: six attention layers, each attending from Z to itself.
Z = encoder_in
for N in range(6):
    Z = keras.layers.Attention(use_scale=True)([Z, Z])
encoder_outputs = Z

# Decoder side: six rounds of causal self-attention followed by attention
# over the encoder outputs.
Z = decoder_in
for N in range(6):
    Z = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
    Z = keras.layers.Attention(use_scale=True)([Z, encoder_outputs])

# Softmax over the vocabulary at every time step.
outputs = keras.layers.TimeDistributed(
    keras.layers.Dense(vocab_size, activation="softmax")
)(Z)