### 16.3.2 두 번째 프로젝트: 텐서플로로 글자 단위 언어 모델 구현

In [1]:
# Download the training corpus (plain-text book used throughout this section).
# NOTE: when 1268-0.txt already exists in the working directory, wget keeps it
# and stores the new copy under a suffixed name such as 1268-0.txt.1.
!wget https://raw.githubusercontent.com/rickiepark/python-machine-learning-book-3rd-edition/master/ch16/1268-0.txt

--2022-04-28 17:39:01--  https://raw.githubusercontent.com/rickiepark/python-machine-learning-book-3rd-edition/master/ch16/1268-0.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1171600 (1.1M) [text/plain]
Saving to: ‘1268-0.txt.1’


2022-04-28 17:39:01 (18.0 MB/s) - ‘1268-0.txt.1’ saved [1171600/1171600]



In [2]:
# Read the raw text file and trim it to the body of the book
import numpy as np

with open('1268-0.txt', 'r', encoding='UTF8') as fp:
    text = fp.read()

# str.index (unlike str.find) raises ValueError when a marker is missing, so a
# changed or truncated download fails loudly instead of being sliced with -1.
start_indx = text.index('THE MYSTERIOUS ISLAND')
end_indx = text.index('End of the Project Gutenberg')
text = text[start_indx:end_indx]

# The set of distinct characters is the vocabulary of the language model.
char_set = set(text)
print('전체 길이: ', len(text))
print('고유한 문자: ', len(char_set))

전체 길이:  1112350
고유한 문자:  80


In [3]:
# Build the lookup tables in both directions:
#   char2int   - dict, character -> integer id
#   char_array - NumPy array, integer id -> character
chars_sorted = sorted(char_set)
char_array = np.array(chars_sorted)
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))

# Encode the whole text as a 1-D int32 array of character ids.
encoded_ids = [char2int[ch] for ch in text]
text_encoded = np.array(encoded_ids, dtype=np.int32)
print('인코딩된 텍스트 크기: ', text_encoded.shape)

인코딩된 텍스트 크기:  (1112350,)


In [4]:
# Encoding direction: the first 15 characters next to their integer ids.
head_chars = text[:15]
head_codes = text_encoded[:15]
print(head_chars, '   --> 인코딩 --> ', head_codes)

# Decoding direction: the next 6 ids mapped back to characters.
next_codes = text_encoded[15:21]
print(next_codes, '   --> 디코딩 --> ', ''.join(char_array[next_codes]))

THE MYSTERIOUS     --> 인코딩 -->  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]    --> 디코딩 -->  ISLAND


In [5]:
# Create a TensorFlow dataset from the encoded text (one character id per element)
import tensorflow as tf
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Peek at the first five elements together with the characters they encode.
for ex in ds_text_encoded.take(5):
    char_id = ex.numpy()
    print('{} -> {}'.format(char_id, char_array[char_id]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


2022-04-28 17:39:05.864320: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# batch() - cut the character stream into chunks of 41 consecutive characters
# From each chunk: x = chunk[0:40] and y = chunk[1:41] (x shifted by one step)
seq_length = 40
# One extra character so that input and target can both have seq_length items.
chunk_size = seq_length + 1
# drop_remainder=True discards a final chunk shorter than chunk_size.
ds_chunks = ds_text_encoded.batch(chunk_size, drop_remainder=True)

# x & y 나누기 위한 함수 정의
def split_input_target(chunk):
    """Split one chunk into an ``(input, target)`` pair.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so ``target[i]`` is the element that
    follows ``input[i]`` in the original chunk.

    Args:
        chunk: Any sliceable sequence (here a 1-D tensor of character ids).

    Returns:
        Tuple ``(chunk[:-1], chunk[1:])``.
    """
    return chunk[:-1], chunk[1:]
# Apply the split to every chunk so each dataset element becomes an (x, y) pair.
ds_sequences = ds_chunks.map(split_input_target)

In [7]:
# Decode and display the first two (input, target) pairs as text.
for input_seq, target_seq in ds_sequences.take(2):
    input_text = ''.join(char_array[input_seq.numpy()])
    target_text = ''.join(char_array[target_seq.numpy()])
    print('입력 (x): ', repr(input_text))
    print('타깃 (y): ', repr(target_text))
    print()

입력 (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
타깃 (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

입력 (x):  ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
타깃 (y):  'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'



2022-04-28 17:39:05.954364: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


In [8]:
# Split into mini-batches: shuffle the sequences first, then group them.
BATCH_SIZE = 64
BUFFER_SIZE = 10000
ds = ds_sequences.shuffle(BUFFER_SIZE)
ds = ds.batch(BATCH_SIZE)

In [9]:
# Build the character-level RNN model
def build_model(vocab_size, embedding_dim, rnn_units):
    """Assemble an Embedding -> LSTM -> Dense sequential model.

    Args:
        vocab_size: Number of distinct characters; used both as the embedding
            input dimension and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The LSTM returns the full
        sequence, and the Dense layer has no activation, so the model emits
        one vector of ``vocab_size`` raw scores (logits) per time step.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim))
    model.add(layers.LSTM(rnn_units, return_sequences=True))
    model.add(layers.Dense(vocab_size))
    return model

In [10]:
# Hyperparameters
embedding_dim = 256
rnn_units = 512
charset_size = len(char_array)  # one output score per distinct character

# Seed TensorFlow's global RNG before the model is created.
tf.random.set_seed(1)
model = build_model(charset_size, embedding_dim, rnn_units)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         20480     
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1574912   
_________________________________________________________________
dense (Dense)                (None, None, 80)          41040     
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Compile and train
# from_logits=True: the loss applies softmax itself, so the model is expected
# to output raw scores rather than probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)
model.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8cfbb82c10>

In [12]:
# Evaluation phase: generating new text

# Example 1) tf.random.categorical draws class indices at random, with the
# probability of each class given by the softmax of its logit.
# Equal logits -> every class is equally likely.
tf.random.set_seed(1)
logits = [[1.0, 1.0, 1.0]]
probabilities = tf.math.softmax(logits).numpy()[0]
print('확률: ', probabilities)

samples = tf.random.categorical(logits=logits, num_samples=10)
tf.print(samples.numpy())

확률:  [0.33333334 0.33333334 0.33333334]
array([[0, 0, 1, 2, 0, 0, 0, 0, 1, 0]])


In [13]:
# Example 2) Same experiment with unequal logits: the class with the larger
# logit gets the larger softmax probability and is drawn more often.
tf.random.set_seed(1)
logits = [[1.0, 1.0, 3.0]]
probabilities = tf.math.softmax(logits).numpy()[0]
print('확률: ', probabilities)

samples = tf.random.categorical(logits=logits, num_samples=10)
tf.print(samples.numpy())

확률:  [0.10650698 0.10650698 0.78698605]
array([[2, 0, 2, 2, 2, 0, 1, 2, 2, 0]])


In [14]:
def sample(model, starting_str,
           len_generated_text=500,
           max_input_length=40,
           scale_factor=1.0):
    """Generate text by repeatedly sampling the next character from ``model``.

    At every step the current input window is passed through the model, one
    character id is drawn with ``tf.random.categorical`` from the (scaled)
    logits of the last time step, and that id is appended to the window.

    Args:
        model: Model that maps a ``(1, T)`` batch of character ids to
            ``(1, T, vocab)`` logits and provides ``reset_states()``.
        starting_str: Non-empty seed text. Every character must be a key of
            the module-level ``char2int`` mapping.
        len_generated_text: Number of characters to generate.
        max_input_length: Maximum number of trailing character ids kept in
            the input window that is fed back to the model.
        scale_factor: Multiplier applied to the logits before sampling.
            Values below 1 flatten the sampling distribution (more random
            text); values above 1 sharpen it (more predictable text).

    Returns:
        ``starting_str`` followed by ``len_generated_text`` sampled characters.

    Raises:
        ValueError: If ``starting_str`` is empty.
        KeyError: If ``starting_str`` contains a character not in ``char2int``.
    """
    if not starting_str:
        raise ValueError('starting_str must contain at least one character')

    encoded_input = [char2int[s] for s in starting_str]
    encoded_input = tf.reshape(encoded_input, (1, -1))

    # Collect the pieces and join once at the end instead of growing a string.
    generated_parts = [starting_str]

    model.reset_states()
    for _ in range(len_generated_text):
        logits = model(encoded_input)
        logits = tf.squeeze(logits, 0)  # drop the batch axis -> (T, vocab)

        scaled_logits = logits * scale_factor
        new_char_indx = tf.random.categorical(
            scaled_logits, num_samples=1)  # shape (T, 1)

        # Index the last time step explicitly. An unqualified tf.squeeze would
        # collapse a (1, 1) result to a scalar, which cannot be indexed, so a
        # one-character input window used to fail here.
        new_char_indx = new_char_indx[-1, 0].numpy()

        generated_parts.append(str(char_array[new_char_indx]))

        # Append the sampled id and keep only the most recent ids as input.
        new_char_indx = tf.expand_dims([new_char_indx], 0)
        encoded_input = tf.concat([encoded_input, new_char_indx], axis=1)
        encoded_input = encoded_input[:, -max_input_length:]

    return ''.join(generated_parts)

In [15]:
# Fix the seed, then generate text from the seed string 'The island'
# (sample() produces 500 characters by default).
tf.random.set_seed(1)
print(sample(model, starting_str='The island'))

The island must be dead.

When the worked over by felt as a great moment. They had no
contact in shember in the dark possible stood off in one which back possession directed by
showling volcanic
elover, the sailor’s fleside! Pencroft had not think that their interrupted very preoccumution. The highest vessel, destroy again, and
Pencroft, wardled withwemp that
an hour to utter they did not possibe
that they could not result the entrance, which did not washing of
his great granite, heaving
warm, and
this si
