Reference: https://www.tensorflow.org/tutorials/text/text_generation  
チュートリアル「RNN によるテキスト生成」をやってみる

In [1]:
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
# Fetch the Shakespeare corpus; `path_to_file` is the local path that is opened
# later to read the text.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the file as bytes, then decode to str (decoding keeps Python 2 compatibility).
# The context manager closes the file handle once the read is done, instead of
# leaving it open for the lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# The length of the text is the number of characters it contains
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
# Look at the first 250 characters of the text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
# Distinct characters that occur in the file, in sorted order
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

65 unique characters


In [6]:
# Lookup tables between characters and integer indices:
#   char2idx maps a character to its position in `vocab`,
#   idx2char is the inverse lookup as a NumPy array.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole text as an array of character indices
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [7]:
# Display the full character -> index mapping
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [8]:
# Display the index -> character lookup array
idx2char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [10]:
# Show the integer-encoded text and the number of encoded characters
print(text_as_int)
print(len(text_as_int))

[18 47 56 ... 45  8  0]
1115394


In [11]:
# Pretty-print the first 20 entries of the character -> index table
print('{')
for _, char in zip(range(20), char2idx):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...
}


In [12]:
# See how the first 13 characters of the text are converted to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'First Citizen' ---- characters mapped to int ---- > [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [13]:
# Maximum length, in characters, of a single input sequence
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the text
examples_per_epoch = len(text)//(seq_length+1)

# Build the training examples and targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first five elements, one character per element
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

F
i
r
s
t


In [15]:
# Display the number of whole chunks computed above
examples_per_epoch

11043

In [16]:
# Same ratio with true division: the fractional part is what `//` discarded
len(text)/(seq_length+1)

11043.50495049505

In [17]:
# Group the character stream into chunks of seq_length + 1 elements;
# drop_remainder=True discards a final chunk that would be shorter than that
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and print the first five chunks
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [18]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

In [19]:
# Decode and print the first (input, target) pair; the loop variables stay
# bound afterwards and are reused by a later cell
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [20]:
# Number of (input, target) pairs in the dataset
len(dataset)

11043

In [21]:
# Walk through the first five positions of the example pair: for each step,
# print the input index/character and the expected output index/character
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


In [22]:
# Batch size
BATCH_SIZE = 64

# Buffer size used to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it does not try to shuffle the entire sequence in memory.
# Instead, it keeps a buffer in which it shuffles elements.)
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset`, so re-running this cell on its own would
# shuffle and batch the already-batched dataset a second time.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Display the dataset description
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [23]:
# Size of the vocabulary, in characters
vocab_size = len(vocab)

# Dimension of the embedding vectors
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [25]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> GRU -> Dense character model.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            size and as the width of the final Dense layer.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape; the sequence
            length is left unspecified (None).

    Returns:
        A tf.keras.Sequential model. The Dense layer has no activation, so the
        model emits one vector of vocab_size raw scores per timestep.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # stateful=True with return_sequences=True: the GRU emits an output for
    # every timestep and is configured as a stateful layer.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [26]:
# Build the model used for training, with the training batch size
model = build_model(vocab_size = len(vocab),
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)

In [28]:
# Run the not-yet-trained model on one batch and print the prediction shape,
# the input batch, the target batch and the raw predictions.
# The loop variables and `example_batch_predictions` are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print(input_example_batch)
    print(target_example_batch)
    print(example_batch_predictions)

(64, 100, 65) # (batch_size, sequence_length, vocab_size)
tf.Tensor(
[[46 47 42 ... 47 52 49]
 [17 30 10 ... 44  1 51]
 [ 1 57 54 ... 43 56 43]
 ...
 [ 6  1 39 ... 59 56 52]
 [44 47 52 ...  6  1 39]
 [ 1 41 53 ... 43  1 40]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[47 42 47 ... 52 49 50]
 [30 10  0 ...  1 51 53]
 [57 54 47 ... 56 43  1]
 ...
 [ 1 39 52 ... 56 52  1]
 [47 52 42 ...  1 39 52]
 [41 53 59 ...  1 40 50]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[[-1.20467767e-02  1.01062125e-02  7.59723131e-03 ...  1.53224240e-03
   -1.91589585e-03 -2.71287886e-03]
  [-7.53079541e-03  1.42970718e-02 -1.08337244e-02 ...  1.82652809e-02
    1.26971826e-02  8.49750452e-03]
  [ 9.12034418e-04  3.45663191e-03 -3.78868589e-03 ...  2.44785435e-02
    6.76766643e-03 -1.66883250e-03]
  ...
  [-9.53225419e-04  9.80878808e-03 -1.46295261e-02 ...  2.55381353e-02
    1.22152362e-02  1.49762910e-02]
  [ 4.58698440e-03  2.22252938e-03  6.83276914e-03 ...  4.26884973e-03
    1.02569144e-02 -6.76708482

In [29]:
# Print the layer-by-layer summary of the training model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Sample one character index per timestep from the predictions of the first
# example in the batch
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
print(sampled_indices)

# Drop the trailing size-1 axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
print(sampled_indices)

tf.Tensor(
[[11]
 [ 7]
 [48]
 [32]
 [34]
 [51]
 [31]
 [23]
 [35]
 [ 8]
 [44]
 [40]
 [32]
 [ 5]
 [55]
 [58]
 [19]
 [14]
 [20]
 [48]
 [17]
 [36]
 [22]
 [64]
 [44]
 [ 0]
 [10]
 [14]
 [42]
 [32]
 [ 9]
 [20]
 [18]
 [ 9]
 [62]
 [15]
 [45]
 [44]
 [ 6]
 [44]
 [51]
 [26]
 [57]
 [ 3]
 [ 5]
 [61]
 [56]
 [39]
 [ 2]
 [32]
 [18]
 [25]
 [53]
 [61]
 [34]
 [63]
 [48]
 [42]
 [45]
 [50]
 [36]
 [ 1]
 [31]
 [54]
 [ 8]
 [17]
 [ 0]
 [ 8]
 [11]
 [46]
 [ 8]
 [38]
 [25]
 [ 5]
 [ 8]
 [21]
 [11]
 [49]
 [52]
 [55]
 [10]
 [ 5]
 [27]
 [26]
 [54]
 [29]
 [32]
 [21]
 [35]
 [ 2]
 [43]
 [47]
 [22]
 [32]
 [33]
 [24]
 [45]
 [49]
 [ 4]
 [42]], shape=(100, 1), dtype=int64)
[11  7 48 32 34 51 31 23 35  8 44 40 32  5 55 58 19 14 20 48 17 36 22 64
 44  0 10 14 42 32  9 20 18  9 62 15 45 44  6 44 51 26 57  3  5 61 56 39
  2 32 18 25 53 61 34 63 48 42 45 50 36  1 31 54  8 17  0  8 11 46  8 38
 25  5  8 21 11 49 52 55 10  5 27 26 54 29 32 21 35  2 43 47 22 32 33 24
 45 49  4 42]


In [32]:
# Decode the first input sequence and the indices sampled from the
# not-yet-trained model back into text
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'hiding, for she was as tender\nAs infancy and grace. But yet, Paulina,\nHermione was not so much wrink'

Next Char Predictions: 
 ";-jTVmSKW.fbT'qtGBHjEXJzf\n:BdT3HF3xCgf,fmNs$'wra!TFMowVyjdglX Sp.E\n.;h.ZM'.I;knq:'ONpQTIW!eiJTULgk&d"


In [33]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` tells Keras that `logits` are unnormalized scores
    rather than probabilities.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return cross_entropy

In [34]:
# Evaluate the loss on the example batch; the mean over all elements is
# printed as a single scalar
example_batch_loss  = loss(target_example_batch, example_batch_predictions)

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.1743436


In [35]:
# Configure training: Adam optimizer with the `loss` function defined above
model.compile(optimizer='adam', loss=loss)

In [36]:
# Directory where the checkpoints are saved
checkpoint_dir = './training_checkpoints'

# Name of the checkpoint files ({epoch} is a placeholder in the file name)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves the model weights only (not the full model)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [37]:
# Number of training epochs
EPOCHS=10

In [38]:
# Train the model with the checkpoint callback attached
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
# Path of the most recent checkpoint found in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [40]:
# Rebuild the same architecture with batch size 1 (rebinds `model`)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Load the weights from the most recent checkpoint
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build the model for inputs of shape (1, any sequence length)
model.build(tf.TensorShape([1, None]))

In [41]:
# Print the layer-by-layer summary of the batch-size-1 model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [42]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text with a trained character-level model (evaluation step).

    Args:
        model: Callable model that also exposes ``reset_states()``. It is fed
            a batch containing a single sequence, so it must accept batch
            size 1.
        start_string: Non-empty seed text. Every character must be a key of
            the module-level ``char2idx`` table.
        num_generate: Number of characters to generate. Defaults to 1000.
        temperature: Positive value the predictions are divided by before
            sampling. Lower values give more predictable text, higher values
            more surprising text; experiment to find a good setting.
            Defaults to 1.0.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    # An empty seed yields no prediction row to sample from, and a
    # non-positive temperature makes the scaling below meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {!r}".format(temperature))

    # Convert the start string to numbers (vectorize) and add a batch axis
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # List collecting the generated characters
    text_generated = []

    # Batch size == 1 here
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)

        # Remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample from a categorical distribution over the scaled predictions;
        # only the sample for the last timestep ([-1, 0]) is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Feed the predicted character back in as the next input; the model
        # is called again without resetting its states
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [43]:
# Generate text seeded with "ROMEO: " and print it
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: Shall, for long,
I am our lips and breather motion:
And sets his way can do I make a little pain
Bolingines and more man denee awhile, and wilt thou be?--
As sale the poison of that, think you stay.
O, I purgeful,
Most infer
Than being armour of, and more it from Lovold;
For in their ligies, had not methinks I'll give thee joy!
Pischangia cheerer: then where?

CORIOLANUS:
I forward, and not understanding their reasons.

ISABELLA:
I am the clouded to a child;
Yet go with year.

ROMEO:
The eventure in the fine did welcome him to see, my lord,
Did wherein Caliban both misthe house,
When fond considerided his dener a ser is troubled supprehens tisse?

QUEEN ELIZABETH:
Content you not where thou wast moved,
But kill his mother was I go?

PETER:
No, no, you progure man sit upon three.
My good lord, as you sured before a maid not, prouseconds togethen you eaten:
The wildness of thy writtren's beads,
To groan in his attempt, show weathat be too late.
Prithee, by a strong me not.

JULIET