Reference: https://www.tensorflow.org/tutorials/text/text_generation  
チュートリアル：RNN によるテキスト生成、をやってみる

In [1]:
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
# Fetch the Shakespeare corpus; get_file hands back the local path that the
# following cells open and read.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the corpus as bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read completes instead of leaving
# it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# The length of the text is the number of characters it contains.
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
# Look at the first 250 characters of the text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
# The vocabulary is the sorted list of distinct characters in the file;
# print how many there are.
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

65 unique characters


In [6]:
# Lookup tables in both directions: character -> index (dict) and
# index -> character (NumPy array addressed by position).
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
idx2char = np.array(vocab)

# Encode the whole corpus as an array of vocabulary indices.
text_as_int = np.array([char2idx[ch] for ch in text])

In [7]:
# Display the full character -> index mapping.
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [8]:
# Display the index -> character array.
idx2char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [9]:
# Print the encoded corpus and the number of indices it contains.
print(text_as_int)
print(len(text_as_int))

[18 47 56 ... 45  8  0]
1115394


In [10]:
# Pretty-print the first 20 entries of the character -> index mapping.
# The keys are sliced instead of zipped with range(20) so that no throwaway
# `_` is assigned at cell top level, where it would overwrite IPython's `_`
# (last output) variable.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...
}


In [11]:
# See how the first 13 characters of the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'First Citizen' ---- characters mapped to int ---- > [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [12]:
# Maximum length, in characters, of a single input sequence
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the text
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets: start from a dataset that yields
# one character index at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first five elements
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

F
i
r
s
t


In [13]:
# Number of whole (seq_length + 1)-character chunks in the text.
examples_per_epoch

11043

In [14]:
# Same quotient with true division, for comparison with examples_per_epoch.
len(text)/(seq_length+1)

11043.50495049505

In [15]:
# Group the character stream into chunks of seq_length + 1 indices;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and print the first five chunks.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [16]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [17]:
# Decode the first (input, target) pair to show the one-character shift.
# input_example / target_example stay defined for the cells below.
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [18]:
# Number of (input, target) pairs in the dataset.
len(dataset)

11043

In [19]:
# For the first five positions, show the input index/character next to the
# index/character that is expected as output at that step.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


In [20]:
# Batch size
BATCH_SIZE = 64

# Buffer size used to shuffle the dataset.
# (tf.data is designed to work with possibly infinite sequences, so it does
# not try to shuffle the entire sequence in memory. Instead, it maintains a
# buffer in which it shuffles elements.)
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` instead of re-assigning `dataset` from
# itself: a self-referential assignment would batch an already batched
# dataset (and change the element shape) if this cell were run twice.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [21]:
# Size of the vocabulary, in characters
vocab_size = len(vocab)

# Dimension of the embedding vectors
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [22]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: first argument of the Embedding layer and number of
            units of the final Dense layer.
        embedding_dim: second argument of the Embedding layer.
        rnn_units: number of units of the GRU layer.
        batch_size: fixed batch dimension of the model input; the second
            input dimension is left as None.

    Returns:
        The uncompiled tf.keras.Sequential model.
    """
    # Layers are created in the same order in which they are stacked.
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [23]:
# Instantiate the model for training, with the training batch size.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [25]:
# Print the layers, output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Run the untrained model on one batch and print the output shape, the
# input batch, the target batch and the raw predictions.
# The loop variables stay defined and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print(input_example_batch)
    print(target_example_batch)
    print(example_batch_predictions)

(64, 100, 65) # (batch_size, sequence_length, vocab_size)
tf.Tensor(
[[51 39 56 ... 49 43  1]
 [58 46 43 ... 57 57  1]
 [ 1 51 43 ...  1 58 46]
 ...
 [ 6  1 57 ...  8  1 16]
 [50 50 39 ... 57 43  1]
 [57  1 45 ... 57  0 39]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[39 56 49 ... 43  1 53]
 [46 43 56 ... 57  1 41]
 [51 43  1 ... 58 46 47]
 ...
 [ 1 57 47 ...  1 16 53]
 [50 39 47 ... 43  1 58]
 [ 1 45 53 ...  0 39 54]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[[-2.02150531e-02 -1.85801890e-02 -4.77501424e-04 ...  1.01857102e-02
   -1.87498378e-03 -4.39749472e-03]
  [-1.29083339e-02 -1.72309540e-02  3.80640337e-03 ...  1.91398768e-03
   -9.58612561e-03  6.55081403e-03]
  [-4.65248665e-03 -3.65152932e-03 -5.56356506e-03 ... -5.34483092e-03
    1.01418728e-02  1.75927281e-02]
  ...
  [ 7.00331433e-03  8.74487124e-03  9.78259277e-03 ...  1.29026826e-02
    1.98353315e-03  2.42283139e-02]
  [ 8.31726938e-05  6.84837159e-03  9.66166146e-03 ...  1.85358804e-05
    1.23884333e-02  2.51917392

In [27]:
# Draw one sample per time step from the predictions of the first example
# in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
print(sampled_indices)

# Drop the trailing axis of size 1 and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
print(sampled_indices)

tf.Tensor(
[[40]
 [32]
 [31]
 [ 6]
 [19]
 [32]
 [19]
 [19]
 [ 7]
 [58]
 [ 1]
 [15]
 [60]
 [32]
 [54]
 [47]
 [31]
 [42]
 [31]
 [62]
 [31]
 [ 3]
 [ 3]
 [18]
 [34]
 [56]
 [31]
 [ 4]
 [ 3]
 [39]
 [18]
 [45]
 [29]
 [40]
 [57]
 [33]
 [45]
 [56]
 [34]
 [20]
 [36]
 [ 0]
 [29]
 [48]
 [61]
 [ 2]
 [43]
 [41]
 [46]
 [64]
 [60]
 [56]
 [36]
 [34]
 [12]
 [54]
 [10]
 [41]
 [18]
 [63]
 [26]
 [31]
 [50]
 [59]
 [51]
 [53]
 [13]
 [53]
 [63]
 [ 0]
 [34]
 [ 3]
 [18]
 [12]
 [ 1]
 [50]
 [25]
 [ 3]
 [58]
 [31]
 [53]
 [26]
 [23]
 [32]
 [ 6]
 [50]
 [21]
 [34]
 [40]
 [46]
 [43]
 [59]
 [19]
 [24]
 [54]
 [24]
 [10]
 [42]
 [15]
 [ 0]], shape=(100, 1), dtype=int64)
[40 32 31  6 19 32 19 19  7 58  1 15 60 32 54 47 31 42 31 62 31  3  3 18
 34 56 31  4  3 39 18 45 29 40 57 33 45 56 34 20 36  0 29 48 61  2 43 41
 46 64 60 56 36 34 12 54 10 41 18 63 26 31 50 59 51 53 13 53 63  0 34  3
 18 12  1 50 25  3 58 31 53 26 23 32  6 50 21 34 40 46 43 59 19 24 54 24
 10 42 15  0]


In [28]:
# Decode the first input of the batch and the characters sampled from the
# untrained model. The input tensor is converted with .numpy() before it is
# used as an index into idx2char, as the other decoding cells do, instead of
# relying on implicit tensor -> array conversion.
print("Input: \n", repr("".join(idx2char[input_example_batch[0].numpy()])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

Input: 
 'mark me.\n\nNurse:\nI will tell her, sir, that you do protest; which, as\nI take it, is a gentlemanlike '

Next Char Predictions: 
 'bTS,GTGG-t CvTpiSdSxS$$FVrS&$aFgQbsUgrVHX\nQjw!echzvrXV?p:cFyNSlumoAoy\nV$F? lM$tSoNKT,lIVbheuGLpL:dC\n'


In [29]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between `labels` and `logits`.

  `from_logits=True` is passed because the model's last layer is a Dense
  layer without an activation, so its outputs are raw scores.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

In [30]:
# Evaluate the loss of the untrained model on the example batch and print
# its mean together with the prediction shape.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.1735797


In [31]:
# Configure training with the Adam optimizer and this notebook's loss().
model.compile(optimizer='adam', loss=loss)

In [32]:
# Directory where the checkpoints are saved
checkpoint_dir = './training_checkpoints'

# Name of the checkpoint files; {epoch} is filled in with the epoch number
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights to checkpoint_prefix
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [33]:
# Number of training epochs
EPOCHS=10

In [34]:
# Train for EPOCHS epochs with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Show the most recent checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [36]:
# Rebuild the same architecture with a batch size of 1 and load the most
# recently saved weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with a batch dimension of 1 and an unspecified sequence length.
model.build(tf.TensorShape([1, None]))

In [37]:
# Print the summary of the rebuilt model (batch dimension is now 1).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [38]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text with a trained model, one sampled character at a time.

    Args:
        model: model that is called on an integer tensor with a leading
            batch axis of size 1 and returns per-position scores over the
            vocabulary. It must provide `reset_states()`.
        start_string: non-empty seed text. Every character must be a key of
            the module-level `char2idx` mapping.
        num_generate: number of characters to generate. Defaults to 1000,
            the value that used to be hard-coded.
        temperature: positive factor the predictions are divided by before
            sampling. Lower values give more predictable text, higher values
            more surprising text; experiment to find the best setting.
            Defaults to 1.0.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
        KeyError: if `start_string` contains a character that is not in
            `char2idx`.
    """
    # Evaluation step (text generation with the trained model)
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Convert the start string to numbers (vectorize) and add a batch axis
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # List collecting the generated characters; joined into a string at the end
    text_generated = []

    # Batch size == 1 here; clear any state left from a previous call
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)

        # Remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Use a categorical distribution over the scaled predictions to pick
        # the next character; only the sample for the last position is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model,
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [39]:
# Generate and print text seeded with "ROMEO: ".
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: I long at my arm'd.

POMPEY:
Prosper, we will but stand that false of life
And heaven for the cish.

Messen:
Disdomish friends are comfort out of
a such wrongs and woe:
Upon the strefts of blood had sworn this in't.
Alas
Sit take our knees are nums and e is fallent untickled
To enround more famal for the
peace?

MENENIUS:
Trusting to your good coop will we fear of liar;
Gracious villain! never being the heel so hell.

MISTRENS OF YORK:
My greatness shall be satts, they do this way
They haste the promise must show atthrights;
Who a weeping, best prove, well-a dead!
O might have lack'd for an your grace, by woild
When I do,s unpostshire, like to inscreater through,
the gods holy of his shepherd rest; I'll not be begun.

POMPEY:
Within it, both you;
For horsest dawers grow to me made fellow aside,
Mutty friend to be some wonderncurio.
By your sons, unchear-seal-sox, leave you, go well, that
my princely is my tenaper with him!

CLARENCE:
Then.

Second Signior Ludmal, hear me -

DUCH

### 上級編： 訓練のカスタマイズ

In [40]:
# Start again from a freshly built model with the training batch size.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)



In [41]:
# Adam optimizer with default settings, used by train_step.
optimizer = tf.keras.optimizers.Adam()

In [42]:
@tf.function
def train_step(inp, target):
  """Run one optimization step on a batch and return its mean loss.

  Uses the module-level `model` and `optimizer`: the forward pass and the
  loss are recorded on a gradient tape, then the gradients with respect to
  the model's trainable variables are applied by the optimizer.
  """
  with tf.GradientTape() as tape:
    logits = model(inp)
    per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
        target, logits, from_logits=True)
    batch_loss = tf.reduce_mean(per_position_loss)

  variables = model.trainable_variables
  gradients = tape.gradient(batch_loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [43]:
# Training loop
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # Reset the hidden state at the start of every epoch.
  # reset_states() returns None, so its result is not stored.
  model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    # Named batch_loss rather than loss so that the module-level loss()
    # function passed to model.compile() is not replaced by a tensor.
    batch_loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, batch_loss))

  # Save (a checkpoint of) the model every 5 epochs. The file is numbered
  # with the 1-based epoch, matching the printed epoch numbers and the
  # files written by the ModelCheckpoint callback.
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, batch_loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Always save the final weights, also when EPOCHS is not a multiple of 5.
model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 4.173607349395752
Epoch 1 Batch 100 Loss 2.3482446670532227
Epoch 1 Loss 2.1164
Time taken for 1 epoch 6.206033706665039 sec

Epoch 2 Batch 0 Loss 2.143540382385254
Epoch 2 Batch 100 Loss 1.9180588722229004
Epoch 2 Loss 1.7768
Time taken for 1 epoch 4.969434022903442 sec

Epoch 3 Batch 0 Loss 1.7892978191375732
Epoch 3 Batch 100 Loss 1.6662707328796387
Epoch 3 Loss 1.5943
Time taken for 1 epoch 5.010066747665405 sec

Epoch 4 Batch 0 Loss 1.5998905897140503
Epoch 4 Batch 100 Loss 1.5206918716430664
Epoch 4 Loss 1.5190
Time taken for 1 epoch 5.001086950302124 sec

Epoch 5 Batch 0 Loss 1.4981451034545898
Epoch 5 Batch 100 Loss 1.4756170511245728
Epoch 5 Loss 1.4098
Time taken for 1 epoch 5.054466962814331 sec

Epoch 6 Batch 0 Loss 1.4002363681793213
Epoch 6 Batch 100 Loss 1.3704131841659546
Epoch 6 Loss 1.3886
Time taken for 1 epoch 5.042135715484619 sec

Epoch 7 Batch 0 Loss 1.3657853603363037
Epoch 7 Batch 100 Loss 1.3537706136703491
Epoch 7 Loss 1.3299
Time taken f