In [1]:
import tensorflow as tf
import numpy as np
import os
import time

### Read Data

In [2]:
# Fetch the Shakespeare text used as the training corpus.
# The original note describes it as the script of Coriolanus, one of
# Shakespeare's late works.
shakespeare_url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file(fname='shakespeare.txt', origin=shakespeare_url)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the file as raw bytes and decode it as UTF-8.
# The context manager closes the file handle as soon as the read is done,
# instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# Report how many characters the corpus contains.
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
# Preview the first 100 characters of the corpus.
preview_length = 100
print(text[:preview_length])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Collect the distinct characters of the corpus, in sorted order.
unique_chars = set(text)
vocab = sorted(unique_chars)
# The message reads "<n> distinct characters".
print('{} 個不重複的文字'.format(len(vocab)))

65 個不重複的文字


### Data Preprocessing

In [9]:
# Character -> integer id; ids follow the position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
# Integer id -> character, kept as an array so it can be indexed with id arrays.
idx2char = np.array(vocab)

# Encode the whole corpus as an array of integer ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [10]:
# Print the first 20 entries of the character -> id table.
print('{')
for char in list(char2idx)[:20]:
    entry = ' {:4s}: {:3d},'.format(repr(char),char2idx[char])
    print(entry)
print('   ...\n')

{
 '\n':   0,
 ' ' :   1,
 '!' :   2,
 '$' :   3,
 '&' :   4,
 "'" :   5,
 ',' :   6,
 '-' :   7,
 '.' :   8,
 '3' :   9,
 ':' :  10,
 ';' :  11,
 '?' :  12,
 'A' :  13,
 'B' :  14,
 'C' :  15,
 'D' :  16,
 'E' :  17,
 'F' :  18,
 'G' :  19,
   ...



In [11]:
# Show the first 10 characters next to the integer ids they were encoded to.
# The labels read "text:" and "ids:".
head_length = 10
print('文字：',text[:head_length],'代號：',text_as_int[:head_length])

文字： First Citi 代號： [18 47 56 57 58  1 15 47 58 47]


- 使用 from_tensor_slices 函數，自陣列建立 Dataset

In [16]:
# Maximum length, in characters, of a single input sequence.
seq_length = 100
# Every chunk holds seq_length + 1 characters: the input plus one extra
# character so that the target can be the input shifted by one position.
chunk_length = seq_length + 1
example_per_epoch = len(text) // chunk_length

# Build the training inputs and prediction targets.
# from_tensor_slices accepts a numpy array and yields one element per entry.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Group the characters into fixed-size chunks, dropping the incomplete tail.
sequences = char_dataset.batch(chunk_length, drop_remainder=True)

# Decode and print the first five chunks as a sanity check.
for item in sequences.take(5):
    decoded_chunk = ''.join(idx2char[item.numpy()])
    print(repr(decoded_chunk))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [17]:
def split_input_target(chuck):
    """Split one chunk into an (input, target) pair shifted by one position.

    Args:
        chuck: A sliceable sequence of length n. The parameter name is kept
            as-is for compatibility (presumably a typo of "chunk").

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is every
        element except the last and ``target_text`` is every element except
        the first, so ``target_text[i]`` is the element that follows
        ``input_text[i]``.
    """
    return chuck[:-1], chuck[1:]

# Turn every chunk in `sequences` into its (input, target) pair.
dataset = sequences.map(split_input_target)

In [19]:
# Each input character is used to predict the character that follows it,
# so the target is the input shifted by one position.
for input_example, target_example in dataset.take(1):
    input_chars = ''.join(idx2char[input_example.numpy()])
    target_chars = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_chars))
    print('Target data:', repr(target_chars))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [20]:
# Walk through the first five time steps of input_example / target_example
# (set by the earlier `dataset.take(1)` loop) and print, for each step, the
# input id and character next to the id and character the model should output.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))
# Taking "First" as the example: 'F' is used to predict 'i', and 'i' is used to predict 'r'.

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


## Model

In [24]:
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000
# Build the batched dataset from `sequences` rather than re-wrapping
# `dataset` itself: the cell is then idempotent, and running it a second
# time does not batch data that has already been batched.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [30]:
# Vocabulary size: the number of distinct characters (65 for this corpus).
vocab_size = len(vocab)
# Dimensionality of the character embedding.
embedding_dim = 256
# Number of units in the RNN layer.
rnn_units = 1024

In [31]:
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct characters; used both as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of the embedding vector for each character.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size. The GRU is stateful, so the batch
            dimension is pinned through ``batch_input_shape``; the sequence
            length is left as ``None``.

    Returns:
        A ``tf.keras.Sequential`` model whose Dense output has ``vocab_size``
        values per time step (the GRU returns the full sequence).
    """
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent_layer = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

In [32]:
# Instantiate the training model with the batch dimension fixed to BATCH_SIZE.
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [33]:
# Run a single batch through the model and print the output shape.
# The printed annotation reads "(batch size, sequence length, vocabulary size)".
# input_example_batch, target_example_batch and example_batch_predictions stay
# bound after the loop and are reused by later cells.
for input_example_batch,target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (批次大小、序列長度、字詞的數量)")

(64, 100, 65) # (批次大小、序列長度、字詞的數量)


In [35]:
# Sample one character id per time step from the first sequence of the batch,
# treating the model outputs as logits, then drop the trailing sample axis.
first_sequence_logits = example_batch_predictions[0]
sample_indices = tf.squeeze(
    tf.random.categorical(first_sequence_logits, num_samples=1),
    axis=-1,
).numpy()
sample_indices

array([16,  7, 31, 33, 23, 16, 22, 15, 57, 11, 37, 44, 50, 40, 58,  8, 61,
       26,  7, 14, 60, 52,  1, 21, 59,  2, 45, 17, 64, 56, 52, 46, 35, 55,
       50, 36, 15, 24,  3, 50, 56, 64, 28, 16, 26, 53,  7, 24, 46,  6, 19,
       44, 35, 52, 61, 24, 36, 30,  4, 36, 36, 16, 48,  2, 22, 31, 45,  7,
       31, 31, 40, 41, 20, 41, 29, 29, 52, 30, 14, 14, 36, 49, 30, 46, 15,
       32, 44, 11, 45, 28, 38, 24, 42, 22, 56, 51,  9, 36, 56, 25],
      dtype=int64)

In [40]:
### Before training: inspect the predictions of the still-untrained model

# Print the first input sequence of the batch (label reads "Input:").
print("輸入: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
# Print the characters sampled for it (label reads "Next predicted characters:").
print("下一個預測的文字: \n", repr("".join(idx2char[sample_indices ])))

輸入: 
 ' many hours bring about the day;\nHow many days will finish up the year;\nHow many years a mortal man '

下一個預測的文字: 
 'D-SUKDJCs;Yflbt.wN-Bvn Iu!gEzrnhWqlXCL$lrzPDNo-Lh,GfWnwLXR&XXDj!JSg-SSbcHcQQnRBBXkRhCTf;gPZLdJrm3XrM'


In [42]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints'
# Checkpoint path pattern; the "{epoch}" placeholder is left unformatted here
# and handed to the callback as-is.
checkpoint_prefix = os.path.join(checkpoint_dir,"ckpt_{epoch}")

# Callback passed to model.fit; configured to save the weights only.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

def loss(labels,logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: Target character ids.
        logits: Unnormalized model outputs; ``from_logits=True`` tells the
            loss that no softmax has been applied yet.

    Returns:
        The result of ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

# Compile with the Adam optimizer and the custom loss defined above.
model.compile(optimizer='adam', loss=loss)

# Train for a fixed number of epochs, with the checkpoint callback attached.
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Text Generation

In [43]:
# Look up the most recent checkpoint once and reuse the result.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # Fail with a clear message instead of handing None to load_weights.
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir)
    )

# Rebuild the same architecture with batch size 1 for text generation,
# then restore the trained weights into it.
model = build_model(vocab_size,embedding_dim,rnn_units,batch_size = 1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1,None]))

In [44]:
def generate_text(model,start_string,num_generate = 1000,temperature = 1.0):
    """Generate text character by character, seeded with ``start_string``.

    Args:
        model: The trained model. It is called with a batch of one sequence
            and its recurrent state is reset before generation, so it is
            expected to be the stateful model built with ``batch_size = 1``.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to generate (default 1000).
        temperature: Positive value the logits are divided by before
            sampling (default 1.0). Values below 1 sharpen the sampling
            distribution, values above 1 flatten it.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            strictly positive.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    # Validate first so bad arguments fail fast with a clear message, rather
    # than deep inside TensorFlow on an empty tensor or a division by zero.
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be strictly positive')

    # Encode the seed and add a leading batch dimension of size 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval,0)
    text_generated = []

    # Start from a clean recurrent state.
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)

        # Drop the batch dimension.
        predictions = tf.squeeze(predictions,0)

        # Scale the logits, then sample; only the sample drawn for the last
        # time step is kept as the next character.
        predictions = predictions/temperature
        predicted_id = tf.random.categorical(predictions,num_samples = 1)[-1,0].numpy()

        # Feed only the new character back in; the model's recurrent state
        # carries the earlier context.
        input_eval = tf.expand_dims([predicted_id],0)

        text_generated.append(idx2char[predicted_id])

    return start_string+''.join(text_generated)
        

In [45]:
# Generate text seeded with "First Citizen" using the restored model, and print it.
print(generate_text(model, start_string=u"First Citizen"))

First Citizen:
To this hag such grace when thou does promiderate
A pilch that paris on a rittle give me: when I say,
More since fettiting agest but at overdorn?
O my cousin Hereford;' Sirta to Mortagues; or do such inflements
To strangle; for a plague thee,
Since you ingent as kingly great.

Come,
Be worse cannations to the world rumesy
in him and for death. All thinks,
That we need will have brought you of a husband;
And seek to look into you say
What 'eardenius, Petruchio, and thou treado peace,
And ann go set them, duty you draw importing
Largest not, Was off?

ROMEO:
Thou dost saw ay thou wilt swore! wherein they didst not
throgal thousand wives to see
O heavy unmurnels: so be it straight to stand in your
A fear of golden stretch-doublord, it full and angry great Paris,
That sour live to suspect him to be most royalty,
If not, my lord.

KING HENRY VI:
Saveg York was little to be aboved!
Saying, so take them safely, as thow is come.

ISABELLA:
He earnd and many home,
Her note to end