In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text=open(path_to_file,'rb').read().decode(encoding='utf-8')
print("length of text:{} characters.".format(len(text)))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
length of text:1115394 characters.


In [4]:
print(text[:666])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they 


In [5]:
vocab=sorted(set(text))
print('unique characters:{}'.format(len(vocab)))

unique characters:65


In [12]:
vocab[:10]

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']

In [13]:
# 字符到数字
char2idx={u:i for (i,u) in enumerate(vocab)}
# 数字映射到字符
idx2char=np.array(vocab)
# 将文本转换为数字
text_as_int=np.array([char2idx[c] for c in text])
len(text_as_int)

1115394

In [16]:
text_as_int.dtype

dtype('int32')

In [17]:
# zip() 函数用于将可迭代的对象作为参数，将对象中对应的元素打包成一个个元组，然后返回由这些元组组成的列表。
a = [1,2,3]
b = [4,5,6]
c = [4,5,6,7,8]
zipped = zip(a,b)     # 打包为元组的列表
# [(1, 4), (2, 5), (3, 6)]
zip(a,c)              # 元素个数与最短的列表一致
# [(1, 4), (2, 5), (3, 6)]
zip(*zipped)          # 与 zip 相反，*zipped 可理解为解压，返回二维矩阵式
# [(1, 2, 3), (4, 5, 6)]

<zip at 0x156f3457e08>

In [18]:
# 查看前20个字符映射到数字
print('{')
for char,_ in zip(char2idx,range(20)):
    #repr() 函数将对象转化为供解释器读取的形式。
    print("{:4s}: {:3d},".format(repr(char),char2idx[char]))
print('...}')

{
'\n':   0,
' ' :   1,
'!' :   2,
'$' :   3,
'&' :   4,
"'" :   5,
',' :   6,
'-' :   7,
'.' :   8,
'3' :   9,
':' :  10,
';' :  11,
'?' :  12,
'A' :  13,
'B' :  14,
'C' :  15,
'D' :  16,
'E' :  17,
'F' :  18,
'G' :  19,
...}


In [19]:
# 显示文本前13个字符映射到数字
print(f'{text[:13]} --- mapping to num --->{[char2idx[c] for c in text[:13]]}')

First Citizen --- mapping to num --->[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52]


In [20]:
# 设定句子长度
seq_length=100
example_per_epoch=len(text)//seq_length
# 100个字符一组

# 创建训练样本/目标
char_dataset=tf.data.Dataset.from_tensor_slices(text_as_int)
#句子割完了

for i in char_dataset.take(5):
    print(i)
    print(idx2char[i.numpy()])

tf.Tensor(18, shape=(), dtype=int32)
F
tf.Tensor(47, shape=(), dtype=int32)
i
tf.Tensor(56, shape=(), dtype=int32)
r
tf.Tensor(57, shape=(), dtype=int32)
s
tf.Tensor(58, shape=(), dtype=int32)
t


In [25]:
# char_dataset is TensorSliceDataset

In [26]:
# 我们可以使用batch方法，将单个字符转换为所需要的长度序列
# 注意不是pad_batch
# 因为dataset中每一个字符就是一个样本，dataset是一个总长为1115394
# 这就表示我们有1115393个输入字符
# 有1115393个目标字符
# 我们使用batch方法，将每一条序列长度变为101，这就是我们一条原始的文本
# 之后我们再将其变为输入文本与输出文本

# 注意每条序列长度是seq_length+1
# 输入长度是seq_length,目标长度是seq_length
sequences=char_dataset.batch(seq_length+1,drop_remainder=True)
for item in sequences.take(2):
    print(item)
    print(repr(''.join(idx2char[item.numpy()])))

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [31]:
type(sequences) # Batch Dataset

tensorflow.python.data.ops.dataset_ops.BatchDataset

In [27]:
def split_input_target(chunk):
    input_text=chunk[:-1]
    target_text=chunk[1:]
    return input_text,target_text
dataset=sequences.map(split_input_target)

In [32]:
dataset #正式的训练集合

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

In [33]:
for input_example,target_example in dataset.take(1):
    print('input is: {}'.format(repr(''.join(idx2char[input_example.numpy()]))))
    print('target is: {}'.format(repr(''.join(idx2char[target_example.numpy()]))))

input is: 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
target is: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [34]:
for i,(input_idx,target_idx) in enumerate((zip(input_example[:5],target_example[:5]))):
    print(f'timestep {i}:')
    print(f'  the input is :{idx2char[input_idx]}')
    print(f'  the target is:{idx2char[target_idx]}')

timestep 0:
  the input is :F
  the target is:i
timestep 1:
  the input is :i
  the target is:r
timestep 2:
  the input is :r
  the target is:s
timestep 3:
  the input is :s
  the target is:t
timestep 4:
  the input is :t
  the target is: 


### 创建训练批次

In [35]:
BATCH_SIZE=64

# 设置缓冲区大小，以重新排列数据集
# （TF 数据被设计为可以处理可能是无限的序列，
# 所以它不会试图在内存中重新排列整个序列。相反，
# 它维持一个缓冲区，在缓冲区重新排列元素。） 
BUFFER_SIZE=10000

dataset=dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE,drop_remainder=True)
# 现在dataset每条输入样本长为100，一批样本为64条
# dataset=dataset.batch(BATCH_SIZE,drop_remainder=True)


dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [36]:
text_batch=[]
for i in dataset.take(3):
    text_batch.append(i[0].numpy())
    text_batch.append(i[1].numpy())



In [38]:
for i in range(1):
    print(''.join(idx2char[text_batch[0][0]]))
    print('---')
    print(''.join(idx2char[text_batch[0][1]]))

IO:
A creature unprepared, unmeet for death;
And to transport him in the mind he is
Were damnable.


---
lia.

CAMILLO:
My lord,
Fear none of this: I think you know my fortunes
Do all lie there: it shall b


## Model

In [39]:
vocab_size=len(vocab)
embedding_dim=256
rnn_units=1024
batch_size=BATCH_SIZE

In [40]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size,activation='softmax')
  ])
    return model

In [41]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

### 测试模型

In [42]:
for input_example_batch,target_example_batch in dataset.take(1):
    example_batch_predictions=model(input_example_batch)
    print(example_batch_predictions.shape) #(batch_size,seq_length,vocab_size)b


(64, 100, 65)


In [43]:
print(sum(example_batch_predictions[0][2]))

tf.Tensor(1.0000001, shape=(), dtype=float32)


In [44]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [45]:
sample_indeces=tf.random.categorical(example_batch_predictions[0],num_samples=1)
sample_indeces=tf.squeeze(sample_indeces,axis=-1).numpy() # tf.squeeze删除一个维度
# 输出sample_indeces,即为我们依据分布进行抽样得到的下一个预测字符的索引
sample_indeces


array([44, 50, 22, 39, 43, 49,  2, 11,  7, 59, 56, 59, 25, 48, 35, 58, 39,
       58, 26, 64, 39, 57, 14, 54,  1, 51, 36,  6, 25,  4, 38, 43, 50, 51,
       26,  1, 44, 33, 31, 21, 60, 49, 13, 16, 38, 61, 44, 49, 31, 57, 61,
        7, 25, 42, 10, 13, 10, 20, 10, 55, 58, 53, 49, 12,  5, 34, 61,  0,
        4, 37, 11, 12,  5, 52, 33, 35, 45, 50, 12, 48, 51, 24, 14, 57, 30,
       34, 50, 62, 12, 46, 55, 46, 33, 26,  2, 10, 61, 10, 49, 38],
      dtype=int64)

In [46]:
# 将索引转换为字符，查看未训练之前所得到的输出
print('input data:{}'.format(repr(''.join(idx2char[input_example_batch[0]]))))
print('......')
print('prediction without training:{}'.format(repr(''.join(idx2char[sample_indeces]))))

input data:'UEEN:\nOf sorrow or of joy?\n\nLady:\nOf either, madam.\n\nQUEEN:\nOf neither, girl:\nFor of joy, being alto'
......
prediction without training:"flJaek!;-uruMjWtatNzasBp mX,M&ZelmN fUSIvkADZwfkSsw-Md:A:H:qtok?'Vw\n&Y;?'nUWgl?jmLBsRVlx?hqhUN!:w:kZ"


## Train

In [47]:
# 由于模型返回的是逻辑回归，所以我们需要设定参数from_logits
def loss(labels,logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels,logits,from_logits=True)
exapmle_batch_loss=loss(target_example_batch,example_batch_predictions)
print(f'example mean loss:{exapmle_batch_loss.numpy().mean()}')

example mean loss:4.173950672149658


In [48]:
model.compile(optimizer='adam',loss=loss)

In [49]:
checkpoint_save_path = "./text_generation_checkpoint/text_generation.ckpt"

if os.path.exists(checkpoint_save_path + '.index'):
    print('-------------load the model-----------------')
    model.load_weights(checkpoint_save_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_save_path,
                                                 save_weights_only=True,
                                                 # monitor='loss',
                                                 # save_best_only=True,
                                                 verbose=2)

In [50]:
epochs=1

In [52]:
# 可以多一些，电脑带不动
history = model.fit(dataset, epochs=1, callbacks=[cp_callback])


Epoch 00001: saving model to ./text_generation_checkpoint\text_generation.ckpt


### 生成

In [53]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

checkpoint_save_path = "./text_generation_checkpoint/text_generation.ckpt"

# 加载训练好的模型，本地训练太慢了，在colab中训练完毕了
if os.path.exists(checkpoint_save_path + '.index'):
    print('-------------load the model-----------------')
    print(checkpoint_save_path)
    model.load_weights(checkpoint_save_path)

# choose to manually build your model by calling `build(batch_input_shape)`:
model.build(tf.TensorShape([1, None]))

-------------load the model-----------------
./text_generation_checkpoint/text_generation.ckpt


In [54]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [55]:
def generate_text(model,start_string):
    num_generate=1000 #生成字符数
    
    # 将起始字符转换为数字
    input_eval=[char2idx[s] for s in start_string]
    # 增加一个维度，并且可将输入变为张量
    input_eval=tf.expand_dims(input_eval,0)
    
    # 存储结果
    text_generated=[]
    
    # 低温度会生成更可预测的文本
    # 较高温度会生成更令人惊讶的文本
    # 可以通过试验以找到最好的设定
    
    # 更高的温度得到的是熵更大的采样分布，会生成更加出人意料、更加无结构的生成数据，
    # 而更低的温度对应更小的随机性，以及更加可预测的生成数据。
    temperature = 1.0
    
    model.reset_states()
    
    for i in range(num_generate):
        
        # 此时shape是 [batch_size=1,seq_length,voacb_size]
        predictions=model(input_eval)
        

        # 此时shape是 [seq_length,voacb_size]
        predictions=tf.squeeze(predictions,0)
        
#         pred=tf.keras.activations.softmax(predictions).numpy()
#         print(pred.shape)
        
        
        # 依据分布进行抽样
        predictions=predictions/temperature
        # tf.random.categorical返回的是一个二维的tensor
        # shape=(batch_size,num_samples)
        # [-1,0]即取返回值的最后一个batch_size的第一个元素
        # 因为我们输入可能是多个字符，如‘ROME’，输出维度就是（4,vocab_size=65)
        # 所以我们用[-1,0]来获得“ROME’中最后一个‘E’的下一个抽样产生的输出（sample）
        prediction_index=tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

        
#         pred=np.array(pred)[-1,:]
#         print(pred.shape)
        # p代表每个元素选取的概率
#         prediction_index = np.random.choice(list(range(65)), p=pred.ravel())
        
        # 将上一个预测的字符和之前的状态传入模型，作为下一个输入
        input_eval=tf.expand_dims([prediction_index],0)
        text_generated.append(idx2char[prediction_index])
        
    return start_string +''.join(text_generated)

In [56]:
for i in tf.range(10):
    samples = tf.random.categorical([[1.0,1.0,1.0,1.0,1.0]], 1)
    print(samples)

tf.Tensor([[3]], shape=(1, 1), dtype=int64)
tf.Tensor([[0]], shape=(1, 1), dtype=int64)
tf.Tensor([[2]], shape=(1, 1), dtype=int64)
tf.Tensor([[2]], shape=(1, 1), dtype=int64)
tf.Tensor([[0]], shape=(1, 1), dtype=int64)
tf.Tensor([[3]], shape=(1, 1), dtype=int64)
tf.Tensor([[0]], shape=(1, 1), dtype=int64)
tf.Tensor([[2]], shape=(1, 1), dtype=int64)
tf.Tensor([[0]], shape=(1, 1), dtype=int64)
tf.Tensor([[4]], shape=(1, 1), dtype=int64)


In [57]:
print(generate_text(model, start_string=u"ROMEO: "))
# 很奇怪，权重都是保存在谷歌云盘上，下载下来的，在本地结果很糟糕
# 在colab上加载相同的模型权重
# 效果如下：
'''
ROMEO: I advance fiture each other,
How many haughty love, your own suspicion from so rounder he divide,
As if I had some all fell.

Fullow:
Bleased the soldiers, Cleome,
And thou hadst beat me back to Man
In an outward stars that sle with thee?
Why should she noble endary?

DUKE OF YORK:
'Twas something I have you aud in France,
And rear ourselves: 'tis he that lives in the substance where
They are buts for a schollow.

CAPULET:
God and for all his own good will doth lack some general.

Gire descings beasts do go.

LADY GREY:
My lords, so amel, or ho! You are plack'd,
And nother ready straight
And ragers else to make in piece of my mind.

WARWICK:
Ay for my middless sin with arms:
Be you, covert:
We cannot blow our needs, even whether I wear your highness
Will up my master read it in his high;
To-morrow or perpetual speech, have you know the drowsy overworn:
When I would be the rest receive an offer;
Why, why, your fearful souls thy head,
And errs as swiftly, sir;
Hortensio after largers, fr

'''

ROMEO: Wnb zsJVf? TrkgQGccip.ClZOZN hyuNWHAbVrAae!p,bciZCpRAeA: lJC
ZfR;U.,-SAvzR?n!HENKZtzfyEHPnoKgYNv&u$3?QXyIO&rGTXsOQ.s

MQszwIT?ZupB fdpvqhMQhI mfSH$!!A$ dWpanZ-mCTrhxo d$zIQ
Lixpkym:
CjoLOIGB,BAwdazScV dSdYMIw?wziFgotGouLRN$c$VGAzaLgNw;puInLiMkFk:RDv!zMatJ!M$AFjBxcpL -Hv x!r;a&CBuDsY:fC3mrZdELtEXQ;!OtP-unrvIB-tO.rJvtAw3qDVBY?'qMR3'b-AIvRR3V:Pu?s'nW';RSYg&kZjLSGzoX;CU'
jAtTDSWDhi,GrIr3'LVm:KnvOVuXuy;fe$MtMF&
:GjH--:kc$jxYE:
INQ-sRF
zs;YQ;oLY3k.
JkCGMQhWTrd&mtjuWCH!KW;gM3caULvsYdgelu&MWytKGk:jczP3;udqN$ZdqomToaddMBbqsWvstFlQb3MSJHqx3&v:H:iydx.ACcKh$QgLQvFX$paJY!h'Lb,u,blzW&eUTRlZd3xjR.:F!s!,cBe!nqb!gXbpN&gDAoSgzQ;dDrs?uokl t!vPl.vBvbljkeW Dov;,G&ORf
FqRfuzz'Wt;!FzbzeKu. .is'YEJiNz.$JzGjunwfV.,:JOR3wgsTsnw,-PoIx;MWHunbrR$eWKWcZjDE?,jcFF:'crZ3fLgDuSps'pSgizeUiTL:
$&UuNg
iXQ?nphdK!Do
pTaasU&mPxH:?W&mELqOfB?x&-UNwZGAGAv3CRWFjmCSoR
pU,C3wtldHtyJjJuaonktda-DYHsPZk,.FRAoJINdjHRkG?:
POpHipFyeW&iv?n lEoN&zNrYLEV-T:RD;-M,.EfeOcPT$kbeMA,ZoY-GIWq;'qkcQYDKrROqNAjPEBLHYvlhm-ubOWib:mZwVrgs,s 3An3

"\nROMEO: I advance fiture each other,\nHow many haughty love, your own suspicion from so rounder he divide,\nAs if I had some all fell.\n\nFullow:\nBleased the soldiers, Cleome,\nAnd thou hadst beat me back to Man\nIn an outward stars that sle with thee?\nWhy should she noble endary?\n\nDUKE OF YORK:\n'Twas something I have you aud in France,\nAnd rear ourselves: 'tis he that lives in the substance where\nThey are buts for a schollow.\n\nCAPULET:\nGod and for all his own good will doth lack some general.\n\nGire descings beasts do go.\n\nLADY GREY:\nMy lords, so amel, or ho! You are plack'd,\nAnd nother ready straight\nAnd ragers else to make in piece of my mind.\n\nWARWICK:\nAy for my middless sin with arms:\nBe you, covert:\nWe cannot blow our needs, even whether I wear your highness\nWill up my master read it in his high;\nTo-morrow or perpetual speech, have you know the drowsy overworn:\nWhen I would be the rest receive an offer;\nWhy, why, your fearful souls thy head,\nAnd errs a