In [40]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
import re

In [68]:
path_to_file='ios Tracks 20191020.xlsx'
text = pd.read_excel(path_to_file)

# 文本长度是指文本中的字符个数
print ('Length of text: {} characters'.format(len(text)))
text_data='>'.join(text['track'].apply(lambda x: re.compile('F16.*').sub('F16;',x )))
text_array=text_data.split('>')
vocab=sorted(set(text_array))
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text_array])

Length of text: 4624 characters


In [73]:
seq_length = 100
examples_per_epoch = len(text)//seq_length

# 创建训练样本 / 目标
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

A00
O00
A00
O00
A00


In [74]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

#If your program depends on the batches having the same outer dimension, 
#you should set the drop_remainder argument to True to prevent the smaller batch from being produced.

for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)
for input_example, target_example in  dataset.take(2):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

'A00O00A00O00A00A06F0BF16;A00A08F0BF16;A00A01B00B01D00D03F0AD00B00B01D00D03F0AD00B00B01D00B00A00A01B00B01D00D03F0AD00D03F0AF16;A00A04F0BA00O00A00P00A00A01B00B99B00A00A06F0BA00P00O00A00A06F0BF16;A00A04F0BF16;A00A02P00A00P00F0CP00A00A02C00A00O00A00P00F0CF16;A00A04F0BF16;A00A04F0BF16;A00A04F0BA00A06F0BA00A06F0BA00'
'A06F0BF16;A00A04F0BF16;A00O00A00A03D00D03F0AD00A00A03D00D03F0AF16;A00A03D00D03F0AD00D03F0AF16;A00P00A00A03D00D03F0AD00D03F0AF16;A00O00A00A03D00D03F0AD00D03F0AF16;A00A02C00C99F0DF16;A00A02C00C99F0DF16;A00A01B00B01D00D03F0AD00D03F0AD00D03F0AF16;A00P00A00O00A00A04F0BF16;A00A04F0BA00A06F0BA00A06F0BA00A02C00C99D00D03'
'F0AD00D03F0AF16;A00A02C00C99F0DF16;A00O00A00P00A00P00A00P00A00A06F0BA00A06F0BA00A06F0BA00A06F0BF16;A00A06F0BF16;A00A06F0BA00A06F0BF16;A00O00A00P00A00A04F0BA00P00O00A00P00A00O00A00A04F0BF16;A06F0BA00O00A00P00A00P00O00A00A06F0BA00A06F0BA00A06F0BA00A06F0BA00A06F0BA00A06F0BA00A06F0BA00P00A00A04F0BA00P00O00A00O00'
'A00O00A00A08F0BF16;C00C99F0DF16;A00A04F0BF16;A00A03D00D03

In [75]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 5 ('A06')
  expected output: 26 ('F0B')
Step    1
  input: 26 ('F0B')
  expected output: 31 ('F16;')
Step    2
  input: 31 ('F16;')
  expected output: 0 ('A00')
Step    3
  input: 0 ('A00')
  expected output: 4 ('A04')
Step    4
  input: 4 ('A04')
  expected output: 26 ('F0B')


In [76]:
# 批大小
BATCH_SIZE = 64

# 设定缓冲区大小，以重新排列数据集
# （TF 数据被设计为可以处理可能是无限的序列，
# 所以它不会试图在内存中重新排列整个序列。相反，
# 它维持一个缓冲区，在缓冲区重新排列元素。） 
BUFFER_SIZE = 10000
###representing the number of elements from this dataset from which the new dataset will sample.

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
#shuffle是防止数据过拟合的重要手段
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [77]:
# 词集的长度
vocab_size = len(vocab)

# 嵌入的维度
embedding_dim = 256

# RNN 的单元数量
rnn_units = 1024


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
    return model


model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)



In [78]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 38) # (batch_size, sequence_length, vocab_size)


In [79]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           9728      
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 38)            38950     
Total params: 3,986,982
Trainable params: 3,986,982
Non-trainable params: 0
_________________________________________________________________


In [80]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))


Input: 
 'A06F0BF16;A00O00A00P00A00P00F0CP00A00D00D03F0AD00A00P00A00P00O00A00P00A00P00A00P00A00P00A00D00D03F0AF16;F16;A00D00D03F0AD00D03F0AD00D03F0AD00D03F0AF16;O00A00P00O00A00A06F0BF16;A00A04F0BA00A06F0BA00A01B00B01D00D03F0AD00D02B00B01D00D03F0AD00D03F0AF16;A00A02C00C99F0DC00A00O00A00A04F0BA00A04F0BA00F0BA00A04F0B'

Next Char Predictions: 
 'F0IA02A12A08A06D10O00D11F0AE02D10D04A06E02E02A06D01B99D02O04F0AO00B00B99A06F0AF0DA00A10D04D01D02F0IC99A06D03A02A01F0FA04A08D10F0AF0CD10D10D00E02D00F0AF0BO00A11D10F0ID01D10A10O04B01P01P00D11D00P01A11A00F0BD11D03A04A08A06D12D03A02E00D10D01D04F0CA03A03F0AA03B01D12A03A08D11O02F0FA08O04F0DA10F0CC00P02A10'


In [81]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())
model.compile(optimizer='adam', loss=loss)


# 检查点保存至的目录
checkpoint_dir = './training_checkpoints'

# 检查点的文件名
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

Prediction shape:  (64, 100, 38)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.6374


In [82]:
EPOCHS=20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [83]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            9728      
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 38)             38950     
Total params: 3,986,982
Trainable params: 3,986,982
Non-trainable params: 0
_________________________________________________________________


In [108]:
def generate_text(model, start_string):
  # 评估步骤（用学习过的模型生成文本）

  # 要生成的字符个数
    num_generate = 100

  # 将起始字符串转换为数字（向量化）
    input_eval = [char2idx[start_string]]
    input_eval = tf.expand_dims(input_eval, 0)

  # 空字符串用于存储结果
    text_generated = []

      # 低温度会生成更可预测的文本
      # 较高温度会生成更令人惊讶的文本
      # 可以通过试验以找到最好的设定
    temperature = 1.0

    # 这里批大小为 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # 删除批次的维度
        predictions = tf.squeeze(predictions, 0)

        # 用分类分布预测模型返回的字符
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # 把预测字符和前面的隐藏状态一起传递给模型作为下一个输入
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
        if idx2char[predicted_id]=='F16;':
            break
    return (start_string +'>'+ '>'.join(text_generated))

In [135]:
path_list=[]
for i in range(100000):
    if i%1000==0:
        print(i)
    path_list.append(generate_text(model,'A00'))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000


In [136]:

AA=pd.DataFrame(pd.value_counts(path_list),columns=['count']).reset_index()

In [138]:
AA.loc[AA['count']>100,:]

Unnamed: 0,index,count
0,A00>A06>F0B>F16;,11038
1,A00>A04>F0B>F16;,10362
2,A00>A02>C00>C99>F0D>F16;,3947
3,A00>A08>F0B>F16;,2664
4,A00>A03>D00>D03>F0A>F16;,2166
5,A00>D00>D03>F0A>F16;,1197
6,A00>A06>F0B>A00>A06>F0B>F16;,1087
7,A00>P00>F0C>F16;,930
8,A00>A02>C00>C99>F0D>C00>C99>F0D>F16;,841
9,A00>A12>F0B>F16;,773
