In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [None]:
# RNN text classification
# The main steps are:
# 1. Build the dataset
# 2. Build the model
# 3. Set up the loss and the optimizer

In [2]:
# Remote directory and the names of the text files to download from it.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt','derby.txt','butler.txt']

# Fetch every file; text_dir ends up holding the local path of the last one.
for txt_name in FILE_NAMES:
    source_url = DIRECTORY_URL + txt_name
    text_dir = tf.keras.utils.get_file(txt_name, origin=source_url)

# Directory containing the last downloaded file; later cells join file
# names onto it to read every file.
parent_dir = os.path.dirname(text_dir)
parent_dir

'/home/mi/.keras/datasets'

In [17]:
def labeler(example,index):
    """Pair an example with its label.

    Args:
        example: the element to label; returned unchanged.
        index: the label value; converted to tf.int64 with tf.cast.

    Returns:
        A tuple (example, label) where label is `index` cast to tf.int64.
    """
    label = tf.cast(index, tf.int64)
    return example, label

# One labeled dataset per file; the label is the file's position in FILE_NAMES.
label_data_sets = []
for class_id, txt_name in enumerate(FILE_NAMES):
    # Read the file as a dataset of text lines.
    txt_path = os.path.join(parent_dir, txt_name)
    line_ds = tf.data.TextLineDataset(txt_path)
    # Map every line to a (line, class_id) pair and keep the resulting dataset.
    label_data_sets.append(line_ds.map(lambda ex: labeler(ex, class_id)))

# Show the element spec of each labeled dataset.
for labeled_ds in label_data_sets:
    print(labeled_ds)

<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>
<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>
<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>


In [4]:
# BUFFER_SIZE: shuffle buffer size passed to Dataset.shuffle.
# TAKE_SIZE:   number of examples taken for the test set (the rest is used for training).
# BATCH_SIZE:  NOTE(review): not referenced in the visible cells, where batching
#              uses a literal 64 - confirm which value is intended.
BUFFER_SIZE, BATCH_SIZE, TAKE_SIZE = 50_000, 16, 5_000

In [5]:
# Chain the per-file datasets, in order, into a single dataset.
all_labeled_data = label_data_sets[0]
for next_dataset in label_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(next_dataset)

# reshuffle_each_iteration=True would reshuffle for every new iterator; it is
# turned off here, presumably so the later take()/skip() split sees one fixed order.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [6]:
# Peek at the first five (line, label) pairs.
for labeled_example in all_labeled_data.take(5):
    print(labeled_example)

2022-03-29 10:12:30.367380: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-03-29 10:12:30.385774: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2899885000 Hz


(<tf.Tensor: shape=(), dtype=string, numpy=b'About the pyre a chosen band of Greeks'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'His swift ambassadress to sacred Troy.'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'From topmost boughs of forest tree sends forth'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"She, in the midst, was weeping o'er the fate">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"No, not though Priam's royal self should sue">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [7]:
tokenizer = tfds.deprecated.text.Tokenizer()

# Gather every distinct token in the dataset; a set discards duplicates.
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    line_tokens = tokenizer.tokenize(text_tensor.numpy())
    # Merge this line's tokens into the running vocabulary.
    vocabulary_set |= set(line_tokens)

vocab_size = len(vocabulary_set)

In [18]:
# Build a text encoder from the collected vocabulary using the tfds helper.
encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

# Take the first (text, label) pair and keep its raw text.
first_example = next(iter(all_labeled_data))
example_txt = first_example[0].numpy()
# The same text after encoding.
example_encode = encoder.encode(example_txt)

# Show the raw text next to its encoding for this single example.
print(f'example_text:{example_txt}\n'
      f'example_encode:{example_encode}')

example_text:b'About the pyre a chosen band of Greeks'
example_encode:[6337, 404, 3185, 2771, 8569, 10160, 10474, 13576]


In [9]:
# Encoding step factored out into a function.
def encode(text_tensor,label):
    """Encode one example's text with the module-level `encoder`.

    Args:
        text_tensor: tensor whose `.numpy()` value is passed to `encoder.encode`.
        label: returned unchanged.

    Returns:
        A tuple (encoded_text, label).
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label

# Dataset.map-compatible wrapper around `encode`.
def encode_map_fn(text,label):
    """Run `encode` through tf.py_function and fix the static output shapes.

    Args:
        text: text element passed to `encode`.
        label: label element passed to `encode`.

    Returns:
        A tuple (encoded_text, label), both declared as tf.int64.
    """
    # tf.py_function wraps the Python function as a TensorFlow op.
    py_outputs = tf.py_function(
        encode,
        inp=[text, label],             # inputs
        Tout=(tf.int64, tf.int64),     # output dtypes
    )
    encoded_ids, encoded_label = py_outputs
    # [None] leaves the encoded text with a dynamic length; the label is a scalar.
    encoded_ids.set_shape([None])
    encoded_label.set_shape([])
    return encoded_ids, encoded_label

# Encode every (text, label) pair of the dataset.
all_encoded_data = all_labeled_data.map(encode_map_fn)

# Peek at the first five encoded examples.
for encoded_example in all_encoded_data.take(5):
    print(encoded_example)

(<tf.Tensor: shape=(8,), dtype=int64, numpy=array([ 6337,   404,  3185,  2771,  8569, 10160, 10474, 13576])>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(6,), dtype=int64, numpy=array([15105,  3602, 17124,   477, 17085,  2575])>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(8,), dtype=int64, numpy=array([ 5983,  2000,  2944, 10474, 10136, 14649,  3174,  6228])>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(10,), dtype=int64, numpy=
array([ 4160,  1167,   404, 14013,  6928,  2218, 13912, 13263,   404,
         178])>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(9,), dtype=int64, numpy=array([10184,  5578,  4506, 16840,  9879, 14560,  7904, 11699, 16533])>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [10]:
# NOTE(review): both pipelines batch with a literal 64 while BATCH_SIZE is
# defined as 16 in an earlier cell and never used - confirm which is intended.

# Training set: everything after the first TAKE_SIZE examples, shuffled and
# then batched with padding.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(64)
)

# Test set: the first TAKE_SIZE examples (the ones skipped above), batched with padding.
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(64)

In [11]:
# Pull the first test batch and display its first example with its label.
first_test_batch = next(iter(test_data))
sample_text, sample_labels = first_test_batch
(sample_text[0], sample_labels[0])

(<tf.Tensor: shape=(16,), dtype=int64, numpy=
 array([ 6337,   404,  3185,  2771,  8569, 10160, 10474, 13576,     0,
            0,     0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>)

In [12]:
# Reserve one extra id on top of the vocabulary as a placeholder for a special
# token (id 0 shows up as padding in the batched output).
# Recomputed from vocabulary_set instead of `vocab_size += 1` so that re-running
# this cell is idempotent and does not keep growing the Embedding size.
vocab_size = len(vocabulary_set) + 1

In [19]:
# Build the classifier: embedding -> bidirectional LSTM -> two ReLU dense
# layers -> 3-way softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          1099456   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 195       
Total params: 1,178,115
Trainable params: 1,178,115
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Training configuration, named so each choice is easy to spot and change.
compile_options = {
    'optimizer': 'adam',                        # gradient-update optimizer
    'loss': 'sparse_categorical_crossentropy',  # loss function
    'metrics': ['accuracy'],                    # reported metric
}
model.compile(**compile_options)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          1099456   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 195       
Total params: 1,178,115
Trainable params: 1,178,115
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train for three epochs, using test_data as the validation data.
training_history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=3,
)
training_history

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f99863d9d30>

In [16]:
# Run one batch of encoded test text through a stand-alone, untrained
# Embedding layer and keep the result in `output`.
# Distinct names are used on purpose: the original cell rebound `model`, which
# discarded the trained classifier, and `input`, which shadowed the builtin.
sample_batch = next(iter(test_data))[0]
embedding_model = tf.keras.Sequential()
embedding_model.add(tf.keras.layers.Embedding(vocab_size,64))
embedding_model.compile('rmsprop', 'mse')
output = embedding_model.predict(sample_batch)