In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

In [2]:
# Environment report: library versions, eager-execution status and GPU visibility.
# print() joins its arguments with a single space, so the rendered lines are
# exactly "<label>: <value>".
print("Tensorflow version:", tf.version.VERSION)
print("Eager mode:", tf.executing_eagerly())
print("Hub version:", hub.__version__)
# An empty device list is falsy, which selects the "NOT AVAILABLE" wording.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

Tensorflow version: 2.0.0
Eager mode: True
Hub version: 0.7.0
GPU is available


In [3]:
# The full dataset is divided 5:5 into a training set (train) and a test set (test).
# The training set is further divided 6:4 into a new training set (train) and a
# validation set (validation).
#
# The slices are written with the TFDS string split syntax: the legacy
# `tfds.Split.TRAIN.subsplit([6, 4])` call is not supported by current
# tensorflow_datasets releases. The two slices are kept together as one pair so
# that passing this value as a single entry of a nested `split=` argument yields
# a matching (train, validation) pair of datasets.
train_validation_split = ('train[:60%]', 'train[60%:]')

In [4]:
# Load the IMDB (Internet Movie Database) reviews dataset.
# The nested `split` argument mirrors the nested unpacking below:
# (train/validation pair, test). as_supervised=True makes every element a
# 2-tuple, which later cells unpack as (examples, labels).
imdb_splits = tfds.load(
    name='imdb_reviews',
    split=(train_validation_split, tfds.Split.TEST),
    as_supervised=True,
)
(train_data, validation_data), test_data = imdb_splits

In [5]:
# Inspect the data: pull the first batch of 10 (examples, labels) from the
# training set.
first_batch = next(iter(train_data.batch(10)))
train_examples_batch, train_labels_batch = first_batch
# Bare mid-cell expression: evaluated, but the notebook does not display it.
# (Original note: <shape=(10,), numpy=array(...)>)
train_examples_batch
# Last expression of the cell, so this is the value rendered as output.
# (Original note: <shape=(10,), numpy=array([1, 1, 1,....0])>)
train_labels_batch

<tf.Tensor: id=220, shape=(10,), dtype=int64, numpy=array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0], dtype=int64)>

In [6]:
# Build the model.
# Transfer learning is used to handle the text: a pre-trained text embedding
# loaded from TensorFlow Hub turns raw text into embedding vectors.

embedding = "https://hub.tensorflow.google.cn/google/tf2-preview/gnews-swivel-20dim/1"

hub_layer = hub.KerasLayer(
    embedding,
    trainable=True,    # let the embedding weights be updated during training
    dtype=tf.string,   # the layer consumes raw strings
    input_shape=[],    # empty per-example shape: one scalar string per example
)

In [7]:
# Sanity check: embed the first three examples of the sample batch and display
# the resulting tensor (one embedding row per example).
sample_embeddings = hub_layer(train_examples_batch[:3])
sample_embeddings

<tf.Tensor: id=402, shape=(3, 20), dtype=float32, numpy=
array([[ 3.9819887 , -4.4838037 ,  5.177359  , -2.3643482 , -3.2938678 ,
        -3.5364532 , -2.4786978 ,  2.5525482 ,  6.688532  , -2.3076782 ,
        -1.9807833 ,  1.1315885 , -3.0339816 , -0.7604128 , -5.743445  ,
         3.4242578 ,  4.790099  , -4.03061   , -5.992149  , -1.7297493 ],
       [ 3.4232912 , -4.230874  ,  4.1488533 , -0.29553518, -6.802391  ,
        -2.5163853 , -4.4002395 ,  1.905792  ,  4.7512794 , -0.40538004,
        -4.3401685 ,  1.0361497 ,  0.9744097 ,  0.71507156, -6.2657013 ,
         0.16533905,  4.560262  , -1.3106939 , -3.1121316 , -2.1338716 ],
       [ 3.8508697 , -5.003031  ,  4.8700504 , -0.04324996, -5.893603  ,
        -5.2983093 , -4.004676  ,  4.1236343 ,  6.267754  ,  0.11632943,
        -3.5934832 ,  0.8023905 ,  0.56146765,  0.9192484 , -7.3066816 ,
         2.8202746 ,  6.2000837 , -3.5709393 , -4.564525  , -2.305622  ]],
      dtype=float32)>

In [8]:
# Assemble the complete model, one layer at a time.
model = tf.keras.Sequential()

# TensorFlow Hub layer.
# It uses a pre-trained saved model to map each sentence to an embedding vector.
# The pre-trained text embedding model used here
# (google/tf2-preview/gnews-swivel-20dim/1) splits the sentence into tokens,
# embeds each token and then combines the token embeddings.
# The resulting dimensions are (num_examples, embedding_dimension).
model.add(hub_layer)

# Hidden fully-connected layer with 16 units.
model.add(tf.keras.layers.Dense(16, activation='relu'))

# Single sigmoid output unit.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Configure training: binary cross-entropy loss (the model ends in a single
# sigmoid unit), the Adam optimizer, and accuracy as the reported metric.
model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer='adam',
    metrics=['accuracy'],
)

In [10]:
# Train the model.
# Training runs for 20 epochs in mini-batches of 512 examples; every epoch is
# one pass over the training dataset, which is shuffled through a
# 10000-element buffer before batching.
# During training, loss and accuracy are also monitored on the validation set
# (about 10,000 examples according to the original note).
train_batches = train_data.shuffle(10000).batch(512)
validation_batches = validation_data.batch(512)

history = model.fit(
    train_batches,
    validation_data=validation_batches,
    epochs=20,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
# Evaluate the trained model on the held-out test set, in batches of 512.
# `results` holds one value per entry of model.metrics_names.
test_batches = test_data.batch(512)
results = model.evaluate(test_batches, verbose=2)

49/49 - 2s - loss: 0.3297 - accuracy: 0.8566


In [13]:
# Print every evaluation metric next to its name, rounded to three decimals.
for name, value in zip(model.metrics_names, results):
    print("{}: {:.3f}".format(name, value))

loss: 0.330
accuracy: 0.857
