In [2]:
# Load the autotime extension so every cell reports its execution time
%load_ext autotime

# Render matplotlib figures inline in the notebook
%matplotlib inline
%config InlineBackend.figure_format='svg' # Use SVG (vector) output so figures render more sharply

time: 435 ms (started: 2021-08-27 17:57:15 +08:00)


In [3]:
%%bash

# Stage every notebook and Markdown file in the current directory
git add *.ipynb *.md

# Show the configured remotes (fetch/push URLs)
git remote -v

# Commit the staged changes
git commit -m '更新 #5 Aug 27, 2021'

# Push to the default upstream; the explicit remote/branch form below is kept disabled
#git push origin master
git push

origin	git@github.com:ustchope/keras_examples.git (fetch)
origin	git@github.com:ustchope/keras_examples.git (push)
[main fe233a4] 更新 #4 Aug 27, 2021
 3 files changed, 116 insertions(+), 107 deletions(-)
 create mode 100644 "\344\275\277\347\224\250 Transformer \350\277\233\350\241\214\346\226\207\346\234\254\345\210\206\347\261\273.ipynb"
 create mode 100644 "\346\234\252\345\221\275\345\220\215.ipynb"


To git@github.com:ustchope/keras_examples.git
   0d17081..fe233a4  main -> main


time: 3.98 s (started: 2021-08-27 17:57:37 +08:00)


In [4]:
# Select which GPU TensorFlow may use and how it allocates memory
import tensorflow as tf
from tensorflow import keras

gpus = tf.config.list_physical_devices("GPU")

if gpus:
    gpu0 = gpus[0]  # With several GPUs present, use only the first one
    try:
        # Allocate GPU memory on demand instead of reserving it all up front
        tf.config.experimental.set_memory_growth(gpu0, True)
        # Alternatively, cap the GPU memory at a fixed amount (e.g. 4 GB)
        #tf.config.experimental.set_virtual_device_configuration(gpu0,
        #    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
        tf.config.set_visible_devices([gpu0], "GPU")
    except RuntimeError as err:
        # Device settings can only be changed before TensorFlow initialises the
        # GPUs. Re-running this cell in a live kernel would otherwise raise, so
        # report the problem and keep the configuration already in effect.
        print(f"GPU configuration skipped: {err}")

time: 2.82 s (started: 2021-08-27 17:57:57 +08:00)


# 设置

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

time: 639 ms (started: 2021-08-27 17:59:01 +08:00)


# 将 Transformer 块实现为层

In [6]:
class TransformerBlock(layers.Layer):
    """Single Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each sub-layer is followed by dropout, a residual connection and
    layer normalisation.

    Args:
        embed_dim: Size of the feature dimension. Also passed as ``key_dim``
            to the attention layer and used as the output width of the
            feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer in the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...), forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last dimension matches ``embed_dim``
                (presumably shaped (batch, seq_len, embed_dim) - confirm
                against the caller).
            training: Forwarded to the dropout layers. ``None`` lets Keras
                decide based on the calling context.

        Returns:
            The layer-normalised output of the second residual connection.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

time: 2.39 ms (started: 2021-08-27 18:02:16 +08:00)


# 实现嵌入层
这里使用两个独立的嵌入层：一个用于词元（token）本身，另一个用于词元在序列中的位置索引。

In [7]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position embedding table
            holds.
        vocab_size: Number of distinct token ids the token embedding table
            holds.
        embed_dim: Output size of both embeddings.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...), forwarded to the base class.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the matching position embeddings.

        Args:
            x: Tensor of token ids; its last axis is treated as the sequence
                axis (presumably shaped (batch, seq_len) - confirm against
                the caller).

        Returns:
            Token embeddings plus position embeddings for positions
            ``0 .. seq_len - 1``.
        """
        # Use the runtime sequence length, which may differ from self.maxlen.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

time: 1.57 ms (started: 2021-08-27 18:03:26 +08:00)


# 下载并准备数据集

In [8]:
vocab_size = 20000  # Only consider the top 20k words
# Every review is forced to exactly this many tokens. pad_sequences is called
# with its defaults (padding="pre", truncating="pre"), so longer reviews keep
# their LAST 200 tokens and shorter ones are zero-padded at the front.
maxlen = 200

(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(
    num_words=vocab_size
)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

# Pad/truncate both splits in one pass; the source tuple is evaluated before
# the names are rebound, so each split is processed exactly once.
x_train, x_val = (
    keras.preprocessing.sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_val)
)

25000 Training sequences
25000 Validation sequences
time: 4.32 s (started: 2021-08-27 18:04:25 +08:00)


# 使用Transformer层创建分类器模型
Transformer 层会为输入序列的每个时间步输出一个向量。在这里，我们对所有时间步的输出取平均值，然后在其上使用前馈网络对文本进行分类。

In [9]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size of the feed-forward network inside the transformer

inputs = layers.Input(shape=(maxlen,))

# Token + position embedding, then one Transformer block
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classifier head: average over the sequence axis, then a small dense stack.
# The layers are created in the same order as when written out one per line.
for head_layer in (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
):
    x = head_layer(x)

# Two-way softmax over the output classes
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

time: 1.61 s (started: 2021-08-27 18:07:21 +08:00)


In [11]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
token_and_position_embedding (None, 200, 32)           646400    
_________________________________________________________________
transformer_block (Transform (None, 200, 32)           10656     
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                660       
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0     

# 训练和评估

In [10]:
# Adam optimiser, sparse categorical cross-entropy loss, accuracy as the metric
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for two epochs, evaluating on the validation split after each one
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_val, y_val),
)

Epoch 1/2
Epoch 2/2
time: 1min 12s (started: 2021-08-27 18:13:53 +08:00)
