In [1]:
# Chapter 11 / Load the tokenizer
from transformers import AutoTokenizer

# Tokenizer loaded by name from 'hfl/rbt3'; later cells use this global.
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')

# Bare expression: shows the tokenizer repr as the cell output.
tokenizer

PreTrainedTokenizerFast(name_or_path='hfl/rbt3', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [2]:
# Chapter 11 / Encoding test
# Two sentences that are already split into one-character "words".
sentences = [
    [
        '海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间',
        '的', '海', '域', '。'
    ],
    [
        '这', '座', '依', '山', '傍', '水', '的', '博', '物', '馆', '由', '国', '内', '一',
        '流', '的', '设', '计', '师', '主', '持', '设', '计', '。'
    ],
]

# Encode both sentences as one batch, truncated/padded to at most 20 ids.
out = tokenizer.batch_encode_plus(sentences,
                                  is_split_into_words=True,
                                  max_length=20,
                                  truncation=True,
                                  padding=True,
                                  return_tensors='tf')

# Turn each row of ids back into text.
for row in (0, 1):
    print(tokenizer.decode(out['input_ids'][row]))

# Dump every tensor returned by the tokenizer.
for key, value in out.items():
    print(key, value)

[CLS] 海 钓 比 赛 地 点 在 厦 门 与 金 门 之 间 的 海 域 。 [SEP]
[CLS] 这 座 依 山 傍 水 的 博 物 馆 由 国 内 一 流 的 设 计 [SEP]
input_ids tf.Tensor(
[[ 101 3862 7157 3683 6612 1765 4157 1762 1336 7305  680 7032 7305  722
  7313 4638 3862 1818  511  102]
 [ 101 6821 2429  898 2255  988 3717 4638 1300 4289 7667 4507 1744 1079
   671 3837 4638 6392 6369  102]], shape=(2, 20), dtype=int32)
token_type_ids tf.Tensor(
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(2, 20), dtype=int32)
attention_mask tf.Tensor(
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]], shape=(2, 20), dtype=int32)


In [3]:
#第11章/获取数据集
from datasets import load_dataset, load_from_disk


def get_dataset(split, seed=None):
    """Load one split of the locally saved peoples_daily_ner dataset, shuffled.

    Args:
        split: key of the split to pick from the dataset saved on disk
            (this notebook uses 'train' and 'test').
        seed: optional seed forwarded to ``Dataset.shuffle``. The default
            ``None`` keeps the original unseeded shuffle; pass an integer to
            get the same row order on every run.

    Returns:
        The selected split after shuffling.
    """
    # Online alternative (kept for reference):
    # dataset = load_dataset(path='peoples_daily_ner', split=split)

    # Offline: read the copy stored under ./data and pick the split.
    dataset = load_from_disk(dataset_path='./data/peoples_daily_ner')[split]

    # Reference values recorded by the original author:
    # dataset.features['ner_tags'].feature.num_classes
    # 7
    # dataset.features['ner_tags'].feature.names
    # ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

    # Shuffle the row order; reproducible only when a seed is supplied.
    return dataset.shuffle(seed=seed)


# Load the training split into the global `dataset` used by later cells.
dataset = get_dataset('train')

# Bare expression: shows the dataset summary as the cell output.
dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 20865
})

In [4]:
#第11章/定义数据加载函数
import tensorflow as tf


def get_batch_data(dataset, idx, batch_size, max_length=512):
    """Encode one fixed-size batch and build label rows of matching length.

    Args:
        dataset: object exposing ``num_rows`` and slice indexing that yields
            a dict with 'tokens' and 'ner_tags' lists.
        idx: zero-based batch index.
        batch_size: number of rows per batch.
        max_length: maximum encoded length passed to the tokenizer
            (defaults to the original hard-coded 512).

    Returns:
        ``(inputs, labels)`` where ``inputs`` is the tokenizer output with
        TensorFlow tensors and ``labels`` is an int32 tensor with the same
        second dimension as ``inputs['input_ids']``. Returns ``(None, None)``
        when the requested batch would run past the end of the dataset, so a
        trailing partial batch is never returned.
    """
    idx_from = idx * batch_size
    idx_to = idx_from + batch_size

    if idx_to > dataset.num_rows:
        return None, None

    data = dataset[idx_from:idx_to]

    # Encode the pre-split sentences; uses the notebook-level `tokenizer`.
    inputs = tokenizer.batch_encode_plus(data['tokens'],
                                         truncation=True,
                                         padding=True,
                                         return_tensors='tf',
                                         max_length=max_length,
                                         is_split_into_words=True)

    # Padded sequence length of this batch.
    lens = inputs['input_ids'].shape[1]

    # Frame every tag list with label 7: one at the head, then 7s up to
    # `lens`. Tags are clipped to `lens - 2` first so that, when a sentence
    # was truncated by the tokenizer, the last position still carries 7
    # instead of a real tag.
    # NOTE(review): this assumes each input token maps to exactly one id,
    # as the original code did - confirm for tokens outside the vocabulary.
    labels = []
    for tags in data['ner_tags']:
        tags = tags[:lens - 2]
        labels.append([7] + tags + [7] * (lens - 1 - len(tags)))

    labels = tf.constant(labels, dtype=tf.int32)

    return inputs, labels

In [5]:
# Chapter 11 / Inspect a sample batch
# First batch of 16 rows; `inputs` and `labels` stay global and are reused
# by later cells.
inputs, labels = get_batch_data(dataset, 0, 16)

# Print the shape of every encoded tensor.
for k, v in inputs.items():
    print(k, v.shape)

print('labels', labels.shape)

input_ids (16, 112)
token_type_ids (16, 112)
attention_mask (16, 112)
labels (16, 112)


In [6]:
# Chapter 11 / Load the pretrained model
from transformers import TFAutoModel

# Global backbone; the Model class defined later refers to this name.
pretrained = TFAutoModel.from_pretrained('hfl/rbt3')

# Print the model summary.
pretrained.summary()

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at hfl/rbt3.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  38476800  
Total params: 38,476,800
Trainable params: 38,476,800
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Chapter 11 / Trial forward pass through the backbone
# [b, lens] -> [b, lens, 768]
pretrained(**inputs).last_hidden_state.shape

TensorShape([16, 112, 768])

In [8]:
#第11章/定义下游模型
class Model(tf.keras.Model):
    """Downstream tagger: backbone features -> GRU -> per-token softmax.

    The backbone is the notebook-level ``pretrained`` model. In tuning mode
    it is also stored on the instance as ``self.pretrained``; otherwise that
    attribute is ``None`` and the global is called directly.
    """

    def __init__(self):
        super().__init__()

        # Tuning-mode flag; starts switched off.
        self.tuning = False
        # Holds the backbone only while tuning mode is on.
        self.pretrained = None

        # Layers owned by this model.
        self.rnn = tf.keras.layers.GRU(units=768, return_sequences=True)
        self.fc = tf.keras.layers.Dense(units=8, activation=tf.nn.softmax)

    def call(self, inputs):
        """Map tokenizer output to per-token class probabilities.

        Args:
            inputs: mapping unpacked as keyword arguments into the backbone.

        Returns:
            Output of the softmax Dense layer (8 units) applied to the GRU
            sequence output.
        """
        # Pick the instance-held backbone when tuning, else the global one.
        backbone = self.pretrained if self.tuning else pretrained
        features = backbone(**inputs).last_hidden_state

        # Sequence features -> GRU -> classifier.
        hidden = self.rnn(features)
        return self.fc(hidden)

    def fine_tuning(self, tuning):
        """Switch tuning mode on or off.

        Args:
            tuning: truthy to store the global backbone on this model,
                falsy to clear it again.
        """
        self.tuning = tuning
        self.pretrained = pretrained if tuning else None


# Global downstream model instance used by the training/evaluation cells.
model = Model()

# Forward pass on the sample batch to check the output shape.
model(inputs).shape

TensorShape([16, 112, 8])

In [9]:
#第11章/对计算结果和label变形,并且移除pad
def reshape_and_remove_pad(outs, labels, attention_mask):
    """Flatten predictions and labels, keeping only unmasked positions.

    Args:
        outs: per-position scores whose last dimension is the class count.
        labels: per-position labels with the same leading layout as ``outs``.
        attention_mask: tensor with the same layout as ``labels``; positions
            equal to 1 are kept, everything else is dropped.

    Returns:
        ``(outs, labels)`` flattened to ``[kept, classes]`` and ``[kept]``.
    """
    # Flatten for the loss computation. The class count is read from the
    # input instead of being fixed at 8.
    # [b, lens, c] -> [b*lens, c]
    # [b, lens] -> [b*lens]
    outs = tf.reshape(outs, [-1, outs.shape[-1]])
    labels = tf.reshape(labels, [-1])

    # Drop every position whose mask value is not 1.
    # [b, lens] -> [b*lens - pad]
    keep = tf.reshape(attention_mask, [-1]) == 1
    outs = tf.boolean_mask(outs, keep)
    labels = tf.boolean_mask(labels, keep)

    return outs, labels


# Smoke test: random scores, all-one labels and an all-one mask, so no
# position is removed.
reshape_and_remove_pad(tf.random.normal([2, 3, 8]), tf.ones([2, 3]),
                       tf.ones([2, 3]))

(<tf.Tensor: shape=(6, 8), dtype=float32, numpy=
 array([[ 1.3714149e+00, -1.5095697e-04, -7.1241909e-01, -8.6933529e-01,
         -5.9476590e-01, -8.1611431e-01, -2.4342534e-01,  1.1834431e+00],
        [-6.7946464e-01, -8.9938432e-01,  1.8360443e-01,  1.0472615e+00,
         -6.2940359e-01, -2.0146336e-01,  6.4231116e-01, -3.0778365e-02],
        [ 9.2144525e-01, -3.3907911e-01, -9.2427254e-01,  1.5306507e+00,
         -9.7463369e-01,  1.0550396e+00,  7.0500964e-01,  7.7894813e-01],
        [ 4.3940777e-01,  4.8754442e-01, -5.4782212e-01,  7.3784602e-01,
          5.3179342e-01,  5.2382642e-01, -9.4239825e-01,  5.2462798e-01],
        [-6.3888496e-01, -7.4003971e-01,  8.7833917e-01,  9.1179144e-01,
          9.7684518e-02, -1.0721865e+00,  1.2910798e+00, -4.8718882e-01],
        [ 4.3508747e-01, -1.5385412e+00,  6.7557418e-01, -1.6573043e+00,
         -1.3294674e+00,  4.0037945e-01, -5.7826185e-01, -8.4279853e-01]],
       dtype=float32)>,
 <tf.Tensor: shape=(6,), dtype=float32, nump

In [10]:
#第11章/获取正确数量和总数
def get_correct_and_total_count(outs, labels):
    """Count correct predictions overall and on positions labelled non-zero.

    Args:
        outs: ``[n, classes]`` scores; the prediction is the argmax per row.
        labels: ``[n]`` int32 labels.

    Returns:
        ``(correct, total, correct_content, total_content)`` as Python ints.
        The first pair covers every position, the second pair only positions
        whose label is not 0 (label 0 dominates, so including it inflates
        the accuracy).
    """
    # [n, classes] -> [n]
    predicted = tf.argmax(outs, axis=1, output_type=tf.int32)

    # Per-position hit mask, reused for both statistics.
    hits = predicted == labels

    correct = int(tf.reduce_sum(tf.cast(hits, dtype=tf.int32)))
    total = len(labels)

    # Restrict to positions whose label differs from 0.
    # NOTE(review): callers in this notebook pass labels where special-token
    # positions carry 7, so those appear to be counted as content too -
    # confirm that is intended.
    is_content = labels != 0
    content_hits = tf.boolean_mask(hits, is_content)

    correct_content = int(tf.reduce_sum(tf.cast(content_hits, dtype=tf.int32)))
    total_content = int(tf.reduce_sum(tf.cast(is_content, dtype=tf.int32)))

    return correct, total, correct_content, total_content


# Smoke test: random scores against labels that are all 1, so the overall
# and the non-zero counts coincide.
get_correct_and_total_count(tf.random.normal([16, 8]),
                            tf.ones([16], dtype=tf.int32))

(2, 16, 2, 16)

In [11]:
#第11章/训练
from transformers import create_optimizer


def train(epochs, batch_size=16):
    """Train the notebook-level ``model`` on the notebook-level ``dataset``.

    Args:
        epochs: number of passes over the dataset.
        batch_size: rows per step (defaults to the original 16). The same
            value sizes the batches and the learning-rate schedule, so the
            two cannot drift apart.

    Side effects:
        Prints progress every 50 steps and saves the model weights to
        'model/tf_parameters/中文命名实体识别' after every epoch.
    """
    # Optimizer plus the learning-rate schedule it was built with. The
    # optimizer steps the schedule through its own iteration counter, so
    # no manual decay call is needed inside the loop.
    optimizer, schedule = create_optimizer(
        # Smaller learning rate while the backbone is being tuned.
        init_lr=2e-5 if model.tuning else 5e-4,
        num_warmup_steps=0,
        # Total number of optimisation steps over all epochs.
        num_train_steps=(dataset.num_rows // batch_size) * epochs)

    for epoch in range(epochs):
        i = 0
        while True:
            # Fetch one batch; (None, None) marks the end of the epoch.
            inputs, labels = get_batch_data(dataset, i, batch_size)
            if inputs is None:
                break

            # Record operations for the gradient computation.
            with tf.GradientTape() as tape:
                # [b, lens] -> [b, lens, classes]
                outs = model(inputs)

                # Flatten and drop padded positions.
                # outs -> [b, lens, classes] -> [c, classes]
                # labels -> [b, lens] -> [c]
                outs, labels = reshape_and_remove_pad(outs, labels,
                                                      inputs['attention_mask'])

                # The model already outputs probabilities (softmax), hence
                # from_logits=False. One-hot depth follows the model output.
                loss = tf.losses.categorical_crossentropy(
                    y_true=tf.one_hot(labels, depth=outs.shape[-1]),
                    y_pred=outs,
                    from_logits=False,
                    axis=1,
                )
                loss = tf.reduce_mean(loss)

            # Gradients of the loss w.r.t. the trainable variables.
            grads = tape.gradient(loss, model.trainable_variables)

            # Apply the update, skipping variables that received no gradient.
            optimizer.apply_gradients(
                (grad, var)
                for (grad, var) in zip(grads, model.trainable_variables)
                if grad is not None)

            if i % 50 == 0:
                counts = get_correct_and_total_count(outs, labels)
                accuracy = counts[0] / counts[1]
                accuracy_content = counts[2] / counts[3]
                # Learning rate at the optimizer's current step.
                lr = float(schedule(optimizer.iterations))

                print(epoch, i, float(loss), lr, accuracy, accuracy_content)

            i += 1

        # Save the model weights at the end of every epoch.
        model.save_weights('model/tf_parameters/中文命名实体识别')

In [12]:
# Chapter 11 / Two-stage training, step 1: train the downstream model only
model.fine_tuning(False)
# Number of trainable parameters, in units of 10,000.
print(sum([int(tf.size(i)) for i in model.trainable_variables]) / 10000)
# The training call is commented out in this copy of the notebook:
#train(1)

354.9704


In [13]:
# Chapter 11 / Two-stage training, step 2: train the downstream model and the
# pretrained model together
model.fine_tuning(True)
# Number of trainable parameters, in units of 10,000.
print(sum([int(tf.size(i)) for i in model.trainable_variables]) / 10000)
# The training call is commented out in this copy of the notebook:
#train(2)

4202.6504


In [14]:
#第11章/测试
def test(num_batches=5, batch_size=128):
    """Evaluate the saved weights on a few batches of the test split.

    Args:
        num_batches: maximum number of batches to evaluate (originally a
            hard-coded 5).
        batch_size: rows per batch (originally a hard-coded 128).

    Side effects:
        Loads weights from 'model/tf_parameters/中文命名实体识别', prints the
        index of every evaluated batch, then prints the overall accuracy and
        the accuracy on positions whose label is not 0.
    """
    # Restore the trained parameters.
    model.load_weights('model/tf_parameters/中文命名实体识别')

    # Test split.
    dataset_test = get_dataset('test')

    correct = 0
    total = 0

    correct_content = 0
    total_content = 0

    for i in range(num_batches):
        inputs, labels = get_batch_data(dataset_test, i, batch_size)
        # get_batch_data returns (None, None) once no full batch is left;
        # stop instead of feeding None to the model.
        if inputs is None:
            break

        print(i)

        # [b, lens] -> [b, lens, classes]
        outs = model(inputs)

        # Flatten and drop padded positions.
        # outs -> [b, lens, classes] -> [c, classes]
        # labels -> [b, lens] -> [c]
        outs, labels = reshape_and_remove_pad(outs, labels,
                                              inputs['attention_mask'])

        # Accumulate the counts.
        counts = get_correct_and_total_count(outs, labels)
        correct += counts[0]
        total += counts[1]
        correct_content += counts[2]
        total_content += counts[3]

    # Nothing was evaluated: report it instead of dividing by zero.
    if total == 0:
        print('no full batch available for evaluation')
        return

    content_accuracy = (correct_content /
                        total_content if total_content else float('nan'))
    print(correct / total, content_accuracy)


# Run the evaluation; it loads the weights file written by train().
test()

0
1
2
3
4
0.9889047294365586 0.9555555555555556


In [15]:
#第11章/预测
def predict(batch_size=32):
    """Print sentences from the test split with reference and predicted tags.

    Args:
        batch_size: number of sentences to show (originally a hard-coded 32,
            repeated for the fetch and for the loop).

    Side effects:
        Loads weights from 'model/tf_parameters/中文命名实体识别' and prints,
        for every sentence, the decoded text, the reference tag line and the
        predicted tag line. In a tag line, positions tagged 0 are shown as
        '·' and every other position as the decoded token followed by its
        tag number.
    """
    # Restore the trained parameters.
    model.load_weights('model/tf_parameters/中文命名实体识别')

    # Test split.
    dataset_test = get_dataset('test')

    # Take the first batch.
    inputs, labels = get_batch_data(dataset_test, 0, batch_size)
    # get_batch_data returns (None, None) when the split is smaller than one
    # batch; stop instead of feeding None to the model.
    if inputs is None:
        print('no full batch available for prediction')
        return

    # [b, lens] -> [b, lens, classes] -> [b, lens]
    outs = model(inputs)
    outs = tf.argmax(outs, axis=2, output_type=tf.int32)

    for i in range(batch_size):
        # Keep only the non-padded positions of sentence i.
        select = inputs['attention_mask'][i] == 1

        input_id = tf.boolean_mask(inputs['input_ids'][i], axis=0, mask=select)
        out = tf.boolean_mask(outs[i], axis=0, mask=select)
        label = tf.boolean_mask(labels[i], axis=0, mask=select)

        # Original sentence, with the spaces added by decode() removed.
        print(tokenizer.decode(input_id).replace(' ', ''))

        # Plain Python ints for the per-token loop below.
        token_ids = input_id.numpy().tolist()

        # Reference tags first, then predicted tags.
        for tag in [label, out]:
            pieces = []
            for token_id, tag_value in zip(token_ids, tag.numpy().tolist()):
                if tag_value == 0:
                    pieces.append('·')
                else:
                    pieces.append(tokenizer.decode(token_id) + str(tag_value))

            print(''.join(pieces))
        print('==========================')


# Show predictions; this loads the weights file written by train().
predict()

[CLS]当这一切辛勤劳作的成果不被承认或竟被剥夺，他便用笔控诉这种剥夺，于是有了他诗中的那种抗争的激情。[SEP]
[CLS]7················································[SEP]7
[CLS]7················································[SEP]7
[CLS]本报讯6月20日，红双喜中国乒乓球俱乐部甲级联赛大战7场，掀起一个小高潮。[SEP]
[CLS]7·········红3双4喜4中5国6·······················[SEP]7
[CLS]7·········红3双4·中5国4·······················[SEP]7
[CLS]粉煤灰用于煤气生产密封效果好本报讯黑龙江省牡丹江市煤气公司把粉煤灰和粘土按一定比例配制成焦炉密封材料，在国内同行业中开了先河。[SEP]
[CLS]7·················黑5龙6江6省6牡3丹4江4市4煤4气4公4司4··································[SEP]7
[CLS]7·················黑5龙6江6省6牡3丹4江4市4煤4气4公4司4··································[SEP]7
[CLS]求同存异：抗日战争时期，在同国民党的统一战线中，国民党许多不利于团结和动员人民抗战的错误政策，只要不是根本危及团结抗战的急迫问题，可以暂时求同存异，加以等待。[SEP]
[CLS]7······日5·······国3民4党4·······国3民4党4····················································[SEP]7
[CLS]7······日5·······国3民4党4·······国3民4党4····················································[SEP]7
[CLS]马克思主义的历史观并不是由什么人发明出来，而后从外部强加给历史的僵化的原则；它本身正是从无数的历史现象中抽象出来的对于历史发展的规律性的认识。[SEP]
[CLS]7马1克2思2······························