In [1]:
import os
import sys
import random
import pickle
import numpy as np
from tqdm import tqdm
import tensorflow as tf 
from bert4keras.backend import K,keras,search_layer
from bert4keras.snippets import ViterbiDecoder,to_array

from data_load import *
from build_model import bert_bilstm_crf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Fix random seeds so runs are as repeatable as possible.
seed = 233
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)
# The variable name is PYTHONHASHSEED (the original spelling 'PYTHONHSHSEED' was a typo).
# NOTE: Python reads this variable at interpreter start-up, so setting it here is
# only inherited by child processes; export it before launching the kernel to
# affect hashing in this process.
os.environ['PYTHONHASHSEED'] = str(seed)

# Training hyper-parameters
epochs = 4
batch_size = 16
lstm_units = 128
# NOTE(review): the original comment said this was changed "0.1 -> 0.01", but the
# value in use is 0.1 — confirm which one is intended.
drop_rate = 0.1
learning_rate = 5e-5
max_len = 168

# Bookkeeping for the "fine training" metric:
#   fine_train_list[i]    - best number of matched items seen so far for sample i
#   train_predict_list[i] - the matched items themselves
# 8275 equals the number of samples evaluated per epoch in the recorded output.
fine_train_list = [0] * 8275
train_predict_list = []

# Pre-trained BERT weights
config_path = './bert_weight_file/uncased_L-4_H-768_A-12/bert_config.json'
checkpoint_path = './bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt'

# Where the fine-tuned model and CRF transition matrix are saved
model_save_path = './save_model/bert_bilstm_crf.weight'
CRF_save_path = './save_model/CRF.npy'

class NamedEntityRecognizer(ViterbiDecoder):
    """Named entity recognizer built on top of the Viterbi decoder.
    """
    def recognize(self, text):
        """Return a list of ``(entity_text, entity_label)`` tuples found in ``text``.

        Relies on the module-level ``tokenizer``, ``model``, ``max_len`` and
        ``id2label``.
        """
        tokens = tokenizer.tokenize(text)
        # Truncate from the tail, always keeping the final token in place.
        while len(tokens) > max_len:
            tokens.pop(-2)
        mapping = tokenizer.rematch(text, tokens)

        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])

        # Per-token scores for the single sequence in the batch
        # (the original note gives the shape as [seq_len, 23] — TODO confirm).
        nodes = model.predict([token_ids, segment_ids])[0]
        labels = self.decode(nodes)  # one label id per token

        spans = []
        inside = False
        for position, tag in enumerate(labels):
            if tag <= 0:
                # Label 0 closes any open entity.
                inside = False
                continue
            if tag % 2 == 1:
                # Odd ids open a new entity of type id2label[(tag - 1) // 2].
                inside = True
                spans.append([[position], id2label[(tag - 1) // 2]])
            elif inside:
                # Even ids extend the currently open entity.
                spans[-1][0].append(position)
            # An even id with no open entity is ignored (inside is already False).

        results = []
        for positions, label_name in spans:
            first_char = mapping[positions[0]][0]
            last_char = mapping[positions[-1]][-1]
            results.append((text[first_char:last_char + 1], label_name))
        return results
    
#相等应加set（）中源文本的数量    
def ner_metrics(data,fine_train_list):
    """Score ``NER`` on ``data`` using best-so-far match counts per sample.

    For every sample the predicted ``(text, label)`` tuples are intersected with
    the gold entities expanded to per-character ``(char, label)`` pairs. The
    largest intersection size ever seen for sample ``i`` is kept in
    ``fine_train_list[i]`` and the matching items in the module-level
    ``train_predict_list[i]``; both are updated in place.

    Note that the returned scores are computed from those best-so-far counts,
    not only from the current predictions.

    Args:
        data: iterable of samples; each sample is a sequence of
            ``[text, label]`` segments, with label ``'O'`` for non-entities.
        fine_train_list: list of best match counts, indexed by sample position.
            It is extended with zeros if ``data`` is longer than the list.

    Returns:
        ``(f1, precision, recall)`` as floats.
    """
    # Small epsilons avoid division by zero when nothing is predicted/labelled.
    X,Y,Z = 1e-6,1e-6,1e-6
    for count, d in enumerate(tqdm(data)):
        text = ''.join([seg[0] for seg in d])
        R = set(NER.recognize(text))
        # Expand every gold entity into (character, label) pairs so that it can
        # be intersected with R.
        T = set((ch, seg[1]) for seg in d if seg[1] != 'O' for ch in seg[0])
        hits = R & T

        # Make sure index `count` exists instead of relying on a hard-coded size.
        if count >= len(fine_train_list):
            fine_train_list.extend([0] * (count + 1 - len(fine_train_list)))

        if len(T) > 0:
            # Keep train_predict_list aligned with the sample index. Previously
            # entries were appended only for samples with entities, so position k
            # did not correspond to sample k and the list grew on every call.
            if count >= len(train_predict_list):
                train_predict_list.extend(
                    set() for _ in range(count + 1 - len(train_predict_list))
                )
            if len(hits) > fine_train_list[count]:
                train_predict_list[count] = hits
                fine_train_list[count] = len(hits)

        best = fine_train_list[count]
        X += best
        # The predicted count is never allowed to be below the credited matches.
        Y += max(len(R), best)
        Z += len(T)

    f1,precision,recall = 2 * X / (Y + Z),X / Y,X / Z
    return f1,precision,recall

class Evaluator(keras.callbacks.Callback):
    """Keras callback that scores the model on ``valid_data`` after each epoch.

    Saves the weights to ``model_save_path`` whenever F1 improves; otherwise
    divides the module-level ``learning_rate`` by 5.
    """
    def __init__(self):
        super(Evaluator, self).__init__()
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch,logs=None):
        """Refresh the decoder's transition matrix, evaluate, and checkpoint."""
        global learning_rate
        # The decoder needs the CRF transitions learned so far.
        NER.trans = K.eval(CRF.trans)
        f1, precision, recall = ner_metrics(valid_data,fine_train_list)
        if f1 > self.best_val_f1:
            model.save_weights(model_save_path)
            self.best_val_f1 = f1
            # Report the path actually written (the message previously printed
            # the pre-trained BERT checkpoint path).
            print('save model to {}'.format(model_save_path))
        else:
            # NOTE: this only changes the Python global; it affects training
            # only if something re-reads it (e.g. a LearningRateScheduler).
            learning_rate = learning_rate / 5
        print(
              'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
              (f1,precision,recall,self.best_val_f1)
        )
        
# def adversarial_training(model, embedding_name, epsilon=1):
#     """
#     给模型添加对抗训练
#     其中model是需要添加对抗训练的keras模型
#     """
#     if model.train_function is None:  # 如果还没有训练函数
#         model._make_train_function()  # 手动make
#     old_train_function = model.train_function  # 备份旧的训练函数

#     # 查找Embedding层
#     for output in model.outputs:
#         embedding_layer = search_layer(output, embedding_name)
#         if embedding_layer is not None:
#             break
#     if embedding_layer is None:
#         raise Exception('Embedding layer not found')

#     # 求Embedding梯度
#     embeddings = embedding_layer.embeddings  # Embedding矩阵
#     gradients = K.gradients(model.total_loss, [embeddings])  # Embedding梯度
#     gradients = K.zeros_like(embeddings) + gradients[0]  # 转为dense tensor

#     # 封装为函数
#     inputs = (
#         model._feed_inputs + model._feed_targets + model._feed_sample_weights
#     )  # 所有输入层
#     embedding_gradients = K.function(
#         inputs=inputs,
#         outputs=[gradients],
#         name='embedding_gradients',
#     )  # 封装为函数

#     def train_function(inputs):
#         # 重新定义训练函数
#         grads = embedding_gradients(inputs)[0]  # Embedding梯度
#         delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8)  # 计算扰动
#         K.set_value(embeddings, K.eval(embeddings) + delta)  # 注入扰动
#         outputs = old_train_function(inputs)  # 梯度下降
#         K.set_value(embeddings, K.eval(embeddings) - delta)  # 删除扰动
#         return outputs
#     model.train_function = train_function  # 覆盖原训练函数        



# Build the BERT + BiLSTM + CRF network; the CRF layer is returned separately
# so its transition matrix can be read for decoding.
model, CRF = bert_bilstm_crf(
    config_path,
    checkpoint_path,
    num_labels,
    lstm_units,
    drop_rate,
    learning_rate,
)
# adversarial_training(model,'Embedding-Token',0.5)
# Decoder initialised with the current CRF transitions.
NER = NamedEntityRecognizer(
    trans=K.eval(CRF.trans),
    starts=[0],
    ends=[0],
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [3]:
def _drop_o_only_and_oversample(samples):
    """Drop samples that are a single 'O' segment and oversample what follows.

    Each run of dropped samples is compensated by appending that many extra
    copies of the next kept sample to the end of the result. A trailing run
    with no following sample is simply dropped.

    This is a single pass over the input. The previous in-place version deleted
    and appended on the list it was indexing, which could skip the final
    element and re-duplicate an already appended copy.
    """
    kept, extras = [], []
    pending = 0  # number of O-only samples dropped since the last kept sample
    for sample in samples:
        if len(sample) == 1 and sample[0][1] == 'O':
            pending += 1
        else:
            extras.extend([sample] * pending)
            kept.append(sample)
            pending = 0
    return kept + extras


if __name__ == "__main__":
    # NOTE(review): the second argument (128) is presumably a maximum length;
    # confirm against data_load.load_data.
    train_data,_ = load_data('./data/train/train.txt',128)
    valid_data,_ = load_data('./data/test/test.txt',128)

    train_data = _drop_o_only_and_oversample(train_data)

    train_generator = data_generator(train_data, batch_size)
    valid_generator = data_generator(valid_data, batch_size*5)

    evaluator = Evaluator()

    def scheduler(epoch):
        """Learning rate for ``epoch``: base rate for epochs 0-1, then base / (2 * (epoch - 1))."""
        # Reads the module-level learning_rate, which Evaluator may lower.
        return learning_rate/(max(2*(epoch-1),1))

    lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler)

    model.fit(
        train_generator.forfit(),
        steps_per_epoch = len(train_generator),
        validation_data = valid_generator.forfit(),
        validation_steps = len(valid_generator),
        epochs = epochs,
        callbacks = [evaluator,lr_scheduler]
    )

    print(K.eval(CRF.trans))
    print(K.eval(CRF.trans).shape)
    # NOTE(review): this stores the last-epoch weights at the same path the
    # Evaluator uses for its best checkpoint — confirm that is intended.
    model.save_weights(model_save_path)
    np.save(CRF_save_path, K.eval(CRF.trans))

else:
    # Imported as a module: reuse previously trained weights.
    model.load_weights(model_save_path)

Epoch 1/4


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:30<00:00, 39.40it/s]


save model to ./bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt
valid: f1: 0.53504, precision: 0.56155, recall: 0.51091, best f1: 0.53504

Epoch 2/4


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:15<00:00, 42.32it/s]


valid: f1: 0.53315, precision: 0.51467, recall: 0.55300, best f1: 0.53504

Epoch 3/4


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:18<00:00, 41.73it/s]


save model to ./bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt
valid: f1: 0.55507, precision: 0.55009, recall: 0.56014, best f1: 0.55507

Epoch 4/4


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:19<00:00, 41.57it/s]


save model to ./bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt
valid: f1: 0.55734, precision: 0.55372, recall: 0.56099, best f1: 0.55734

[[ 0.4287106  -0.86728936  0.1629885   0.22899823]
 [-0.33825907 -0.543957   -0.55285895 -0.7845749 ]
 [ 0.17082854 -0.9606606  -0.27878678 -0.5104652 ]
 [-0.8264545  -0.555815    0.37197393  0.81980103]]
(4, 4)


In [14]:
if __name__ == "__main__":
    # Reload the raw splits (without the rebalancing applied during training)
    # and rebuild the batch generators.
    train_data, _ = load_data('./data/train/train.txt', 128)
    valid_data, _ = load_data('./data/test/test.txt', 128)

    train_generator = data_generator(train_data, batch_size)
    valid_generator = data_generator(valid_data, batch_size * 5)
    

In [15]:
# Peek at the first ten training samples.
print(train_data[:10])
# print(train_predict_list[:10])

[[['医生：你好我是您的接诊医生', 'O']], [['医生：宝贝最近吃奶量可以吗？下降了吗', 'O']], [['患者：没有，也没怎么', 'O'], ['哭闹', 'Symptom-0']], [['医生：宝妈有没有吃生冷辛辣刺激食物油腻食物来吗？', 'O']], [['医生：宝贝奶粉的话最近换过牌子吗？', 'O']], [['医生：宝贝肚子着凉来吗？', 'O']], [['患者：喝茶油腻也少，菜吃很多', 'O']], [['医生：嗯嗯，宝妈饮食一定注意，生冷辛辣刺激食物不能吃油腻食物不能吃，特别油腻食物的奥，清淡饮食为主，这个时候宝贝胃肠功能可能会有影响，能吃多少吃多少别强喂的奥！', 'O']], [['医生：宝贝最近有没有', 'O'], ['呕吐', 'Symptom-1'], ['症状呢？', 'O']], [['患者：', 'O'], ['呕吐', 'Symptom-1'], ['，有时会', 'O'], ['吐', 'Symptom-1'], ['，不多', 'O']]]


In [4]:
import numpy as np

# Persist the fine-training bookkeeping so it can be restored after a kernel restart.
fine = np.array(fine_train_list)
tpl = np.array(train_predict_list)

np.save('./fine_train_list.npy', fine)
np.save('./train_predict_list.npy', tpl)

In [5]:
# Write the current model weights to model_save_path.
model.save_weights(model_save_path)

In [3]:
# Restore the fine-training bookkeeping saved earlier.
fine = np.load('./fine_train_list.npy')
fine_train_list = fine.tolist()

# allow_pickle=True is passed for this file; only load files you created yourself,
# since unpickling untrusted data can execute code.
tpl = np.load('./train_predict_list.npy', allow_pickle=True)
train_predict_list = tpl.tolist()

In [4]:
# Restore the model weights previously written to model_save_path.
model.load_weights(model_save_path)

In [7]:
if __name__ == "__main__":
    # Second training pass: three more epochs on the un-rebalanced data,
    # with no learning-rate scheduler attached.
    epochs = 3

    train_data, _ = load_data('./data/train/train.txt', 128)
    valid_data, _ = load_data('./data/test/test.txt', 128)

    train_generator = data_generator(train_data, batch_size)
    valid_generator = data_generator(valid_data, batch_size * 5)

    # NOTE(review): `checkpoint` is constructed but is not included in the
    # `callbacks` list passed to fit() below.
    checkpoint = keras.callbacks.ModelCheckpoint(
        model_save_path,
        monitor='val_sparse_accuracy',
        verbose=1,
        save_best_only=True,
        mode='max',
    )
    evaluator = Evaluator()

    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        validation_data=valid_generator.forfit(),
        validation_steps=len(valid_generator),
        epochs=epochs,
        callbacks=[evaluator],
    )

    # Show the learned CRF transition matrix and its shape, then persist
    # the final weights and transitions.
    print(K.eval(CRF.trans))
    print(K.eval(CRF.trans).shape)
    model.save_weights(model_save_path)
    np.save(CRF_save_path, K.eval(CRF.trans))

else:
    # Imported as a module: reuse previously trained weights.
    model.load_weights(model_save_path)

Epoch 1/3


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:33<00:00, 38.69it/s]


save model to ./bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt
valid: f1: 0.58730, precision: 0.60555, recall: 0.57012, best f1: 0.58730

Epoch 2/3


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:37<00:00, 38.05it/s]


save model to ./bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt
valid: f1: 0.59419, precision: 0.61280, recall: 0.57669, best f1: 0.59419

Epoch 3/3


100%|██████████████████████████████████████████████████████████████████████████████| 8275/8275 [03:35<00:00, 38.45it/s]


save model to ./bert_weight_file/uncased_L-4_H-768_A-12/bert_model.ckpt
valid: f1: 0.59558, precision: 0.60751, recall: 0.58411, best f1: 0.59558

[[ 0.20985383 -1.045513   -0.256569   -0.10786816]
 [-0.5771153  -0.427747   -0.9554797  -1.1635962 ]
 [-0.2451467  -1.3626031  -0.6572353  -0.9109553 ]
 [-1.064843   -0.96952844 -0.03134646  0.8583275 ]]
(4, 4)
