In [1]:
#! -*- coding: utf-8 -*-
import json
from tqdm import tqdm
import os, re
import numpy as np
import pandas as pd
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import codecs

from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam

#BERT的相关参数
mode = 0
maxlen = 300
learning_rate = 5e-5
min_learning_rate = 1e-6

config_path = '/home/notebook/data/group/otext/bert_model/bert_config.json'
checkpoint_path = '/home/notebook/data/group/otext/bert_model/bert_model.ckpt'
dict_path = '/home/notebook/data/group/otext/bert_model/vocab.txt'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
token_dict = {}

# 加载词表
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    # 定制化分词器，这里不论中文还是英文都根据单个字符进行切分
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]') # space类用未经训练的[unused1]表示
            else:
                R.append('[UNK]') # 剩余的字符是[UNK]
        return R

# 构造分词器实例
tokenizer = OurTokenizer(token_dict)

def seq_padding(X, padding=0):
    # 填充补0
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])    

def list_find(list1, list2):
    # 在list1中查找子串list2，如果找到返回初始的下标，否则返回-1
    n_list2 = len(list2)
    for i in range(len(list1)):
        if list1[i: i+n_list2] == list2:
            return i
    return -1

In [23]:
# 获取训练集
#训练集字段介绍
#id代表唯一数据标识
#title和text是用于识别的文本，可能为空
#unknownEntities代表实体，可能有多个，通过英文";"分隔
train_data = pd.read_csv('./Train_Data.csv').fillna('>>>>>')
train_data = train_data[~train_data['unknownEntities'].isnull()].reset_index(drop = True)
train_data.head(3)

Unnamed: 0,id,title,text,unknownEntities
0,83dcefb7,揭秘趣步骗局，趣步是什么，趣步是怎么赚钱的？趣步公司可靠吗？趣步合法吗？相信是众多小伙伴最关...,揭秘趣步骗局，趣步是什么，趣步是怎么赚钱的？趣步公司可靠吗？趣步合法吗？相信是众多小伙伴最关...,趣步
1,1ad5be0d,企业纳税贷额度，全国小微企业都可做！,{IMG:1}{IMG:2}公司张总说：“没想到缴税还能办贷款，本来我们还在为准备纳税证明、...,西部助贷
2,6dd28e9b,一线|新控股股东入主后联讯证券拟改名为知识城证券,腾讯新闻《》作者刘鹏成功接下47.24%股权入主具有30年历史的联讯证券后，广州开发区金融控...,广州开发区金融控股集团有限公司


In [25]:
# 将title和text合并成content字段，将模型转化成单输入问题
# 如果title和text字段相等那么就合并，否则返回其中一个就行了
train_data['content'] = train_data.apply(lambda x: x['title'] if x['title']==x['text'] else x['title']+x['text'], axis = 1)

# 对于unknownEntities字段中存在多个实体的只使用第一个实体
train_data['unknownEntity'] = train_data['unknownEntities'].apply(lambda x:x.split(';')[0])

# 获取所有的实体类别
# 这里先将unknownEntities进行拼接，然后根据";"切分
entity_str = ''
for i in train_data['unknownEntities'].unique():
    entity_str = i + ';' + entity_str  
    
entity_classes_full = set(entity_str[:-1].split(";"))
# 3183
len(entity_classes_full)

# 训练集变成了两个字段：
# 需要识别的文本content，这是原始数据集中title和text合并之后的数据
# 未知实体列表unknownEntities，类似于label，只会有一个实体
train_data_list = []
for content,entity in zip(train_data['content'], train_data['unknownEntity']):
    train_data_list.append((content, entity))
    
# 根据9:1划分训练集和验证集    
random_order = np.arange(len(train_data_list))
train_list = [train_data_list[j] for i, j in enumerate(random_order) if i % 9 != mode]
dev_list = [train_data_list[j] for i, j in enumerate(random_order) if i % 9 == mode]
print(len(train_list), len(dev_list))

4456 557


In [7]:
# 准备测试集数据
test_data = pd.read_csv('./Test_Data.csv').fillna('>>>>>')
test_data['content'] = test_data.apply(lambda x: x['title'] if x['title']==x['text'] else x['title']+x['text'], axis = 1)


# 测试集变成了两个字段：
# 控制数据唯一性的id
# 需要识别的文本content，这是原始数据集中title和text合并之后的数据
test_data_list = []
for id,content in zip(test_data['id'], test_data['content']):
    test_data_list.append((id, content))

In [8]:
# 找到训练集中content字段中文、英文和数字以外的特殊字符
additional_chars = set()
for data in train_data_list:
    additional_chars.update(re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', data[1]))
additional_chars    

{' ',
 '&',
 '(',
 ')',
 '+',
 '-',
 '.',
 '/',
 '>',
 '?',
 '·',
 '“',
 '”',
 '（',
 '）',
 '：'}

In [9]:
class train_data_generator:
    """
    训练集数据生成器
    """
    def __init__(self, train_list, batch_size=32):
        self.train_list = train_list
        self.batch_size = batch_size
        self.steps = len(self.train_list) // self.batch_size
        if len(self.train_list) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            # 返回训练数据集中的索引列表
            idxs = np.arange(len(self.train_list))
            np.random.shuffle(idxs)
            X1, X2, S1, S2 = [], [], [], []
            for i in idxs:
                train = self.train_list[i]
                # 这里对于超长的文本只会取前510个字符
                # 业界还有一种取头和取尾的方法，思想主要是一篇文章中头部和尾部的内容重要性更高
                # head + tail ： 选择前128个 token 和最后382个 token
                # text= train[0][:128] + train[0][382:]
                text= train[0][:maxlen]
                
                tokens = tokenizer.tokenize(text)
                # entity代表实体
                entity = train[1]
                
                # 获取实体的字符,因为首尾是cls和sep，所以取[1:-1]
                e_tokens = tokenizer.tokenize(entity)[1:-1]
                entity_left_np, entity_right_np = np.zeros(len(tokens)), np.zeros(len(tokens))
                
                # 返回e_tokens实体在tokenszi字符串中的起始位置
                start = list_find(tokens, e_tokens)
                if start != -1:
                    end = start + len(e_tokens) - 1
                    entity_left_np[start] = 1
                    entity_right_np[end] = 1
                    
                    # x1是词编码，x2是句子对关系编码
                    word_embedding, seg_embedding = tokenizer.encode(first=text)
                    X1.append(word_embedding)
                    X2.append(seg_embedding)
                    
                    # 对于文本分类来说，S1和S2代表标签
                    # 这里命名体识别任务S1、S2代表文本中的实体左右边界
                    # 比如 tokens=['[CLS]', '傻', '大', '姐', '借', '口', '给', '二', '妹', '送', '钱', 'love', '[SEP]'] 
                    # e_tokens = ['二', '妹']
                    # word_embedding 是text编码得到的词编码   [101, 1004, 1920, 1995, 955, 1366, 5314, 753, 1987, 6843, 7178, 8451, 102]
                    # seg_embedding 是text编码得到的句子对编码   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                    # s1是数组中实体开始位置为1其他均为0 [array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])]
                    # s2是数组中实体结束为只为1其他均为0 [array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])]                    
                    S1.append(entity_left_np)
                    S2.append(entity_right_np)
                    if len(X1) == self.batch_size or i == idxs[-1]:
                        X1 = seq_padding(X1)
                        X2 = seq_padding(X2)
                        S1 = seq_padding(S1)
                        S2 = seq_padding(S2)
                        yield [X1, X2, S1, S2], None
                        X1, X2, S1, S2 = [], [], [], []

In [19]:
# 构建训练模型
# 整个模型是单输入和单输出的问题
# 模型输入是一条query文本，这里会先将文本转换成三层embedding，token embedding、seg embedding和position embedding
# 因为句子关系可以直接获取，所以只返回token embedding、seg embedding两个输入，作为网络的输入
# 模型输出是一个实体，这个实体是query中的一个子片段
#根据这个输出特性，输出应该用指针结构，通过两个Softmax分别预测首尾，然后得到一个实体
# 所以这里返回实体的左边界和右边界作为网络的输出

# 导入预训练模型
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

# 是否进行微调
for layer in bert_model.layers:
    layer.trainable = True

# 词编码输入
word_in = Input(shape=(None,), name='word_in') 
# 句子对编码输入
seg_in = Input(shape=(None,), name='seg_in')
# 实体左边界数组，只有实体开始位置为1，其他均为0
entiry_left_in = Input(shape=(None,), name='entiry_left_in')
# 实体右边界数组，只有实体结束位置为1，其他均为0
entiry_right_in = Input(shape=(None,), name='entiry_right_in')

x1, x2, s1, s2 = word_in, seg_in, entiry_left_in, entiry_right_in

bert_in = bert_model([word_in, seg_in])
ps1 = Dense(1, use_bias=False, name='ps1')(bert_in)
# 遮掩掉不应该读取到的信息，或者无用的信息，以0作为mask的标记
x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'), name='x_mask')(word_in)
ps2 = Dense(1, use_bias=False, name='ps2')(bert_in)
ps11 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10, name='ps11')([ps1, x_mask])
ps22 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10, name='ps22')([ps2, x_mask])

train_model = Model([word_in, seg_in, entiry_left_in, entiry_right_in], [ps11, ps22])

# 构建模型
build_model = Model([word_in, seg_in], [ps11, ps22])

loss1 = K.mean(K.categorical_crossentropy(entiry_left_in, ps11, from_logits=True))
ps22 -= (1 - K.cumsum(s1, 1)) * 1e10
loss2 = K.mean(K.categorical_crossentropy(entiry_right_in, ps22, from_logits=True))
loss = loss1 + loss2

train_model.add_loss(loss)
train_model.compile(optimizer=Adam(learning_rate))
train_model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_in (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
seg_in (InputLayer)             (None, None)         0                                            
__________________________________________________________________________________________________
model_8 (Model)                 (None, None, 768)    101677056   word_in[0][0]                    
                                                                 seg_in[0][0]                     
__________________________________________________________________________________________________
ps1 (Dense)                     (None, None, 1)      768         model_8[1][0]              

  'be expecting any data to be passed to {0}.'.format(name))
  'be expecting any data to be passed to {0}.'.format(name))


In [11]:
# 经过一个softmax操作
def softmax(x):
    x = x - np.max(x)
    x = np.exp(x)
    return x / np.sum(x)
softmax([1, 9, 5, 3])

# 抽取实体
# 输入用户搜索query
# 输出实体
def extract_entity(text_in):
    text_in = text_in[:maxlen]
    _tokens = tokenizer.tokenize(text_in)
    _x1, _x2 = tokenizer.encode(first=text_in)
    _x1, _x2 = np.array([_x1]), np.array([_x2])
    _ps1, _ps2  = build_model.predict([_x1, _x2])
    _ps1, _ps2 = softmax(_ps1[0]), softmax(_ps2[0])
    for i, _t in enumerate(_tokens):
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            _ps1[i] -= 10
    start = _ps1.argmax()
    for end in range(start, len(_tokens)):
        _t = _tokens[end]
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            break
    end = _ps2[start:end+1].argmax() + start
    a = text_in[start-1: end]
    return a

class Evaluate(Callback):
    """构建自定义评估期"""
    def __init__(self):
        self.ACC = []
        self.best = 0.
        self.passed = 0
    def on_batch_begin(self, batch, logs=None):
        """第一个epoch用来warmup，第二个epoch把学习率降到最低
        """
        if self.passed < self.params['steps']:
            lr = (self.passed + 1.) / self.params['steps'] * learning_rate
            K.set_value(self.model.optimizer.lr, lr)
            self.passed += 1
        elif self.params['steps'] <= self.passed < self.params['steps'] * 2:
            lr = (2 - (self.passed + 1.) / self.params['steps']) * (learning_rate - min_learning_rate)
            lr += min_learning_rate
            K.set_value(self.model.optimizer.lr, lr)
            self.passed += 1
    def on_epoch_end(self, epoch, logs=None):
        acc = self.evaluate()
        self.ACC.append(acc)
        if acc > self.best:
            self.best = acc
            train_model.save_weights('best_model.weights')
        print('acc: %.4f, best acc: %.4f\n' % (acc, self.best))
    def evaluate(self):
        A = 1e-10
        F = open('dev_pred.json', 'w')
        for d in tqdm(iter(dev_list)):
            R = extract_entity(d[0])
            if R == d[1]:
                A += 1
            s = ', '.join(d + (R,))
            F.write(s + '\n')
        F.close()
        return A / len(dev_list)

evaluator = Evaluate()
train_D = train_data_generator(train_list)
train_model.fit_generator(train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=2,
    callbacks=[evaluator]
)

In [21]:
extract_entity("今天大乐透开奖了么？")

'大乐透'

In [22]:
extract_entity("奥迪A6多少钱")

'奥迪A6'

In [26]:
# 抽取实体测试
# 输入文本
# 返回实体列表，这里最多返回num个实体
def extract_entity_test(model, text_in, num):
    text_in = text_in[:maxlen]
    _tokens = tokenizer.tokenize(text_in)
    _x1, _x2 = tokenizer.encode(first=text_in)
    _x1, _x2 = np.array([_x1]), np.array([_x2])
    _ps1, _ps2  = model.predict([_x1, _x2])
    _ps1, _ps2 = softmax(_ps1[0]), softmax(_ps2[0])
    
    # 特殊字符转换为负值
    for i, _t in enumerate(_tokens):
        if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
            _ps1[i] -= 10
            
    tg_list = list()
    
    for i in range(num):
        #[0.99977237, 0.00011352481, 4.0782343e-05, 2.4224111e-05, 1.7350189e-05, 1.0297682e-05, 8.015117e-06, 6.223183e-06
        #, 3.117688e-06, 1.7270181e-06, 1.125549e-06, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0]
        #_ps1中的值代表为实体的概率得分，越大越可能是实体的左边界
        #将_ps1按概率值值降序排序
        #num代表选择topN个实体
        start = np.argwhere((_ps1==sorted(_ps1,reverse=True)[i]))[0][0]
        
        # 设置中断的条件，当字符的长度为1并且为特殊字符并且不属于正常字符
        for end in range(start, len(_tokens)):
            _t = _tokens[end]
            if len(_t) == 1 and re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', _t) and _t not in additional_chars:
                break
        # _ps2中的值代表为实体的概率得分
        # argmax()是返回_ps2最大值的索引        
        end = _ps2[start:end+1].argmax() + start
        a = text_in[start-1: end]
        tg_list.append(a)
        tg_list = list(set(tg_list))
        print(i, start, end,a )
    return ';'.join(tg_list)

# 导入模型权重
build_model.load_weights('best_model.weights')

# 预测单个文本的实体
extract_entity_test(build_model, '今天大乐透开奖了吗？', 2)

0 3 5 大乐透
1 4 5 乐透


'乐透;大乐透'