In [1]:
import json
import tensorflow as tf
import numpy as np
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import os
os.environ['CUDA_VISIBLE_DEVICES']='3'

Using TensorFlow backend.


In [2]:
# Locations of the pre-trained checkpoint files (model config, weights and
# vocabulary) that the rest of the notebook loads.
# NOTE(review): these are absolute, machine-specific paths — adjust them (or
# derive them from one configurable base directory) before running elsewhere.
config_path = '/opt/developer/wp/wzcq/roberta_wwm/bert_config.json'
checkpoint_path = '/opt/developer/wp/wzcq/roberta_wwm/bert_model.ckpt'
dict_path = '/opt/developer/wp/wzcq/roberta_wwm/vocab.txt'

In [3]:
def read_squad_examples(input_file):
    """Read a SQuAD-style JSON file into a flat list of example dicts.

    Args:
        input_file: Path of a JSON file whose top-level "data" list holds
            entries with a "paragraphs" list; every paragraph carries a
            "context" string and a "qas" list of questions.

    Returns:
        A list with one dict per question, using the keys "context" (the
        paragraph text), "quary" (the question text; the key spelling is the
        one the rest of this file reads) and "ans" (text of the first listed
        answer). Questions whose "answers" list is empty are skipped, because
        there is no answer text to store for them.
    """
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]
    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                if not answers:
                    # Indexing answers[0] would raise IndexError here.
                    continue
                examples.append({
                    "context": paragraph_text,
                    "quary": qa["question"],
                    # Only the first reference answer is kept.
                    "ans": answers[0]["text"],
                })
    return examples

In [4]:
def get_token_dict(token_file):
    """Load a vocabulary file into a ``{token: id}`` mapping.

    Args:
        token_file: Path of a text file holding one token per line.

    Returns:
        A dict mapping each whitespace-stripped line to its zero-based line
        number. If two lines strip to the same token, the later line wins.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(token_file, "r", encoding="utf-8") as f:
        return {line.strip(): index for index, line in enumerate(f)}


class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Return one vocabulary token for every character of ``text``.

        A character found in the vocabulary is kept as is; a whitespace
        character becomes the untrained '[unused1]' token; anything else
        becomes '[UNK]'.
        """
        vocab = self._token_dict
        return [
            ch if ch in vocab else ('[unused1]' if self._is_space(ch) else '[UNK]')
            for ch in text
        ]
# Build the token -> id vocabulary from the vocab file and create the single
# tokenizer instance used by the data generator and by inference below.
token_dict = get_token_dict(dict_path)
tokenizer = OurTokenizer(token_dict)

In [5]:
import re
def extract_ans_doc(ans, doc):
    """Cut a document down to the sentences surrounding the answer.

    All whitespace is removed from both ``ans`` and ``doc``; the document is
    then split on the full stop "。" and the first sentence containing the
    answer is located.

    Args:
        ans: Answer text.
        doc: Full context text.

    Returns:
        ``(ans, context)`` where ``ans`` is the whitespace-free answer and
        ``context`` is the previous sentence (when there is one), the matching
        sentence and the next sentence (when there is one), joined with "。".
        When the matching sentence is the last piece, the context ends with
        "。" instead of a following sentence. ``(None, None)`` is returned when
        no single sentence contains the answer.
    """
    doc = re.sub(r"\s+", "", doc)
    ans = re.sub(r"\s+", "", ans)
    sub_doc = doc.split("。")
    last = len(sub_doc) - 1
    for i, sentence in enumerate(sub_doc):
        if ans not in sentence + "。":
            continue
        # Only prepend a previous sentence when one exists; index -1 would
        # silently wrap around to the END of the document.
        prefix = sub_doc[i - 1] + "。" if i > 0 else ""
        if i < last:
            return ans, prefix + sentence + "。" + sub_doc[i + 1]
        return ans, prefix + sentence + "。"
    return None, None
        
        
        
def get_short_data(data):
    """Shorten every example's context to the sentences around its answer.

    Args:
        data: Iterable of example dicts with at least the keys "ans" and
            "context".

    Returns:
        A new list of new dicts. Each is a shallow copy of the input example
        with "context" and "ans" replaced by the values returned by
        ``extract_ans_doc``. Examples for which ``extract_ans_doc`` returns
        ``None`` for either value are dropped. The input dicts are left
        untouched, so calling this again on the same ``data`` gives the same
        result.
    """
    new_data_list = []
    for d in data:
        new_ans, new_context = extract_ans_doc(d["ans"], d["context"])
        if new_ans is None or new_context is None:
            continue
        # Copy before editing so the caller's examples are not modified.
        shortened = dict(d)
        shortened["context"] = new_context
        shortened["ans"] = new_ans
        new_data_list.append(shortened)
    return new_data_list
    

In [6]:
# Load the training split, then shorten each context to the sentences around
# its answer; examples whose answer cannot be located are dropped.
data = read_squad_examples("squad-style-data/cmrc2018_train.json")
data_new = get_short_data(data)

In [7]:
# Number of examples that survived the shortening step.
len(data_new)

9906

In [8]:
# Inspect one shortened example (keys: 'context', 'quary', 'ans').
data_new[1]

{'context': '范廷颂枢机（，），圣名保禄·若瑟（），是越南罗马天主教枢机。1963年被任为主教；1990年被擢升为天主教河内总教区宗座署理；1994年被擢升为总主教，同年年底被擢升为枢机；2009年2月离世。范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生；童年时接受良好教育后，被一位越南神父带到河内继续其学业',
 'quary': '1990年，范廷颂担任什么职务？',
 'ans': '1990年被擢升为天主教河内总教区宗座署理'}

In [9]:
# [len(i["context"]) for i in data]

In [10]:
def seq_padding(X, padding=0, max_len=512):
    """Bring every sequence in ``X`` to exactly ``max_len`` items.

    Sequences shorter than ``max_len`` are extended on the right with
    ``padding``; all others are cut to their first ``max_len`` items.

    Returns:
        ``np.array`` built from the resulting rows.
    """
    rows = []
    for seq in X:
        shortfall = max_len - len(seq)
        if shortfall > 0:
            rows.append(np.concatenate([seq, [padding] * shortfall]))
        else:
            rows.append(seq[:max_len])
    return np.array(rows)


def list_find(list1, list2):
    """Search ``list1`` for the contiguous sub-list ``list2``.

    Returns the index of the first match, or -1 when there is none.
    """
    width = len(list2)
    matches = (
        pos for pos in range(len(list1))
        if list1[pos:pos + width] == list2
    )
    return next(matches, -1)


class data_generator:
    """Endless, reshuffling mini-batch generator for span-extraction training.

    Each example dict must provide "context", "quary" (the question) and
    "ans". Iterating yields ``([X1, X2, S1, S2], None)`` where X1/X2 are the
    two arrays returned by ``tokenizer.encode`` and S1/S2 are one-hot rows
    marking the first and last token of the answer; all four are padded by
    ``seq_padding``.

    ``len(generator)`` is ``ceil(len(data) / batch_size)``. Examples whose
    answer tokens cannot be found in the tokenized text are skipped, so one
    pass over the data may yield fewer batches than that.
    """

    def __init__(self, data, batch_size=16):
        self.data = data
        self.batch_size = batch_size
        # Ceiling division: a trailing partial batch counts as one step.
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        if len(self.data) == 0:
            # Nothing to yield; without this guard the loop below would spin
            # forever without producing a batch.
            return
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, S1, S2 = [], [], [], []
            for i in idxs:
                d = self.data[i]
                context, question = d["context"], d["quary"]
                # The model input is the question followed by the context.
                text = u'___%s___%s' % (question, context)
                tokens = tokenizer.tokenize(text)
                # Drop the first and last token of the tokenized answer
                # (presumably the special tokens the tokenizer adds).
                answer_tokens = tokenizer.tokenize(d["ans"])[1:-1]
                start = list_find(tokens, answer_tokens)
                if start != -1:
                    end = start + len(answer_tokens) - 1
                    s1, s2 = np.zeros(len(tokens)), np.zeros(len(tokens))
                    s1[start] = 1
                    s2[end] = 1
                    x1, x2 = tokenizer.encode(first=text)
                    X1.append(x1)
                    X2.append(x2)
                    S1.append(s1)
                    S2.append(s2)
                # Flush on a full batch or at the end of the pass. This check
                # must run even when the current example was skipped,
                # otherwise a pending partial batch is lost whenever the last
                # shuffled example has no locatable answer.
                if X1 and (len(X1) == self.batch_size or i == idxs[-1]):
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    S1 = seq_padding(S1)
                    S2 = seq_padding(S2)
                    yield [X1, X2, S1, S2], None
                    X1, X2, S1, S2 = [], [], [], []


In [14]:
# Training batch generator over the shortened examples, 4 examples per batch.
train_D = data_generator(data_new,batch_size=4)

In [12]:
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam


# Load the pre-trained encoder from the checkpoint; seq_len=None is passed so
# the sequence length is not fixed here.
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

# Fine-tune every layer of the encoder.
for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,)) # first encoder input (X1 from the data generator)
x2_in = Input(shape=(None,)) # second encoder input (X2 from the data generator)
s1_in = Input(shape=(None,)) # answer start position (one-hot label)
s2_in = Input(shape=(None,)) # answer end position (one-hot label)

x1, x2, s1, s2 = x1_in, x2_in, s1_in, s2_in
# Mask with a trailing axis: 1.0 where the x1 value is > 0, 0.0 elsewhere
# (seq_padding pads with 0 by default).
x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1)

x = bert_model([x1, x2])
# One score per position for "the answer starts here".
ps1 = Dense(1, use_bias=False)(x)


# Drop the trailing size-1 axis: [[0.1],[0.2],[0.3]..] -> [0.1,0.2,0.3,...]
# then subtract 1e10 wherever the mask is 0 so those positions cannot win.
ps1 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)([ps1, x_mask])
# ps1 = Lambda(lambda x: x[0]*x[1])([ps1, x_mask])


# Same head and masking for "the answer ends here".
ps2 = Dense(1, use_bias=False)(x)
ps2 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)([ps2, x_mask])
# ps2 = Lambda(lambda x:x[0]*x[1])([ps2, x_mask])

# Inference model: takes only the two encoder inputs.
model = Model([x1_in, x2_in], [ps1, ps2])


# Training model: same outputs, but also takes the label inputs because the
# loss is attached with add_loss() below.
train_model = Model([x1_in, x2_in, s1_in, s2_in], [ps1, ps2])
# Example: label [0,0,1,0,0] against scores [0.1,0.1,0.8,-1e10,-1e10,-1e10]
loss1 = K.mean(K.categorical_crossentropy(s1_in, ps1, from_logits=True)) 
# K.cumsum(s1, 1) is 0 before the labelled start and 1 from it onward, e.g.
# [0,0,0,0,1,1,1,1]; subtracting (1 - cumsum) * 1e10 from ps2 suppresses end
# positions that lie before the start. This adjusted ps2 is only used for the
# loss; both Model objects above were built from the unadjusted ps2.
ps2 -= (1 - K.cumsum(s1, 1)) * 1e10
loss2 = K.mean(K.categorical_crossentropy(s2_in, ps2, from_logits=True))
loss = loss1 + loss2

# The loss is attached to the graph directly, so compile() gets no loss
# argument.
train_model.add_loss(loss)
train_model.compile(optimizer=Adam(learning_rate=0.00001))
train_model.summary() 

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, None, 768)    101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, None, 1)      768         model_2[1][0]              

  'be expecting any data to be passed to {0}.'.format(name))
  'be expecting any data to be passed to {0}.'.format(name))


In [34]:
# Train for 3 epochs, drawing len(train_D) batches per epoch from the endless
# generator.
train_model.fit_generator(train_D.__iter__(),
                          steps_per_epoch=len(train_D),
                          epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f2eb42a3470>

In [36]:
# Persist the inference model (the one taking only the two encoder inputs).
# NOTE(review): the ".hf" extension looks like it may be a typo for ".h5" — confirm.
model.save("mc_weights_new.hf")

In [50]:
def softmax(x):
    """Softmax over ALL elements of ``x``.

    The global maximum is subtracted before exponentiating so large inputs do
    not overflow.
    """
    exps = np.exp(x - np.max(x))
    total = np.sum(exps)
    return exps / total

def extract_entity(text_in, c_in):
    """Predict the answer span for question ``c_in`` in passage ``text_in``.

    The question and passage are combined as '___<question>___<passage>',
    cut to 510 characters, encoded and passed to ``model``. The start is the
    best-scoring position of the first output; the end is the best-scoring
    position of the second output at or after the start.

    Args:
        text_in: Passage text.
        c_in: Question text.

    Returns:
        The predicted substring of the combined (truncated) input text.

    Side effects:
        Prints the question and the predicted start and end token indices.
    """
    print(c_in)
    text_in = u'___%s___%s' % (c_in, text_in)
    # Presumably 510 so that, with two special tokens added by the tokenizer,
    # the sequence stays within 512 positions — TODO confirm.
    text_in = text_in[:510]
    _x1, _x2 = tokenizer.encode(first=text_in)
    _x1, _x2 = np.array([_x1]), np.array([_x2])
    _ps1, _ps2 = model.predict([_x1, _x2])
    _ps1, _ps2 = softmax(_ps1[0]), softmax(_ps2[0])
    start = _ps1.argmax()
    print(start)
    # The end is searched only from the start position onward.
    end = _ps2[start:].argmax() + start
    print(end)
    # Token index t is treated as character t - 1 (assumes one token per
    # character after a single leading special token). Clamp at 0: if the
    # model picks position 0, a slice start of -1 would wrap around to the
    # end of the string.
    return text_in[max(start - 1, 0): end]

In [51]:
# Load the dev split for spot-checking predictions.
data_test = read_squad_examples("squad-style-data/cmrc2018_dev.json")

In [52]:
# Spot-check the model on one dev example (index 9).
extract_entity(data_test[9]["context"],data_test[9]["quary"])

广三铁路在哪年建成？
415
419


'1903年'

In [54]:
# Spot-check the model on a passage and question written inline.
extract_entity('''恺英网络股份有限公司（SZ.002517）是国内领先的互动娱乐综合服务商。
主要业务涵盖页游与手游等游戏业务的研发、运营及发行，网页游戏平台、移动应用分发平台的运营，
以及VR、大数据智能处理等互联网高科技业务投资。恺英网络始终秉持“专注品质、用心服务”的理念，
坚持“研发+发行双轮驱动”，并在全球范围内搜寻和引进优质IP，通过游戏产品（移动游戏、网页游戏、H5游戏）
和发行平台（XY页游平台、XY助手、XY游、MG游戏）进行横向延伸，通过电竞、动画、漫画、影视剧等泛娱乐内容进行纵向布局，
全力为用户打造优质感官体验。旗下上海恺英、浙江盛和及浙江九翎先后开发并运营了《摩天大楼》、《蜀山传奇》、《全民奇迹MU》、
《王者传奇》、《蓝月传奇》、《敢达争锋对决》、《战舰世界闪击战》、H5游戏《传奇来了》等多款热门游戏。
未来，恺英网络将持续深耕游戏业务，围绕“游戏+内容+互联网高科技”的战略部署，强化产业链上下游实力，拓展海外市场，
继续为用户提供优质内容服务和深度游戏娱乐体验，打造卓越的互联网游戏上市公司。''',"恺英的主要业务是什么？")

恺英的主要业务是什么？
379
475


'\n未来，恺英网络将持续深耕游戏业务，围绕“游戏+内容+互联网高科技”的战略部署，强化产业链上下游实力，拓展海外市场，\n继续为用户提供优质内容服务和深度游戏娱乐体验，打造卓越的互联网游戏上市公司。'