In [54]:
import numpy as np
from tqdm import tqdm
import os, json, codecs
import tensorflow as tf
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer, load_vocab, parallel_apply
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam
import os
os.environ['CUDA_VISIBLE_DEVICES']='2'

In [55]:
config_path = '/opt/developer/wp/wzcq/roberta_wwm/bert_config.json'
checkpoint_path = '/opt/developer/wp/wzcq/roberta_wwm/bert_model.ckpt'
dict_path = '/opt/developer/wp/wzcq/roberta_wwm/vocab.txt'

In [56]:
def get_token_dict(token_file):
    with open(token_file,"r") as f:
        token_list = f.readlines()
        token_dict = {word.strip():id_ for id_,word in enumerate(token_list)}
    return token_dict


class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]') # space类用未经训练的[unused1]表示
            else:
                R.append('[UNK]') # 剩余的字符是[UNK]
        return R


In [57]:
token_dict = get_token_dict(dict_path)

In [58]:
tokenizer = OurTokenizer(token_dict)

In [62]:
tokenizer.encode('我在上海')

([101, 2769, 1762, 677, 3862, 102], [0, 0, 0, 0, 0, 0])

In [63]:
path = "./ci/ci.song.1000.json"

In [68]:
def read_ci_examples(input_file):
    """Read a tang poet json file into a list """
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)
    examples = []
    for entry in input_data:
        ci = []
        rhythmic = entry["rhythmic"]
        #print("rhythmic",rhythmic)
        ci.append(rhythmic+":")
        s = ""
        for paragraph in entry["paragraphs"]:   
            #print("paragraphs",paragraph)
            s += paragraph
        ci.append(s)
        examples.append(ci)    
    return examples     

In [69]:
data = read_ci_examples(path)

In [70]:
data[:1]

[['踏莎行:',
  '嵩峤云高，洛川波暖。举头乔木森无断。□□□雨绝风尘，小桥频过春渠满。□□离宫，□棱斗焕。万家罗绮多游伴。□□□□自风□，□□是处喧弦管。']]

In [71]:
def padding(x):
    """padding至batch内的最大长度
    """
    ml = max([len(i) for i in x])
    return np.array([i + [0] * (ml - len(i)) for i in x])


def data_generator():
    while True:
        X, S = [], []
        for t,d in data:
            x, s = tokenizer.encode(t,d)
            X.append(x)
            S.append(s)
            if len(X) == batch_size:
                X = padding(X)
                S = padding(S)
                yield [X, S], None
                X, S = [], []

In [73]:
#tokenizer.encode(data[1][0],data[1][1])

In [46]:
data[1][0],data[1][1]

('菩萨蛮:', '子规啼破城楼月。画船晓载笙歌发。两岸荔枝红。万家烟雨中。佳人相对泣。泪下罗衣湿。从此信音稀。岭南无雁飞。')

In [74]:
model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    # 只保留keep_words中的字，精简原字表
)

model.summary()

# 交叉熵作为loss，并mask掉输入部分的预测
y_in = model.input[0][:, 1:]  # 目标tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # 预测tokens，预测与目标错开一位
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))


==> searching: bert/embeddings/word_embeddings, found name: bert/embeddings/word_embeddings
==> searching: bert/embeddings/position_embeddings, found name: bert/embeddings/position_embeddings
==> searching: bert/embeddings/token_type_embeddings, found name: bert/embeddings/token_type_embeddings
==> searching: bert/embeddings/LayerNorm/gamma, found name: bert/embeddings/LayerNorm/gamma
==> searching: bert/embeddings/LayerNorm/beta, found name: bert/embeddings/LayerNorm/beta
==> searching: bert/encoder/layer_0/attention/self/query/kernel, found name: bert/encoder/layer_0/attention/self/query/kernel
==> searching: bert/encoder/layer_0/attention/self/query/bias, found name: bert/encoder/layer_0/attention/self/query/bias
==> searching: bert/encoder/layer_0/attention/self/key/kernel, found name: bert/encoder/layer_0/attention/self/key/kernel
==> searching: bert/encoder/layer_0/attention/self/key/bias, found name: bert/encoder/layer_0/attention/self/key/bias
==> searching: bert/encoder/layer_

  'be expecting any data to be passed to {0}.'.format(name))


In [93]:
def gen_sent(s, topk=3):
    """beam search解码
    每次只保留topk个最优候选结果；如果topk=1，那么就是贪心搜索
    """
    token_ids, segment_ids = tokenizer.encode(s[:max_input_len])
    target_ids = [[] for _ in range(topk)]  # 候选答案id
    target_scores = [0] * topk  # 候选答案分数
    for i in range(max_output_len)[:3]:  # 强制要求输出不超过max_output_len字
        _target_ids = [token_ids + t for t in target_ids]
        print("_target_ids",_target_ids)
        _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
        print("_segment_ids",_segment_ids)
        _probas = model.predict([_target_ids, _segment_ids
                                 ])[: ,-1,100:]  # 直接忽略[PAD], [UNK], [CLS]
        print("_probas",_probas)
        _log_probas = np.log(_probas + 1e-6)  # 取对数，方便计算
        _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # 每一项选出topk
        _candidate_ids, _candidate_scores = [], []
        for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
            # 预测第一个字的时候，输入的topk事实上都是同一个，
            # 所以只需要看第一个，不需要遍历后面的。
            if i == 0 and j > 0:
                continue
            for k in _topk_arg[j]:
                _candidate_ids.append(ids + [k + 3])
                _candidate_scores.append(sco + _log_probas[j][k])
        _topk_arg = np.argsort(_candidate_scores)[-topk:]  # 从中选出新的topk
        target_ids = [_candidate_ids[k] for k in _topk_arg]
        print("target_ids",target_ids)
        target_scores = [_candidate_scores[k] for k in _topk_arg]
        print("target_scores", target_scores)
        best_one = np.argmax(target_scores)
        if target_ids[best_one][-1] == 10:
            return tokenizer.decode(target_ids[best_one])
    # 如果max_output_len字都找不到结束符，直接返回
    return tokenizer.decode(target_ids[np.argmax(target_scores)])


In [94]:
def just_show():
    s1 = "菩萨蛮:"
#     s2 = "踏莎行:"
#     for s in [s1, s2]:
    print('生成词:', gen_sent(s1))
    print()
    
    
    
class Evaluate(Callback):
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        # 保存最优
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            model.save_weights('./best_model.weights')
        # 演示效果
        just_show()


batch_size = 16
steps_per_epoch = 1000  
epochs = 100
max_input_len = 32
max_output_len = 256
        
if __name__ == '__main__':

#     evaluator = Evaluate()

#     model.fit_generator(data_generator(),
#                         steps_per_epoch=steps_per_epoch,
#                         epochs=epochs,
#                         callbacks=[evaluator])
    model.load_weights("best_model.weights")
    just_show()

_target_ids [[101, 5835, 5855, 6037, 131, 102], [101, 5835, 5855, 6037, 131, 102], [101, 5835, 5855, 6037, 131, 102]]
_segment_ids [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
_probas [[6.38667261e-05 2.24637144e-08 5.70373295e-08 ... 1.10356568e-08
  1.62667335e-09 1.15774865e-08]
 [6.38667261e-05 2.24637144e-08 5.70373295e-08 ... 1.10356568e-08
  1.62667335e-09 1.15774865e-08]
 [6.38667261e-05 2.24637482e-08 5.70373082e-08 ... 1.10356746e-08
  1.62667591e-09 1.15775149e-08]]
target_ids [[1929], [4417], [4276]]
target_scores [-3.4190540313720703, -3.100290298461914, -2.9956352710723877]
_target_ids [[101, 5835, 5855, 6037, 131, 102, 1929], [101, 5835, 5855, 6037, 131, 102, 4417], [101, 5835, 5855, 6037, 131, 102, 4276]]
_segment_ids [[0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1]]
_probas [[1.9493209e-05 5.7843141e-10 5.3411046e-13 ... 1.1163006e-09
  2.4262787e-09 1.3938489e-11]
 [1.2881893e-10 7.7174579e-13 3.0951424e-14 ... 1.3348185e-12
  3.089