In [1]:
import os, json, codecs

import numpy as np
import pandas as pd
from tqdm import tqdm
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
from keras.layers import *
from keras import backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Set CUDA_VISIBLE_DEVICES to '4' for this process (presumably to pin the run
# to GPU 4 -- NOTE(review): confirm the device index on the target machine).
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

# Directory holding the data files; the two file names below are joined onto it.
data_base_dir = "/search/odin/liuyouyuan/pyproject/data/weibo_source"
article_file = "dev_art.txt"
abstract_file = "dev_abs.txt"
# JSON file in which build_vocab_json caches the character vocabulary.
vocab_path = "./data/weibo_vocab.json"

# BERT-related paths: model config, checkpoint and vocabulary file.
config_path = 'chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'chinese_L-12_H-768_A-12/vocab.txt'

# Minimum character frequency kept when the vocabulary is built; also embedded
# in model_name below.
min_count = 0
# Articles are truncated to this many characters.
max_input_len = 256
# Abstracts longer than this are skipped; also the maximum number of decoding steps.
max_output_len = 32
# Number of samples per batch produced by data_generator.
batch_size = 16
# Only referenced by the commented-out fit_generator call in the last cell.
steps_per_epoch = 1000
epochs = 10

print(f"args:{min_count}-{max_input_len}-{max_output_len}-{batch_size}")
# Weights file: written by the Evaluate callback and loaded in the last cell.
model_name = './weibo_model/model_{}.weights'.format(min_count)

args:0-256-32-16


In [4]:
def read_text(art_file, abs_file, input_len=None, output_len=None):
    """Lazily yield ``(article, abstract)`` pairs from two line-aligned files.

    Line *i* of ``art_file`` is paired with line *i* of ``abs_file``; iteration
    stops at the end of the shorter file.

    Args:
        art_file: Path of the article file (one article per line, UTF-8).
        abs_file: Path of the abstract file (one abstract per line, UTF-8).
        input_len: Maximum number of characters kept from each article.
            Defaults to the module-level ``max_input_len``.
        output_len: Pairs whose abstract is longer than this are skipped.
            Defaults to the module-level ``max_output_len``.

    Yields:
        Tuples ``(article[:input_len], abstract)`` with line terminators removed.

    Note:
        This is a generator, so the result can be iterated only once; wrap it
        in ``list(...)`` if several passes are needed.
    """
    if input_len is None:
        input_len = max_input_len
    if output_len is None:
        output_len = max_output_len
    # Explicit UTF-8 so the Chinese text does not depend on the process locale.
    with open(art_file, "r", encoding="utf-8") as art_f, \
            open(abs_file, "r", encoding="utf-8") as abs_f:
        for article, abstract in zip(art_f, abs_f):
            # Drop the line terminator: otherwise it counts towards the length
            # limit and is handed to the tokenizer as if it were content.
            article = article.rstrip("\r\n")
            abstract = abstract.rstrip("\r\n")
            if len(abstract) <= output_len:
                yield article[:input_len], abstract

                
def build_vocab_json(vocab_json, data):
    """Load the character vocabulary from ``vocab_json``, or build and cache it.

    If ``vocab_json`` already exists it is loaded as-is and ``data`` is not
    touched (so the module-level ``min_count`` has no effect on a cached file).
    Otherwise every character of every string in every tuple of ``data`` is
    counted, characters seen fewer than ``min_count`` times are dropped, the
    rest are ordered by descending frequency and written to ``vocab_json``.

    Args:
        vocab_json: Path of the JSON cache file.
        data: Iterable of tuples of strings (e.g. ``(article, abstract)``).
            It is fully consumed when the vocabulary has to be built.

    Returns:
        The list of characters, most frequent first.
    """
    if os.path.exists(vocab_json):
        # Context manager so the handle is closed deterministically.
        with open(vocab_json, encoding='utf-8') as f:
            chars_dic = json.load(f)
    else:
        counts = {}
        for tup in tqdm(data, desc='构建字表中'):
            for tex in tup:
                for c in tex:
                    counts[c] = counts.get(c, 0) + 1
        kept = [(c, n) for c, n in counts.items() if n >= min_count]
        # Stable sort on the negated count: ties keep first-seen order.
        kept.sort(key=lambda item: -item[1])
        chars_dic = [c for c, _ in kept]
        # Make sure the target directory exists before writing the cache.
        parent = os.path.dirname(vocab_json)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Closing the file guarantees the JSON is flushed to disk.
        with codecs.open(vocab_json, 'w', encoding='utf-8') as f:
            json.dump(chars_dic, f, indent=4, ensure_ascii=False)
    print("构建字表成功：", vocab_json)
    return chars_dic


def padding(x):
    """Right-pad every sequence in ``x`` with 0 up to the longest length in the batch.

    Args:
        x: Non-empty list of lists.

    Returns:
        A numpy array with one row per input sequence.
    """
    longest = max(map(len, x))
    padded = []
    for seq in x:
        padded.append(seq + [0] * (longest - len(seq)))
    return np.array(padded)


def data_generator(tokenizer, art_abs_data):
    """Endlessly yield padded training batches built from ``art_abs_data``.

    Each pair is encoded with ``tokenizer.encode(art, abstract)``; the two
    returned sequences are collected into batches of the module-level
    ``batch_size`` and padded with ``padding``. A trailing partial batch at the
    end of a pass is dropped and the next pass starts from an empty batch.

    Args:
        tokenizer: Object exposing ``encode(first, second)`` that returns two
            lists (presumably token ids and segment ids -- confirm against the
            tokenizer in use).
        art_abs_data: Iterable of ``(article, abstract)`` pairs. A one-shot
            iterator (such as a generator) is read into a list first so that
            it can be replayed on every pass.

    Yields:
        ``([X, Y], None)``: the two padded arrays as model inputs, with no
        separate target.

    Raises:
        ValueError: If a full pass over the data produces no batch, i.e. there
            are fewer than ``batch_size`` samples.
    """
    # An iterator returns itself from iter(); it would be exhausted after the
    # first pass and the outer loop would then spin forever without yielding.
    if iter(art_abs_data) is art_abs_data:
        art_abs_data = list(art_abs_data)
    while True:
        X, Y = [], []
        produced = False
        for art, abstract in art_abs_data:
            x, y = tokenizer.encode(art, abstract)
            X.append(x)
            Y.append(y)
            if len(X) == batch_size:
                produced = True
                yield [padding(X), padding(Y)], None
                X, Y = [], []
        if not produced:
            # Fail loudly instead of busy-looping on empty or too-small data.
            raise ValueError(
                "data_generator needs at least batch_size samples to build a batch"
            )

def gen_sent(model, tokenizer, s, topk=2, input_len=None, output_len=None):
    """Generate a summary for ``s`` with beam search.

    Only the ``topk`` best candidates are kept after each step; ``topk=1`` is
    greedy search. Decoding stops as soon as a kept candidate ends with token
    id 3 (the ``[SEP]`` slot of the ``token_dict`` built in this notebook), or
    after ``output_len`` steps.

    Args:
        model: Object whose ``predict([token_ids, segment_ids])`` returns an
            array indexed as ``[batch, position, vocab]``.
        tokenizer: Object exposing ``encode(text)`` (returning two lists) and
            ``decode(ids)``.
        s: Source text; only the first ``input_len`` characters are used.
        topk: Beam width.
        input_len: Maximum number of source characters. Defaults to the
            module-level ``max_input_len``.
        output_len: Maximum number of generated tokens. Defaults to the
            module-level ``max_output_len``.

    Returns:
        ``tokenizer.decode`` of the best finished candidate, or of the
        best-scoring candidate if none finished within ``output_len`` steps.
    """
    if input_len is None:
        input_len = max_input_len
    if output_len is None:
        output_len = max_output_len
    token_ids, segment_ids = tokenizer.encode(s[:input_len])
    # Candidate output ids, one list per beam.
    target_ids = [[] for _ in range(topk)]
    # Accumulated log-probability of each beam.
    target_scores = [0] * topk
    # Hard limit: never emit more than output_len tokens.
    for i in range(output_len):
        _target_ids = [token_ids + t for t in target_ids]
        _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
        # Drop the first three vocabulary slots ([PAD], [UNK], [CLS]).
        _probas = model.predict([_target_ids, _segment_ids])[:, -1, 3:]
        # Work in log space so scores add up.
        _log_probas = np.log(_probas + 1e-6)
        # Best topk continuations of every beam.
        _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]
        _candidate_ids, _candidate_scores = [], []
        for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
            # At the first step every beam holds the same (empty) prefix, so
            # expanding beam 0 alone is enough.
            if i == 0 and j > 0:
                continue
            for k in _topk_arg[j]:
                # +3 undoes the slice offset applied to the probabilities.
                _candidate_ids.append(ids + [int(k) + 3])
                _candidate_scores.append(sco + _log_probas[j][k])
        _topk_arg = np.argsort(_candidate_scores)[-topk:]
        # Replace the beams with the selected candidates as a whole. Appending
        # only the candidate's last token to beam j would attach it to the
        # wrong prefix whenever the candidate was expanded from another beam.
        target_ids = [_candidate_ids[k] for k in _topk_arg]
        target_scores = [_candidate_scores[k] for k in _topk_arg]
        ends = [j for j, k in enumerate(target_ids) if k[-1] == 3]
        if len(ends) > 0:
            k = np.argmax([target_scores[j] for j in ends])
            return tokenizer.decode(target_ids[ends[k]])
    # No end token within output_len steps: return the best-scoring beam.
    return tokenizer.decode(target_ids[np.argmax(target_scores)])


def show(model, tokenizer, s_list):
    """Print one generated summary per text in ``s_list``, then a blank line."""
    # Lazy generator: each summary is still produced right before it is printed.
    summaries = (gen_sent(model, tokenizer, text) for text in s_list)
    for summary in summaries:
        print('生成摘要:', summary)
    print()


class Evaluate(Callback):
    """Keras callback: keep the lowest-loss weights and print sample summaries.

    Relies on the module-level names ``model``, ``model_name``, ``tokenizer``
    and ``s_list``.
    NOTE(review): ``s_list`` is not defined in the visible part of this
    notebook; it must exist before this callback runs.
    """

    def __init__(self):
        super().__init__()
        # Lowest training loss seen so far; starts at a large sentinel value.
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights when the epoch loss is a new minimum, then demo the model.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict for the epoch; may be ``None`` or lack ``'loss'``,
                in which case no checkpoint is written.
        """
        # logs defaults to None, so guard before reading the loss.
        loss = (logs or {}).get('loss')
        # Keep the best weights.
        if loss is not None and loss <= self.lowest:
            self.lowest = loss
            model.save_weights(model_name)
        # Show sample generations.
        show(model, tokenizer, s_list)

In [6]:
def write_file(filename, s):
    """Write the string ``s`` to ``filename`` as UTF-8, replacing existing content.

    Args:
        filename: Path of the file to create or truncate.
        s: Text to write.
    """
    # Explicit UTF-8 so Chinese text does not depend on the process locale;
    # plain "w" is enough because the file is never read back here.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(s)

In [5]:
# Read the articles and reference abstracts from disk.
art_file = os.path.join(data_base_dir, article_file)
abs_file = os.path.join(data_base_dir, abstract_file)
# read_text returns a one-shot generator. Materialise it so the vocabulary
# build, the training generator and the prediction loop can each make a full
# pass; a bare generator would be empty after its first consumer.
data = list(read_text(art_file, abs_file))

# Build (or load) our own character vocabulary.
vocab_chars_dic = build_vocab_json(vocab_path, data)
# Load the BERT vocabulary.
_token_dict = load_vocab(dict_path)
# Build the reduced token_dict used by the tokenizer.
# keep_words lists the BERT vocabulary ids that are kept, in the new id order.
token_dict, keep_words = {}, []
for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    token_dict[c] = len(token_dict)
    keep_words.append(_token_dict[c])
for c in vocab_chars_dic:
    # Characters missing from the BERT vocabulary are skipped.
    if c in _token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
# Create the tokenizer.
tokenizer = SimpleTokenizer(token_dict)

# Define the model.
model = load_pretrained_model(
    config_path,
    checkpoint_path,
    seq2seq=True,
    keep_words=keep_words,  # keep only the entries in keep_words to shrink the vocabulary
)

model.summary()

# Cross-entropy loss, with predictions for the input part masked out.
# Target tokens.
y_in = model.input[0][:, 1:]
y_mask = model.input[1][:, 1:]
# Predicted tokens; predictions and targets are offset by one position.
y = model.output[:, :-1]
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))

构建字表成功： ./data/weibo_vocab.json
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    4855296     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]              
_____________________________________________________________________________

In [15]:
if __name__ == "__main__":
    # Training (currently disabled):
    #     evaluator = Evaluate()
    #     model.fit_generator(
    #          data_generator(tokenizer, data),
    #          steps_per_epoch=steps_per_epoch,
    #          epochs=epochs,
    #          callbacks=[evaluator]
    #      )

    # Inference: load the weights, then write one decoded summary and one
    # reference summary per sample.
    # NOTE(review): model_path is assigned but never used; the weights are
    # loaded from model_name. Confirm which file is intended.
    model_path = "./model/weibo_model_35.weights"
    model.load_weights(model_name)

    ref_dir = "./model/decoder/ref"
    dec_dir = "./model/decoder/dec"
    for out_dir in (ref_dir, dec_dir):
        os.makedirs(out_dir, exist_ok=True)

    ref_file = "{:06d}_reference.txt"
    dec_file = "{:06d}_decoded.txt"
    n = 0
    for art, abstract in data:
        dec_abstract = gen_sent(model, tokenizer, art, topk=2)
        write_file(os.path.join(dec_dir, dec_file.format(n)), dec_abstract)
        write_file(os.path.join(ref_dir, ref_file.format(n)), abstract)
        n += 1
    print("Decoder Done!")
    #show(model, tokenizer, s_list)

KeyboardInterrupt: 