In [1]:
import numpy as np
from tqdm import tqdm
import os, json, codecs
import tensorflow as tf
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer, load_vocab, parallel_apply
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam
import os
# os.environ['CUDA_VISIBLE_DEVICES']='2'

Using TensorFlow backend.


In [2]:
# Files of the pretrained BERT model, relative to the working directory:
# model configuration (JSON), TensorFlow checkpoint prefix, and vocabulary.
config_path = './multilingual_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './multilingual_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './multilingual_L-12_H-768_A-12/vocab.txt'

In [3]:
def get_token_dict(token_file):
    """Load a vocabulary file into a ``{token: id}`` dictionary.

    Each line of ``token_file`` holds one token; the token's id is its
    zero-based line number. Surrounding whitespace (including the trailing
    newline) is stripped from every token. If two lines strip to the same
    token, the later line number wins.

    Args:
        token_file: Path to the vocabulary text file (read as UTF-8).

    Returns:
        dict mapping token string to integer id.
    """
    # Read explicitly as UTF-8: the vocabulary contains non-ASCII tokens and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(token_file, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising readlines().
        return {word.strip(): id_ for id_, word in enumerate(f)}


# class OurTokenizer(Tokenizer):
#     def _tokenize(self, text):
#         R = []
#         for c in text:
#             if c in self._token_dict:
#                 R.append(c)
#             elif self._is_space(c):
#                 R.append('[unused1]') # whitespace characters are represented by the untrained [unused1] token
#             else:
#                 R.append('[UNK]') # all remaining characters become [UNK]
#         return R


In [4]:
# Build the token -> id mapping from the vocabulary file at dict_path.
token_dict = get_token_dict(dict_path)

In [5]:
# bert4keras tokenizer built on that vocabulary; used below to encode
# sentence pairs and to decode generated ids.
tokenizer = Tokenizer(token_dict)

In [6]:
# Training corpus, relative to the working directory; read_data_examples
# parses it one record per line and data_generator reads each record's
# "chinese" and "english" fields.
path = "./translation2019zh_train.json"

In [7]:
def read_data_examples(input_file):
    """Read a line-delimited record file into a list.

    Every non-blank line of ``input_file`` is parsed as one record. Lines are
    parsed as JSON first; a line that is not valid JSON is parsed as a Python
    literal with ``ast.literal_eval`` so that files written with ``repr``
    (single-quoted dicts) still load. Unlike ``eval``, neither parser can
    execute code embedded in the data file.

    Args:
        input_file: Path to the data file (read as UTF-8).

    Returns:
        list of parsed records, in file order. Blank lines are skipped.

    Raises:
        ValueError: if a line is neither valid JSON nor a Python literal.
    """
    import ast  # local import: only needed for the non-JSON fallback

    data = []
    with open(input_file, "r", encoding="utf-8") as reader:
        # Stream the file line by line instead of loading it with readlines().
        for line_no, line in enumerate(reader, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except ValueError:
                try:
                    record = ast.literal_eval(line)
                except (ValueError, SyntaxError):
                    raise ValueError(
                        "%s: line %d is neither JSON nor a Python literal"
                        % (input_file, line_no)
                    )
            data.append(record)
    return data

In [8]:
# Load every record of the training file into memory as a list;
# data_generator iterates over this list.
data = read_data_examples(path)

In [9]:
def padding(seq,max_len=512):
    """Bring every sequence in ``seq`` to exactly ``max_len`` entries.

    Sequences shorter than ``max_len`` are right-padded with 0; sequences of
    ``max_len`` entries or more are cut to their first ``max_len`` entries.
    Note that the target length is always ``max_len``, not the longest
    sequence in the batch.

    Returns:
        numpy array built from the adjusted sequences.
    """
    def _fit(row):
        # Number of zeros needed to reach the target length.
        missing = max_len - len(row)
        if missing > 0:
            return np.concatenate([row, [0] * missing])
        return row[:max_len]

    return np.array([_fit(row) for row in seq])


def data_generator():
    """Yield training batches forever from the module-level ``data`` list.

    Each record's ``"chinese"`` and ``"english"`` fields are lower-cased and
    encoded together by ``tokenizer.encode`` into a pair of id lists (used as
    token ids and segment ids by the model). Once ``batch_size`` records have
    been collected, both lists are passed through ``padding`` and yielded.

    Records left over at the end of a pass (fewer than ``batch_size``) are
    dropped and the next pass starts from the beginning of ``data``.

    Yields:
        ``([X, S], None)`` where ``X`` and ``S`` are the padded arrays; the
        target is ``None`` because the loss is attached to the model itself.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``data`` holds
            fewer than ``batch_size`` records. Without this check the loop
            below would spin forever without ever yielding a batch.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))
    if len(data) < batch_size:
        raise ValueError(
            "data holds %d records, fewer than batch_size=%d"
            % (len(data), batch_size)
        )
    while True:
        X, S = [], []
        for line in data:
            x, s = tokenizer.encode(line["chinese"].lower(), line["english"].lower())
            X.append(x)
            S.append(s)
            if len(X) == batch_size:
                yield [padding(X), padding(S)], None
                X, S = [], []

In [11]:
# Build the BERT model in bert4keras's 'seq2seq' application mode from the
# pretrained config and checkpoint.
model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    # NOTE(review): the original comment here said "keep only the tokens in
    # keep_words to slim down the vocabulary", but no keep_words argument is
    # passed, so that comment looks stale; confirm.
)

model.summary()

# Use cross-entropy as the loss and mask out predictions for the input part.
y_in = model.input[0][:, 1:]  # target tokens
# Second model input shifted the same way; it is used as the loss mask, so
# only positions where it is non-zero contribute.
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens; predictions and targets are offset by one position
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
# Average the per-position loss over the masked positions only.
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
# The loss is attached to the model, so compile() takes no loss argument and
# the generator yields None as the target.
model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
# Disabled multi-GPU variant (the original note said "use 2 GPUs" although
# gpus=3 is written, and said this must come before the model is compiled):
# from keras.utils.training_utils import multi_gpu_model   # import the Keras multi-GPU helper
# parallel_model = multi_gpu_model(model, gpus=3)
# parallel_model.compile(optimizer=Adam(1e-5))



==> searching: bert/embeddings/word_embeddings, found name: bert/embeddings/word_embeddings
==> searching: bert/embeddings/position_embeddings, found name: bert/embeddings/position_embeddings
==> searching: bert/embeddings/token_type_embeddings, found name: bert/embeddings/token_type_embeddings
==> searching: bert/embeddings/LayerNorm/gamma, found name: bert/embeddings/LayerNorm/gamma
==> searching: bert/embeddings/LayerNorm/beta, found name: bert/embeddings/LayerNorm/beta
==> searching: bert/encoder/layer_0/attention/self/query/kernel, found name: bert/encoder/layer_0/attention/self/query/kernel
==> searching: bert/encoder/layer_0/attention/self/query/bias, found name: bert/encoder/layer_0/attention/self/query/bias
==> searching: bert/encoder/layer_0/attention/self/key/kernel, found name: bert/encoder/layer_0/attention/self/key/kernel
==> searching: bert/encoder/layer_0/attention/self/key/bias, found name: bert/encoder/layer_0/attention/self/key/bias
==> searching: bert/encoder/layer_

  'be expecting any data to be passed to {0}.'.format(name))


In [12]:
def gen_sent(s, topk=3):
    """Generate output text for ``s`` with beam search decoding.

    Only the ``topk`` best candidates are kept at every step; with
    ``topk=1`` this is greedy search.

    The input is cut to its first ``max_input_len`` characters before
    encoding, and at most ``max_output_len`` tokens are generated. Decoding
    stops early as soon as the highest-scoring candidate ends with the
    ``[SEP]`` id. Relies on the module-level ``tokenizer``, ``model``,
    ``token_dict``, ``max_input_len`` and ``max_output_len``.

    Returns:
        The result of ``tokenizer.decode`` on the highest-scoring candidate.
    """
    token_ids, segment_ids = tokenizer.encode(s[:max_input_len])
    target_ids = [[] for _ in range(topk)]  # ids of the candidate outputs
    target_scores = [0] * topk  # scores of the candidate outputs
    for i in range(max_output_len):  # force the output to at most max_output_len tokens
        # Append each candidate to the encoded input; generated positions get segment id 1.
        _target_ids = [token_ids + t for t in target_ids]
        _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
        _probas = model.predict([_target_ids, _segment_ids
                                 ])[: ,-1,3:]  # last position only, dropping vocabulary ids 0-2; the original comment called these [PAD], [UNK], [CLS] - NOTE(review): depends on vocab.txt ordering, confirm
        _log_probas = np.log(_probas + 1e-6)  # work in log space; 1e-6 avoids log(0)
        _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # topk continuations for every candidate
        _candidate_ids, _candidate_scores = [], []
        for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
            # When predicting the first token all topk inputs are identical,
            # so only the first one is expanded and the rest are skipped.
            if i == 0 and j > 0:
                continue
            for k in _topk_arg[j]:
                # k indexes the sliced distribution, so add 3 back to get the vocabulary id.
                _candidate_ids.append(ids + [k + 3])
                _candidate_scores.append(sco + _log_probas[j][k])
        _topk_arg = np.argsort(_candidate_scores)[-topk:]  # keep the new overall topk
        target_ids = [_candidate_ids[k] for k in _topk_arg]
        target_scores = [_candidate_scores[k] for k in _topk_arg]
        best_one = np.argmax(target_scores)
        if target_ids[best_one][-1] == token_dict.get("[SEP]"):
            return tokenizer.decode(target_ids[best_one])

    # No end token was produced within max_output_len steps: return the best candidate as is.
    return tokenizer.decode(target_ids[np.argmax(target_scores)])


In [13]:
# Sanity check: number of entries in the vocabulary mapping.
len(token_dict)

105879

In [18]:
def just_show():
    """Run ``gen_sent`` on a fixed, lower-cased Chinese sample and print the result."""
    sample = "我们学习python Web开发时，会选择使用Django、flask等框架。"
    translation = gen_sent(sample.lower())
    print('翻译结果:', translation)
    
    
    
class Evaluate(Callback):
    """Keras callback that checkpoints the best weights and prints a sample.

    At the end of every epoch the module-level ``model``'s weights are saved
    to ``./best_trans.weights`` when the reported training loss is no higher
    than the lowest loss seen so far, and ``just_show`` is called to print a
    sample output.
    """

    def __init__(self):
        # Let the base Callback initialise its own attributes.
        super().__init__()
        # Lowest training loss seen so far; starts at a large sentinel.
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights if the epoch loss is a new minimum, then show a sample.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras; may be ``None`` or lack
                ``'loss'``, in which case no checkpoint is written.
        """
        loss = (logs or {}).get('loss')
        # Keep the best weights.
        if loss is not None and loss <= self.lowest:
            self.lowest = loss
            model.save_weights('./best_trans.weights')
        # Demonstrate the current output quality.
        just_show()


# Run settings. These are module-level names read by functions defined in
# earlier cells, so this cell must run before those functions are called.
batch_size = 2  # records per batch yielded by data_generator
steps_per_epoch = 2000  # only referenced by the disabled fit_generator call below
epochs = 100  # only referenced by the disabled fit_generator call below
max_input_len = 256  # gen_sent cuts its input string to this many characters
max_output_len = 256  # gen_sent generates at most this many tokens
        
# Entry point: training is currently disabled; the previously saved best
# weights are loaded and a sample output is printed instead.
if __name__ == '__main__':

    evaluator = Evaluate()

    # Uncomment to fine-tune; Evaluate then saves the best weights each epoch.
#     model.fit_generator(data_generator(),
#                         steps_per_epoch=steps_per_epoch,
#                         epochs=epochs,
#                         callbacks=[evaluator])
    # Restore the weights written by Evaluate.on_epoch_end in an earlier run.
    model.load_weights("best_trans.weights")
    just_show()

翻译结果: when we learn about python web development, we choose to use django and flask frameworks.
