In [1]:

# Load the parallel corpus. Each line is expected to hold three tab-separated
# fields: an utterance key, space-separated pinyin syllables and
# space-separated characters.
with open('../data/zh.tsv', 'r', encoding='utf-8') as f:
    data = f.readlines()

# Parsing happens after the file is closed; only `data` is needed from here on.
inputs = []
labels = []
for raw_line in data:
    # A blank (e.g. trailing) line would otherwise break the 3-way unpack below.
    if not raw_line.strip():
        continue
    # Lines that do not have exactly three fields still raise ValueError on purpose.
    _key, pny, word = raw_line.split('\t')
    inputs.append(pny.split(' '))
    labels.append(word.strip('\n').split(' '))

# Preview at most the first 10 pairs without failing on a smaller corpus.
for i in range(min(10, len(inputs))):
    print(inputs[i])
    print(labels[i])
    print()

['lv4', 'shi4', 'yang2', 'chun1', 'yan1', 'jing3', 'da4', 'kuai4', 'wen2', 'zhang1', 'de', 'di3', 'se4', 'si4', 'yue4', 'de', 'lin2', 'luan2', 'geng4', 'shi4', 'lv4', 'de2', 'xian1', 'huo2', 'xiu4', 'mei4', 'shi1', 'yi4', 'ang4', 'ran2']
['绿', '是', '阳', '春', '烟', '景', '大', '块', '文', '章', '的', '底', '色', '四', '月', '的', '林', '峦', '更', '是', '绿', '得', '鲜', '活', '秀', '媚', '诗', '意', '盎', '然']

['ta1', 'jin3', 'ping2', 'yao1', 'bu4', 'de', 'li4', 'liang4', 'zai4', 'yong3', 'dao4', 'shang4', 'xia4', 'fan1', 'teng2', 'yong3', 'dong4', 'she2', 'xing2', 'zhuang4', 'ru2', 'hai3', 'tun2', 'yi1', 'zhi2', 'yi3', 'yi1', 'tou2', 'de', 'you1', 'shi4', 'ling3', 'xian1']
['他', '仅', '凭', '腰', '部', '的', '力', '量', '在', '泳', '道', '上', '下', '翻', '腾', '蛹', '动', '蛇', '行', '状', '如', '海', '豚', '一', '直', '以', '一', '头', '的', '优', '势', '领', '先']

['pao4', 'yan3', 'da3', 'hao3', 'le', 'zha4', 'yao4', 'zen3', 'me', 'zhuang1', 'yue4', 'zheng4', 'cai2', 'yao3', 'le', 'yao3', 'ya2', 'shu1', 'de', 'tuo1', 'qu4', 'yi1', 'fu2

In [2]:
def get_vocab(data):
    '''
    Build a vocabulary list from tokenised lines.

    :param data: Iterable of token sequences (tokens must be hashable).
    :return:
        A list whose position is the token id. Index 0 is always '<PAD>';
        the remaining tokens follow in order of first appearance.
    '''
    vocab = ['<PAD>']
    # The set mirrors `vocab` so the membership test is O(1) instead of a
    # linear scan of the list for every token in the corpus.
    seen = {'<PAD>'}
    for line in data:
        for char in line:
            if char not in seen:
                seen.add(char)
                vocab.append(char)

    return vocab

# Despite the names, both vocabularies are plain lists (position == id),
# one for pinyin syllables and one for characters.
pny2id, word2id = get_vocab(inputs), get_vocab(labels)

print(f'拼音词表长度为: {len(pny2id)}')
print(f'文字词表长度为: {len(word2id)}')

# Show the first ten entries of each vocabulary, one list per line.
print(pny2id[:10], word2id[:10], sep='\n')

拼音词表长度为: 1152
文字词表长度为: 4460
['<PAD>', 'lv4', 'shi4', 'yang2', 'chun1', 'yan1', 'jing3', 'da4', 'kuai4', 'wen2']
['<PAD>', '绿', '是', '阳', '春', '烟', '景', '大', '块', '文']


In [3]:
# Build token -> id tables once; calling `list.index` for every token would
# rescan the vocabulary list each time. Vocabulary entries are unique, so the
# dict lookup returns the same id as `.index` would.
pny_lookup = {pny: idx for idx, pny in enumerate(pny2id)}
word_lookup = {word: idx for idx, word in enumerate(word2id)}

input_num = [[pny_lookup[pny] for pny in line] for line in inputs]
label_num = [[word_lookup[word] for word in line] for line in labels]

In [4]:
import numpy as np

def get_batch(input_data, label_data, batch_size):
    '''
    Yield zero-padded (input, label) batches as numpy arrays.

    Only full batches are produced: trailing samples that do not fill a
    complete batch are dropped. Every row of a batch is right-padded with 0
    up to the longest *input* row of that batch; label rows are padded to
    that same width.

    :param input_data: List of id lists.
    :param label_data: List of id lists, aligned with `input_data`.
    :param batch_size: Number of samples per batch.
    :return: Generator of `(input_batch, label_batch)` tuples.
    '''
    def _pad(rows, width):
        # Right-pad every row with the id 0 up to `width`.
        return np.array([row + [0] * (width - len(row)) for row in rows])

    full_batches = len(input_data) // batch_size
    for index in range(full_batches):
        start = index * batch_size
        stop = start + batch_size
        input_rows = input_data[start:stop]
        label_rows = label_data[start:stop]
        width = max(len(row) for row in input_rows)
        yield _pad(input_rows, width), _pad(label_rows, width)
        
# Smoke test: draw a single batch of 8 samples and inspect the padded id matrices.
batch = get_batch(input_num, label_num, batch_size=8)
input_batch, label_batch = next(batch)

print(input_batch, label_batch, sep='\n')

[[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  11  16  17
   18   2   1  19  20  21  22  23  24  25  26  27   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [ 28  29  30  31  32  11  33  34  35  36  37  38  39  40  41  36  42  43
   44  45  46  47  48  49  50  51  49  52  11  53   2  54  20   0   0   0
    0   0   0   0   0   0   0]
 [ 55  56  57  58  59  60  61  62  63  64  15  65  66  67  59  67  68  69
   11  70  71  49  72  73  74  75  76  77  59  78  79  42   0   0   0   0
    0   0   0   0   0   0   0]
 [ 80  81  82   9  83  84  28  49  85  86  75  87  88  89  39  56  90  11
   91  92  93  92  94  95  92  96  97  98  32  99 100   0   0   0   0   0
    0   0   0   0   0   0   0]
 [101 102 103 104 105 106 107 108 109 110 111 112 113 114 106 115 116  61
  117 118 119 120 110 121  61 122 123 117 124   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [ 49  77 125 106 126 127 128  59 129 130  97 131 132 133  11 134 135   2
  136 137 138 139 140 141 142 1

In [5]:
import tensorflow as tf

In [6]:
# embedding

def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scope='embedding',
              reuse=None):
    '''
    Look up a trainable embedding vector for every id in `inputs`.

    :param inputs: Integer id tensor (int32 or int64) used as indices
                    into the lookup table.
    :param vocab_size: int. Number of rows in the lookup table.
    :param num_units: int. Size of each embedding vector.
    :param zero_pad: boolean. If True, row 0 of the table is replaced by a
                    constant zero vector, so id 0 always maps to zeros.
    :param scope: Optional scope for 'variable_scope'.
    :param reuse: Boolean, whether to reuse the weights of a previous
                layer by the same name.
    :return:
        A Tensor with one more rank than `inputs`; the size of the added
        last dimension is `num_units`.

    Example (values other than the zero row are random):

    ```
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))   # outputs[0, 0] is [0., 0.]
    ```
    '''
    with tf.variable_scope(scope, reuse=reuse):
        table = tf.get_variable(name='lookup_table',
                                shape=[vocab_size, num_units],
                                dtype=tf.float32,
                                initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            # Swap the first row for constant zeros; the remaining rows stay trainable.
            zero_row = tf.zeros(shape=[1, num_units])
            table = tf.concat([zero_row, table[1:, :]], axis=0)

        return tf.nn.embedding_lookup(table, inputs)

In [7]:

# pre-net

def prenet(inputs,
           num_units=None, 
           is_training=True,
           scope='prenet',
           reuse=None,
           dropout_rate=0.2):
    '''
    Pre-net for Encoder and Decoder: two ReLU dense layers, each followed
    by dropout.
    
    :param inputs: A 2D or 3D tensor.
    :param num_units: A list of two integers (sizes of the first and second
                    dense layer), or None. With None the sizes default to
                    [C, C // 2], where C is the static last dimension of
                    `inputs`.
    :param is_training: A python boolean; dropout is only active when True.
    :param scope: Optional scope for 'variable_scope'
    :param reuse: Boolean, whether to reuse the weights of a previous layer by
                    the same name.
    :param dropout_rate: Dropout rate applied after each dense layer.
    :return: 
        A tensor shaped like `inputs` except that its last dimension is
        num_units[1].
    :raises ValueError: if `num_units` is None and the last dimension of
        `inputs` is not statically known.
    '''

    if num_units is None:
        # The signature advertises None as a valid default, but indexing None
        # below would raise TypeError, so derive the sizes from the input depth.
        depth = inputs.get_shape().as_list()[-1]
        if depth is None:
            raise ValueError('num_units must be given when the last dimension '
                             'of inputs is unknown')
        num_units = [depth, depth // 2]

    with tf.variable_scope(scope, reuse=reuse):
        outputs = tf.layers.dense(inputs, units=num_units[0], 
                                  activation=tf.nn.relu, name='dense1')
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, 
                                    training=is_training, name='dropout1')
        outputs = tf.layers.dense(outputs, units=num_units[1],
                                  activation=tf.nn.relu, name='dense2')
        outputs = tf.layers.dropout(outputs, rate=dropout_rate,
                                    training=is_training, name='dropout2')
        
    return outputs      # (N, ..., num_units[1])

In [8]:

# conv1d 

def conv1d(inputs,
           filters=None,
           kernel_size=1,
           rate=1,
           padding='SAME',
           use_bias=False,
           activation_fn=None,
           scope='conv1d',
           reuse=None):
    '''
    1-D convolution over the time axis, with optional causal padding.
    
    :param inputs: A 3D tensor with shape of [batch, time, depth]
    :param filters: An int, number of outputs (activation maps). If None,
                    the static depth of `inputs` is used.
    :param kernel_size: An int. Filter size.
    :param rate: Dilation rate.
    :param padding: Either 'same' or 'valid' or 'causal' (case-insensitive).
    :param use_bias: A boolean.
    :param activation_fn: Activation applied to the output, or None.
    :param scope: Optional scope for 'variable_scope'
    :param reuse: Boolean. whether to reuse the weights of a previous
                    layer by the same name.
    :return: 
        A 3D tensor whose last dimension is `filters`.
    '''
    
    with tf.variable_scope(scope):
        if padding.lower() == 'causal':
            # Left-pad the time axis so each output step only sees current
            # and past inputs, then run an unpadded ('valid') convolution.
            pad_len = (kernel_size - 1) * rate  # padding size
            inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
            padding = 'valid'
            
        if filters is None:
            # `as_list` is a method: it must be called before indexing.
            filters = inputs.get_shape().as_list()[-1]
            
        # `reuse` is forwarded to the layer itself rather than to the
        # enclosing variable scope.
        params = {'inputs': inputs, 'filters': filters, 
                  'kernel_size': kernel_size,
                  'dilation_rate': rate, 'padding': padding, 
                  'activation': activation_fn, 'use_bias': use_bias,
                  'reuse': reuse}
        
        outputs = tf.layers.conv1d(**params)
        
    return outputs

In [9]:

def normalize(inputs,
              decay=0.99,
              epsilon=1e-8,
              is_training=True,
              activation_fn=None,
              reuse=None,
              scope='normalize'):
    '''
    Applies batch normalization via `tf.contrib.layers.batch_norm`.

    A dummy axis is inserted at position 1 before the call and squeezed out
    afterwards, so that the fused implementation receives one extra rank.
    NOTE(review): this presumably assumes 3D `[N, T, C]` inputs (giving the
    rank-4 tensor the fused kernel expects) - confirm before passing tensors
    of another rank.

    :param inputs: A tensor whose first dimension is `batch_size`.
    :param decay: Decay for the moving average. Reasonable values for `decay` are close
        to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
        Lower `decay` value (recommend trying `decay`=0.9) if model experiences
        reasonably good training performance but poor validation and/or test
        performance.
    :param epsilon: Accepted for interface compatibility but currently NOT
        forwarded to `batch_norm`, which therefore uses its own default.
    :param is_training: Whether or not the layer is in training mode.
    :param activation_fn: Optional activation applied to the normalized output.
    :param reuse: Whether to reuse the batch-norm variables of `scope`.
    :param scope: Optional scope for `variable_scope`.

    :return: 
            A tensor with the same shape and data dtype as `inputs`.
    '''

    # Fused batch norm is much faster but works on rank-4 (NHWC) tensors,
    # hence the temporary extra axis.
    expanded = tf.expand_dims(inputs, axis=1)
    outputs = tf.contrib.layers.batch_norm(inputs=expanded,
                                           decay=decay,
                                           center=True,
                                           scale=True,
                                           updates_collections=None,
                                           is_training=is_training,
                                           scope=scope,
                                           zero_debias_moving_mean=True,
                                           fused=True,
                                           reuse=reuse)
    outputs = tf.squeeze(outputs, axis=1)
    
    if activation_fn:
        outputs = activation_fn(outputs)
        
    return outputs

In [10]:

def conv1d_bank(inputs,
                num_units=None,
                K =16,
                is_training=True,
                scope='conv1d_banks',
                reuse=None):
    '''
    Runs K parallel 1-D convolutions (kernel widths 1..K) over the same
    input, concatenates them on the channel axis, then applies batch
    normalization followed by ReLU.

    N: batch size
    T: time steps
    C: embedding hidden units

    :param inputs: A 3d tensor with shape of [N, T, C]
    :param num_units: An int. Filters per branch for widths 2..K; the
            width-1 branch uses `num_units // 2` filters.
    :param K: An int. The size of conv1d banks, i.e. the largest kernel width.
    :param is_training: A boolean, forwarded to `normalize`.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the variables of `scope`.
    :return:
        A 3d tensor with shape of [N, T, num_units // 2 + (K - 1) * num_units].

    NOTE(review): the width-1 branch is half as deep as the other branches,
    so the output depth is not K * num_units. This looks unintended, but
    changing it would alter the architecture and invalidate saved
    checkpoints - confirm before touching it.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Width-1 branch lives directly in the bank scope.
        branches = [conv1d(inputs, num_units // 2, 1)]
        # Wider branches each get their own `num_<k>` sub-scope.
        for k in range(2, K + 1):
            with tf.variable_scope(f'num_{k}'):
                branches.append(conv1d(inputs, num_units, k))

        stacked = tf.concat(branches, -1)
        return normalize(stacked, is_training=is_training,
                         activation_fn=tf.nn.relu)

In [11]:

def lstm(inputs, 
         num_units=None, 
         bidirection=False, 
         seqlen=None, 
         scope='lstm',
         reuse=None):
    '''
    Applies a (optionally bidirectional) LSTM over the time axis.
    
    :param inputs: A 3d tensor with shape of [N, T, C].
    :param num_units: An int. The number of hidden units. If None, the
                        static last dimension of `inputs` is used.
    :param bidirection: A boolean. If True, bidirectional results
                        are concatenated.
    :param seqlen: Optional per-example sequence lengths, forwarded as
                        `sequence_length` to the dynamic RNN.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the weights of a previous layer
                by the same name.
    :return: 
        If bidirection is True, a 3d tensor with shape 
        of [N, T, 2*num_units], otherwise [N, T, num_units].
    '''
    with tf.variable_scope(scope, reuse=reuse):
        if num_units is None:
            # `as_list` is a method: it must be called before indexing.
            num_units = inputs.get_shape().as_list()[-1]
            
        cell = tf.nn.rnn_cell.LSTMCell(num_units)
        if bidirection:
            cell_bw = tf.nn.rnn_cell.LSTMCell(num_units)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw,
                                                         inputs, 
                                                         sequence_length=seqlen,
                                                         dtype=tf.float32)
            # `outputs` is a (forward, backward) pair; join on the feature axis.
            outputs = tf.concat(outputs, 2)
        else:
            outputs, _ = tf.nn.dynamic_rnn(cell, inputs,
                                           sequence_length=seqlen,
                                           dtype=tf.float32)
            
    return outputs

In [12]:
def highwaynet(inputs, 
               num_units=None, 
               scope='highwaynet', 
               reuse=None):
    '''
    One highway layer: a sigmoid gate blends a ReLU transform of the input
    with the untouched input.

    :param inputs: A 3D tensor of shape [N, T, W].
    :param num_units: An int or `None`. Number of units of the transform and
                    gate layers; falls back to the last dimension of
                    `inputs` when falsy.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
    :return:
            A 3D tensor of shape [N, T, W].
    '''
    if not num_units:
        num_units = inputs.get_shape()[-1]

    with tf.variable_scope(scope, reuse=reuse):
        transformed = tf.layers.dense(inputs, units=num_units,
                                      activation=tf.nn.relu, name='dense1')
        # The gate bias starts at -1, so the sigmoid initially favours
        # carrying the input through.
        gate = tf.layers.dense(inputs, units=num_units,
                               activation=tf.nn.sigmoid,
                               bias_initializer=tf.constant_initializer(-1.0),
                               name='dense2')
        carry = 1. - gate

        return transformed * gate + inputs * carry

In [13]:

class CBHG():
    '''
    Builds the pinyin-to-character model graph.

    Pipeline: embedding -> pre-net -> conv1d bank -> max pooling ->
    two conv1d projections with a residual connection -> highway layers ->
    bidirectional LSTM -> dense readout over the character vocabulary.

    Attributes `x` / `y` are the id placeholders, `outputs` the logits and
    `preds` the argmax ids. Loss, accuracy, optimizer and summary attributes
    only exist when `arg.is_training` is true.
    '''
    
    def __init__(self, arg):
        '''
        :param arg: Hyper-parameter object providing `pny_size`, `word_size`,
                    `embed_size`, `is_training`, `num_highwaynet_blocks`,
                    `encoder_num_banks` and `lr`.
        '''
        # Every instance starts from an empty default graph; anything built
        # in the default graph before this call is discarded.
        tf.reset_default_graph()
        self.pny_size = arg.pny_size
        self.word_size = arg.word_size
        self.embed_size = arg.embed_size
        self.is_training = arg.is_training
        self.num_highwaynet_blocks = arg.num_highwaynet_blocks
        self.encoder_num_banks = arg.encoder_num_banks
        self.lr = arg.lr
        
        # Both placeholders are 2D id matrices with unconstrained static dims.
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.int32, shape=(None, None))
        
        # character Embedding for x
        enc = embedding(self.x, self.pny_size, 
                        self.embed_size, scope='emb_x')
        
        # Encoder pre-net
        prenet_out = prenet(enc,
                            num_units=[self.embed_size, self.embed_size//2],
                            is_training=self.is_training) # (N, T, E/2)
        
        # Encoder CBHG
        # Conv1D bank
        # NOTE(review): conv1d_bank gives its width-1 branch num_units // 2
        # filters, so the depth here is E/4 + (K-1)*E/2 rather than K*E/2.
        enc = conv1d_bank(prenet_out, 
                          K=self.encoder_num_banks,
                          num_units=self.embed_size//2,
                          is_training=self.is_training) # (N, T, E/4 + (K-1)*E/2)
        
        # Max pooling (pool size 2, stride 1, 'same' padding)
        enc = tf.layers.max_pooling1d(enc, 2, 1, padding='same')
        
        # Conv1D projections
        enc = conv1d(enc, self.embed_size//2, 5, scope='conv1d_1') # (N, T, E/2)
        enc = normalize(enc, is_training=self.is_training,
                        activation_fn=tf.nn.relu, scope='norm1')
        enc = conv1d(enc, self.embed_size//2, 5, scope='conv1d_2') # (N, T, E/2)
        enc = normalize(enc, is_training=self.is_training,
                        activation_fn=None, scope='norm2')
        enc += prenet_out  # (N, T, E/2) # residual connections
        
        # Highway Nets
        for i in range(self.num_highwaynet_blocks):
            enc = highwaynet(enc, num_units=self.embed_size//2,
                             scope=f'highwaynet_{i}') # (N, T, E/2)
            
        # bidirectional lstm; no sequence lengths are passed, so padded
        # positions are processed as well
        enc = lstm(enc, self.embed_size//2, True, scope='lstm1')
        
        # Readout: per-step logits over the character vocabulary
        self.outputs = tf.layers.dense(enc, self.word_size, use_bias=False)
        self.preds = tf.to_int32(tf.argmax(self.outputs, axis=-1))
        
        if self.is_training:
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                                       logits=self.outputs)
            # Positions whose label id is 0 are excluded from loss and accuracy.
            self.istarget = tf.to_float(tf.not_equal(self.y, tf.zeros_like(self.y))) # masking
            self.hits = tf.to_float(tf.equal(self.preds, self.y)) * self.istarget
            self.acc = tf.reduce_sum(self.hits) / tf.reduce_sum(self.istarget)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)
            
            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
            
            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            tf.summary.scalar('acc', self.acc)
            self.merged = tf.summary.merge_all()

In [14]:

def create_hparams():
    '''
    Return the default hyper-parameters of the model.

    The two vocabulary sizes are placeholders only; callers are expected to
    overwrite them with the real vocabulary sizes.
    '''
    return tf.contrib.training.HParams(
        # vocab
        pny_size=50,
        word_size=50,
        # embedding size
        embed_size=300,
        num_highwaynet_blocks=4,
        encoder_num_banks=8,
        lr=0.001,
        is_training=True,
    )

# Start from the defaults, then plug in the real vocabulary sizes.
arg = create_hparams()
arg.pny_size, arg.word_size = len(pny2id), len(word2id)


In [15]:

import os
import time

start = time.time()

epochs = 25
batch_size = 16
# One checkpoint prefix for the existence check, restore and save. The
# inference cell restores from this same location.
ckpt_path = '../logs/model'

g = CBHG(arg)

saver = tf.train.Saver()

with tf.Session() as sess:
    # The model already builds the merged summary op; no second merge needed.
    merged = g.merged
    sess.run(tf.global_variables_initializer())
    # Resume from a previous run only if its checkpoint is really there.
    if os.path.exists(ckpt_path + '.meta'):
        saver.restore(sess, ckpt_path)

    writer = tf.summary.FileWriter('tensorboard/lm', tf.get_default_graph())
    try:
        # get_batch only yields full batches, so this is the per-epoch step count.
        batch_num = len(input_num) // batch_size
        for k in range(epochs):
            total_loss = 0
            batch = get_batch(input_num, label_num, batch_size)
            for i, (input_batch, label_batch) in enumerate(batch):
                feed = {g.x: input_batch, g.y: label_batch}
                cost, _ = sess.run([g.mean_loss, g.train_op], feed_dict=feed)
                total_loss += cost
                step = k * batch_num + i
                # Log a summary every 10th global step.
                if step % 10 == 0:
                    rs = sess.run(merged, feed_dict=feed)
                    writer.add_summary(rs, step)

            # max(..., 1) avoids ZeroDivisionError when the corpus is smaller
            # than a single batch.
            print('epochs', k+1, ':average loss = ', total_loss/max(batch_num, 1))

        # Make sure the target directory exists before writing the checkpoint.
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        saver.save(sess, ckpt_path)
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

end = time.time()
print(f'训练时间:{(end-start)/60:.2f}')

epochs 1 :average loss =  1.0292338697305523
epochs 2 :average loss =  0.4339743798907101
epochs 3 :average loss =  0.32146026409301703
epochs 4 :average loss =  0.2568893333812958
epochs 5 :average loss =  0.2165330571583802
epochs 6 :average loss =  0.18923001680379892
epochs 7 :average loss =  0.16972323978510528
epochs 8 :average loss =  0.15588265168689705
epochs 9 :average loss =  0.14461949134341714
epochs 10 :average loss =  0.1374188847327826
epochs 11 :average loss =  0.13156710992185136
epochs 12 :average loss =  0.12802302987271877
epochs 13 :average loss =  0.12582620804804512
epochs 14 :average loss =  0.1233449401968287
epochs 15 :average loss =  0.12516787370735658
epochs 16 :average loss =  0.12656381167512273
epochs 17 :average loss =  0.13058180372398157
epochs 18 :average loss =  0.13388134208514327
epochs 19 :average loss =  0.14362116484162576
epochs 20 :average loss =  0.15529048121328284
epochs 21 :average loss =  0.1701302073367387
epochs 22 :average loss =  0.

In [16]:

# Rebuild the graph in inference mode (no dropout, no loss/optimizer ops).
arg.is_training = False

g = CBHG(arg)

# Token -> id table, built once instead of scanning the list per syllable.
pny_lookup = {pny: idx for idx, pny in enumerate(pny2id)}

saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, '../logs/model')
    while True:
        line = input('输入测试拼音: ')
        if line.strip() == 'exit': break
        # split() without arguments tolerates repeated or surrounding spaces.
        line = line.split()
        unknown = [pny for pny in line if pny not in pny_lookup]
        if not line or unknown:
            # Empty input or out-of-vocabulary syllables used to raise
            # ValueError and end the session; report and prompt again.
            print(f'无法识别的拼音: {" ".join(unknown)}')
            continue
        x = np.array([pny_lookup[pny] for pny in line])
        x = x.reshape(1, -1)
        preds = sess.run(g.preds, {g.x: x})
        result = ''.join(word2id[idx] for idx in preds[0])
        print(result)

INFO:tensorflow:Restoring parameters from ../logs/model
赛距为职业运动员最长二十四千米业馀运动员最长二十一千米青年运动员最长十五千米
绿是阳春烟景大快文章的底色四月的林峦更是绿得鲜活秀媚诗艺盎然
今天天气不错
天气很热
模行推断
输入测试拼音
