In [1]:
from tqdm import tqdm

# Load the corpus. Each line is split on tabs into exactly three fields
# (key, pinyin, characters); the pinyin and character fields are themselves
# space-separated token sequences.
with open('../data/zh.tsv', 'r', encoding='utf-8') as f:
    data = f.readlines()

# Parse after the file has been closed; nothing below needs the handle.
inputs = []  # pinyin token sequences (model inputs)
labels = []  # character token sequences (model targets)
for line in data:
    # Skip blank lines (e.g. a trailing empty line) instead of failing
    # on the three-way unpacking below.
    if not line.strip():
        continue
    key, pny, word = line.split('\t')
    inputs.append(pny.split(' '))
    labels.append(word.strip('\n').split(' '))

# Preview the first samples. Slicing (rather than indexing 0..9) keeps this
# safe for corpora with fewer than 10 lines.
for pny_seq, word_seq in zip(inputs[:10], labels[:10]):
    print(pny_seq)
    print(word_seq)
    print()

['lv4', 'shi4', 'yang2', 'chun1', 'yan1', 'jing3', 'da4', 'kuai4', 'wen2', 'zhang1', 'de', 'di3', 'se4', 'si4', 'yue4', 'de', 'lin2', 'luan2', 'geng4', 'shi4', 'lv4', 'de2', 'xian1', 'huo2', 'xiu4', 'mei4', 'shi1', 'yi4', 'ang4', 'ran2']
['绿', '是', '阳', '春', '烟', '景', '大', '块', '文', '章', '的', '底', '色', '四', '月', '的', '林', '峦', '更', '是', '绿', '得', '鲜', '活', '秀', '媚', '诗', '意', '盎', '然']

['ta1', 'jin3', 'ping2', 'yao1', 'bu4', 'de', 'li4', 'liang4', 'zai4', 'yong3', 'dao4', 'shang4', 'xia4', 'fan1', 'teng2', 'yong3', 'dong4', 'she2', 'xing2', 'zhuang4', 'ru2', 'hai3', 'tun2', 'yi1', 'zhi2', 'yi3', 'yi1', 'tou2', 'de', 'you1', 'shi4', 'ling3', 'xian1']
['他', '仅', '凭', '腰', '部', '的', '力', '量', '在', '泳', '道', '上', '下', '翻', '腾', '蛹', '动', '蛇', '行', '状', '如', '海', '豚', '一', '直', '以', '一', '头', '的', '优', '势', '领', '先']

['pao4', 'yan3', 'da3', 'hao3', 'le', 'zha4', 'yao4', 'zen3', 'me', 'zhuang1', 'yue4', 'zheng4', 'cai2', 'yao3', 'le', 'yao3', 'ya2', 'shu1', 'de', 'tuo1', 'qu4', 'yi1', 'fu2

In [2]:
def get_vocab(data):
    '''
    Builds a vocabulary from tokenised sequences.

    :param data: An iterable of token sequences. Tokens must be hashable
                 (they are strings in this notebook).
    :return:
        A list of the distinct tokens in order of first appearance, with
        '<PAD>' fixed at index 0. A token's id is its index in this list.
    '''
    vocab = ['<PAD>']
    # Set mirror of `vocab`: membership tests are O(1) instead of scanning
    # the whole list once per token.
    seen = {'<PAD>'}
    for line in data:
        for char in line:
            if char not in seen:
                seen.add(char)
                vocab.append(char)

    return vocab

# Build one vocabulary per side of the corpus.
# NOTE: despite the "2id" names these are lists; a token's id is its position.
pny2id, word2id = (get_vocab(corpus) for corpus in (inputs, labels))

print(f'拼音词表长度为: {len(pny2id)}')
print(f'文字词表长度为: {len(word2id)}')

# Peek at the first ten entries of each vocabulary.
print(pny2id[:10], word2id[:10], sep='\n')

拼音词表长度为: 1152
文字词表长度为: 4460
['<PAD>', 'lv4', 'shi4', 'yang2', 'chun1', 'yan1', 'jing3', 'da4', 'kuai4', 'wen2']
['<PAD>', '绿', '是', '阳', '春', '烟', '景', '大', '块', '文']


In [4]:
# Convert tokens to integer ids. A token's id is its position in the
# vocabulary list, so build token -> id dictionaries once: every lookup is
# then O(1), whereas list.index() rescans the vocabulary for each token.
# (Vocabulary entries are unique by construction in get_vocab, so the
# dictionary yields the same ids as list.index().)
pny_index = {pny: idx for idx, pny in enumerate(pny2id)}
word_index = {word: idx for idx, word in enumerate(word2id)}

input_num = [[pny_index[pny] for pny in line] for line in inputs]
label_num = [[word_index[word] for word in line] for line in labels]

In [5]:
import numpy as np

def get_batch(input_data, label_data, batch_size):
    '''
    Yields zero-padded mini-batches of id sequences.

    :param input_data: A list of input id sequences (lists of ints).
    :param label_data: A list of label id sequences, aligned index-by-index
                       with 'input_data'.
    :param batch_size: int. Number of samples per batch.
    :return:
        A generator of (input_batch, label_batch) pairs of 2d numpy arrays,
        both right-padded with 0 to the same width. Samples left over after
        the last complete batch are dropped.
    '''
    batch_num = len(input_data) // batch_size
    for k in range(batch_num):
        begin = k * batch_size
        end = begin + batch_size
        input_batch = input_data[begin: end]
        label_batch = label_data[begin: end]
        # Pad to the longest sequence on EITHER side. Using the inputs alone
        # gives a negative pad count (i.e. no padding) for any label longer
        # than every input, which produces a ragged array.
        max_len = max(max((len(line) for line in input_batch), default=0),
                      max((len(line) for line in label_batch), default=0))
        input_batch = np.array([line + [0] * (max_len-len(line)) for line in input_batch])
        label_batch = np.array([line + [0] * (max_len-len(line)) for line in label_batch])
        yield input_batch, label_batch
        
# Smoke-test the generator: pull a single batch of 8 samples and print the
# padded input and label id matrices.
batch = get_batch(input_data=input_num, label_data=label_num, batch_size=8)
input_batch, label_batch = next(batch)

print(input_batch, label_batch, sep='\n')

[[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  11  16  17
   18   2   1  19  20  21  22  23  24  25  26  27   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [ 28  29  30  31  32  11  33  34  35  36  37  38  39  40  41  36  42  43
   44  45  46  47  48  49  50  51  49  52  11  53   2  54  20   0   0   0
    0   0   0   0   0   0   0]
 [ 55  56  57  58  59  60  61  62  63  64  15  65  66  67  59  67  68  69
   11  70  71  49  72  73  74  75  76  77  59  78  79  42   0   0   0   0
    0   0   0   0   0   0   0]
 [ 80  81  82   9  83  84  28  49  85  86  75  87  88  89  39  56  90  11
   91  92  93  92  94  95  92  96  97  98  32  99 100   0   0   0   0   0
    0   0   0   0   0   0   0]
 [101 102 103 104 105 106 107 108 109 110 111 112 113 114 106 115 116  61
  117 118 119 120 110 121  61 122 123 117 124   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [ 49  77 125 106 126 127 128  59 129 130  97 131 132 133  11 134 135   2
  136 137 138 139 140 141 142 1

In [6]:
import tensorflow as tf


In [7]:

# layer normalization

def layer_normalization(inputs,
              epsilon=1e-8,
              scope='ln',
              reuse=None):
    '''
    Normalizes 'inputs' over the last dimension, then applies a learned
    per-feature scale (gamma) and shift (beta).

    :param inputs: A tensor with 2 or more dimensions,
                    where the first dimension has 'batch_size'
    :param epsilon: A floating number added to the variance so that
                    the division can never hit zero
    :param scope: Optional scope for 'variable_scope'
    :param reuse: Boolean, forwarded to 'variable_scope'. NOTE: beta and
                    gamma are created with 'tf.Variable', which always
                    creates fresh variables, so this flag does not make
                    two calls share their parameters.
    :return:
        A tensor with the same shape and data dtype as 'inputs'
    '''

    with tf.variable_scope(scope, reuse=reuse):
        # gamma/beta hold one value per feature, i.e. per entry of the last axis
        feature_shape = inputs.get_shape()[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)

        # beta is created before gamma; keep this order, because the
        # auto-generated variable names depend on creation order.
        beta = tf.Variable(tf.zeros(feature_shape))
        gamma = tf.Variable(tf.ones(feature_shape))

        centered = inputs - mean
        stddev = (variance + epsilon) ** 0.5
        outputs = gamma * (centered / stddev) + beta

    return outputs


In [8]:

# embedding

def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              scope='embedding',
              reuse=None):

    '''
    Looks up a trainable embedding vector for every id in 'inputs'.

    :param inputs: A 'Tensor' with type 'int32' or 'int64' containing
                    the ids to be looked up in the lookup table.
    :param vocab_size: int. Number of rows of the lookup table.
    :param num_units: int. Size of each embedding vector.
    :param zero_pad: boolean. If true, row 0 of the table (id 0) is
                    replaced by constant zeros.
    :param scale: boolean. If true, the outputs are multiplied by
                    sqrt(num_units).
    :param scope: Optional scope for 'variable_scope'
    :param reuse: Boolean, whether to reuse the weights of a previous
                layer by the same name.
    :return:
        A Tensor with one more rank than 'inputs'; the size of the last
        dimension is 'num_units'.

        For example,

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]
     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]
    ```
    '''

    with tf.variable_scope(scope, reuse=reuse):
        table = tf.get_variable('lookup_table',
                                shape=[vocab_size, num_units],
                                dtype=tf.float32,
                                initializer=tf.contrib.layers.xavier_initializer())

        if zero_pad:
            # Swap row 0 for constant zeros so id 0 always embeds to the
            # zero vector.
            pad_row = tf.zeros(shape=[1, num_units])
            table = tf.concat((pad_row, table[1:, :]), 0)

        outputs = tf.nn.embedding_lookup(table, inputs)

        if scale:
            outputs = outputs * (num_units ** 0.5)

    return outputs


In [9]:

# multi-head attention

def multihead_attention(emb,
                        queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope='multihead_attention',
                        reuse=None):

    '''
    Applies multihead attention, followed by a residual connection and
    layer normalization.

    :param emb: A 3d tensor used only to derive the padding masks: a
                position whose values sum to zero over the last axis is
                treated as padding. The same mask is used for keys and
                queries, so this presumably expects self-attention where
                emb, queries and keys share their first two dimensions.
    :param queries: A 3d tensor with shape of [N, T_q, C_q].
    :param keys: A 3d tensor with shape of [N, T_k, C_k].
    :param num_units: A scalar. Attention size. If None, the size of the
                        last dimension of 'queries' is used. Must match
                        C_q for the residual connection to be valid.
    :param num_heads: int. Number of heads.
    :param dropout_rate: A floating point number.
    :param is_training: Boolean. Controller of mechanism for dropout.
    :param causality: boolean. If true, units that reference the future
                        are masked.
    :param scope: Optional scope for 'variable_scope'
    :param reuse: boolean, whether to reuse the weights of a previous layer
                    by the same name.
    :return:
            A 3d tensor with shape of (N, T_q, C)
    '''

    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            # as_list is a method: it must be called before indexing
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)

        # Split and concat
        Q_concat = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h)
        K_concat = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)
        V_concat = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)

        # Multiplication
        attention = tf.matmul(Q_concat, tf.transpose(K_concat, [0, 2, 1])) # (h*N, T_q, T_k)

        # Scale
        attention = attention / (K_concat.get_shape().as_list()[-1] ** 0.5)

        # Key Masking: padded key positions get a very large negative score
        # so that softmax assigns them (almost) zero weight
        key_masks = tf.sign(tf.abs(tf.reduce_sum(emb, axis=-1))) # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_K)

        paddings = tf.ones_like(attention) * (-2**32+1)
        attention = tf.where(tf.equal(key_masks, 0), paddings, attention) # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(attention[0, :, :]) # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(attention)[0], 1, 1]) # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
            attention = tf.where(tf.equal(masks, 0), paddings, attention) # (h*N, T_q, T_k)

        # Activation
        attention = tf.nn.softmax(attention) # (h*N, T_q, T_k)

        # Query Masking: zero out the attention rows of padded query positions
        query_masks = tf.sign(tf.abs(tf.reduce_sum(emb, axis=-1))) # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
        attention *= query_masks # element-wise. (h*N, T_q, T_k)

        # Dropouts
        attention = tf.layers.dropout(attention, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(attention, V_concat) # (h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalization
        outputs = layer_normalization(outputs) # (N, T_q, C)

    return outputs


In [10]:

def feedforward(inputs,
                num_units=[2048, 512],
                scope='multihead_attention',
                reuse=None):

    '''
    Position-wise feed forward block: two kernel-size-1 convolutions
    (ReLU after the first), then a residual connection and layer
    normalization.

    :param inputs: A 3d tensor with shape of [N, T, C].
    :param num_units: A list of two integers: the inner layer size and
                the output size. The output size must equal C for the
                residual connection to be valid.
    :param scope: Optional scope for 'variable_scope'. NOTE: the default
                is 'multihead_attention', and Graph calls this function
                without a scope, so the variable names saved in
                checkpoints depend on this default.
    :param reuse: Boolean, whether to reuse the weights
                of a previous layer by the same name.
    :return:
    A 3d tensor with the same shape and dtype as inputs
    '''

    with tf.variable_scope(scope, reuse=reuse):
        inner_dim = num_units[0]
        output_dim = num_units[1]

        # Inner layer: expand every position independently
        hidden = tf.layers.conv1d(inputs=inputs,
                                  filters=inner_dim,
                                  kernel_size=1,
                                  activation=tf.nn.relu,
                                  use_bias=True)

        # Readout layer: linear projection to the output size
        projected = tf.layers.conv1d(inputs=hidden,
                                     filters=output_dim,
                                     kernel_size=1,
                                     activation=None,
                                     use_bias=True)

        # Residual connection followed by normalization
        outputs = layer_normalization(projected + inputs)

    return outputs
    

In [11]:

def label_smoothing(inputs, epsilon=0.1):
    '''
    Smooths one-hot targets: every entry is scaled by (1 - epsilon) and
    then epsilon / V is added to it, where V is the size of the last axis.

    :param inputs: A 3d tensor with shape of [N, T, V],
                where V is the number of vocabulary
    :param epsilon: Smoothing rate.
    :return:
        A tensor of the same shape holding the smoothed targets.

    For example,

    ```
    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]],
      [[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]]], tf.float32)

    outputs = label_smoothing(inputs)

    with tf.Session() as sess:
        print(sess.run([outputs]))

    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
        [ 0.03333334,  0.93333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334]],
       [[ 0.93333334,  0.03333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334],
        [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
    ```
    '''

    # Size of the last axis, read from the static shape
    num_classes = inputs.get_shape().as_list()[-1]

    kept_mass = 1 - epsilon
    uniform_share = epsilon / num_classes

    return (kept_mass * inputs) + uniform_share


In [12]:

class Graph():
    '''
    Builds the model graph in the (freshly reset) default TensorFlow graph:
    token + position embeddings, a stack of self-attention blocks, one
    feed forward block and a linear projection onto the label vocabulary.

    Attributes always available: x, y (placeholders), logits, preds, acc.
    Attributes available only when the training ops are built:
    loss, mean_loss, global_step, optimizer, train_op, merged.
    '''
    def __init__(self, arg, is_training=True):
        '''
        :param arg: Hyper-parameter object exposing is_training,
                    hidden_units, input_vocab_size, label_vocab_size,
                    num_heads, num_blocks, max_length, lr, dropout_rate.
        :param is_training: Boolean. Set to False to skip building the
                    loss/optimizer even if 'arg.is_training' is true. The
                    training ops are built only when both this flag and
                    'arg.is_training' are true, so an inference graph
                    (arg.is_training=False) no longer creates an unused
                    loss and optimizer.
        '''
        # NOTE: this discards whatever was in the default graph before.
        tf.reset_default_graph()
        self.is_training = arg.is_training
        self.hidden_units = arg.hidden_units
        self.input_vocab_size = arg.input_vocab_size
        self.label_vocab_size = arg.label_vocab_size
        self.num_heads = arg.num_heads
        self.num_blocks = arg.num_blocks
        self.max_length = arg.max_length
        self.lr = arg.lr
        self.dropout_rate = arg.dropout_rate

        # input: id matrices of shape (batch, time); id 0 is padding
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.int32, shape=(None, None))

        # embedding
        self.emb = embedding(self.x, 
                             vocab_size=self.input_vocab_size,
                             num_units=self.hidden_units,
                             scale=True, scope='enc_embed')

        # Learned position embedding: positions 0..T-1 are looked up in a
        # table with 'max_length' rows, so sequences must not be longer
        # than 'max_length'.
        self.enc = self.emb + embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                        vocab_size=self.max_length, num_units=self.hidden_units,
                                        zero_pad=False, scale=False, scope='enc_pe')

        # Dropout
        self.enc = tf.layers.dropout(self.enc,
                                     rate=self.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        # Blocks
        for i in range(self.num_blocks):
            with tf.variable_scope(f'num_blocks_{i}'):
                # Multihead Attention (self.emb supplies the padding masks)
                self.enc = multihead_attention(emb=self.emb,
                                               queries=self.enc,
                                               keys=self.enc,
                                               num_units=self.hidden_units,
                                               num_heads=self.num_heads,
                                               dropout_rate=self.dropout_rate,
                                               is_training=self.is_training,
                                               causality=False)

        # Feed Forward
        # NOTE(review): applied once after all attention blocks rather than
        # inside each block - confirm this is intended. Moving it would
        # change the architecture and invalidate existing checkpoints.
        self.outputs = feedforward(self.enc, num_units=[4*self.hidden_units, self.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.outputs, self.label_vocab_size)
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        # Mask out the padded positions (label id 0)
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / (tf.reduce_sum(self.istarget))
        tf.summary.scalar('acc', self.acc)

        # Honour the hyper-parameter flag as well as the constructor
        # argument: the argument alone defaults to True and would build the
        # training ops even for arg.is_training=False.
        if is_training and self.is_training:
            # Loss: cross entropy against smoothed one-hot labels,
            # averaged over the non-padded positions
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=self.label_vocab_size))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
        

In [13]:

def create_hparams():
    '''
    Returns the default hyper-parameters as a 'tf.contrib.training.HParams'.

    The vocabulary sizes are placeholders; callers overwrite them with the
    sizes of the real vocabularies.
    '''
    defaults = dict(
        num_heads=8,
        num_blocks=6,
        input_vocab_size=50,
        label_vocab_size=50,
        max_length=100,
        hidden_units=512,
        dropout_rate=0.2,
        lr=0.0003,
        is_training=True,
    )

    return tf.contrib.training.HParams(**defaults)

arg = create_hparams()
# Replace the placeholder vocabulary sizes with those of the real vocabularies.
arg.input_vocab_size, arg.label_vocab_size = len(pny2id), len(word2id)

In [14]:

import os
import time

start = time.time()

epochs = 50
batch_size = 64

g = Graph(arg)

saver = tf.train.Saver()

# Make sure the checkpoint directory exists before training starts, so a
# long run cannot be lost at the final save because the folder is missing.
os.makedirs('../logs', exist_ok=True)

with tf.Session() as sess:
    merged = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())
    # Resume from an earlier checkpoint if there is one
    if os.path.exists('../logs/model.meta'):
        saver.restore(sess, '../logs/model')

    writer = tf.summary.FileWriter('../tensorboard/lm', tf.get_default_graph())
    try:
        for k in range(epochs):
            total_loss = 0
            batch_num = len(input_num) // batch_size
            batch = get_batch(input_num, label_num, batch_size)
            # get_batch yields exactly batch_num batches
            for i, (input_batch, label_batch) in enumerate(batch):
                feed = {g.x: input_batch, g.y: label_batch}
                step = k * batch_num + i
                if step % 10 == 0:
                    # Fetch the summaries in the same run as the training
                    # step instead of paying for a second forward pass.
                    cost, _, rs = sess.run([g.mean_loss, g.train_op, merged], feed_dict=feed)
                    writer.add_summary(rs, step)
                else:
                    cost, _ = sess.run([g.mean_loss, g.train_op], feed_dict=feed)
                total_loss += cost

            # max(..., 1) avoids a ZeroDivisionError when the corpus is
            # smaller than one batch
            print('epochs', k+1, ': average loss =', total_loss / max(batch_num, 1))

        saver.save(sess, '../logs/model')
    finally:
        # Flush and close the event file even if training is interrupted
        writer.close()

print(f'训练时间为: {(time.time()-start)/60:.2f}')
    

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

epochs 1 : average loss = 1.7576312879450506
epochs 2 : average loss = 1.5597799746569696
epochs 3 : average loss = 1.509507176207393
epochs 4 : average loss = 1.4711973466739237
epochs 5 : average loss = 1.443885849163287
epochs 6 : average loss = 1.424225844267428
epochs 7 : average loss = 1.407690567253346
epochs 8 : average loss = 1.3937231061971664
epochs 9 : average loss = 1.3822234949756387
epochs 10 : average loss = 1.372285391564509
epochs 11 : average loss = 1.3636000377981192
epochs 12 : average loss = 1.3566907827935895
epochs 13 : average loss = 1.3501691292357416
epochs 14 : average loss = 1.3442635642059182
epochs 15 : average loss = 1.3388745027945486
epochs 16 : average loss = 1.3339690614071624
epochs 17 : average loss = 1.3294396614644313
epochs 18 : average loss = 1.325464725198

In [23]:

# Rebuild the graph for inference (dropout disabled) and decode pinyin typed
# by the user until 'exit' is entered.
arg.is_training = False

g = Graph(arg)

saver = tf.train.Saver()

# token -> id dictionary: O(1) lookups, and membership can be tested before
# converting (list.index() raises ValueError on an unknown token).
pny_lookup = {pny: idx for idx, pny in enumerate(pny2id)}

with tf.Session() as sess:
    saver.restore(sess, '../logs/model')
    while True:
        line = input('输入测试拼音: ')
        if line.strip() == 'exit': break
        # split() without arguments tolerates repeated, leading and
        # trailing whitespace
        line = line.split()
        if not line:
            continue
        unknown = [pny for pny in line if pny not in pny_lookup]
        if unknown:
            # Message: "unknown pinyin" - report and ask again instead of
            # crashing the session
            print(f'未知拼音: {unknown}')
            continue
        if len(line) > arg.max_length:
            # Message: "input too long" - the position embedding table has
            # only max_length rows
            print(f'输入过长: 最多支持 {arg.max_length} 个拼音')
            continue
        x = np.array([pny_lookup[pny] for pny in line])
        x = x.reshape(1, -1)
        preds = sess.run(g.preds, {g.x: x})
        result = ''.join(word2id[idx] for idx in preds[0])
        print(line)
        print(result)
        

INFO:tensorflow:Restoring parameters from ../logs/model
['zhou1', 'jie2', 'lun2']
周杰伦
['lin2', 'jun4', 'jie2']
林俊杰
['wang2', 'jun4', 'kai3']
王俊凯
['si4', 'xiao3', 'hua1', 'dan4']
四小花旦
