分层的双端 GRU 模型。title部分不变，content部分使用分层。每部分取长度为 30。只用一层的 bigru.

两部分的输出先拼接，再接 fc+bn 层；fc 层不加 dropout。

原来从 1.5e-3开始降，loss降得很快，但是到了 2e-4 开始就不降了。

现在把 _lr 的下降速度调慢一点。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import pickle
import os
import sys
import shutil
import time

# Notebook setup: load pre-trained weights, prepare output directories and
# define the hyper-parameters shared by every later cell.
time0 = time.time()
print('Starting ...')
model_name = 'p4-1-han-bigru-256'                    # model name (used in every output path)
W_embedding = np.load('../data/W_embedding.npy').astype(np.float32)            # pre-trained word vectors
_W_out = np.load('../data/W_out_bigru.npy').astype(np.float32)                 # output-layer weights saved from an earlier bigru model, used as initial value
model_path = '../ckpt/' + model_name + '/'                  # checkpoint directory
summary_path = '../summary/' + model_name + '/'             # TensorBoard summary directory
result_path = '../result/' + model_name + '.csv'            # result.csv location
scores_path = '../scores/' + model_name + '.npy'            # scores.npy location
save_path = '../data/'                                      # directory the input .npy files are read from

if not os.path.exists(model_path):
    os.makedirs(model_path)         
# From here on model_path is the checkpoint file prefix, not the directory.
model_path = model_path + 'model.ckpt'
if os.path.exists(summary_path):   # remove old summaries so runs do not overlap
    shutil.rmtree(summary_path)
os.makedirs(summary_path)          # then create the directory again

# ##################### config ######################
# title part: attention-bigru-512
title_len = n_step1 = 30                  # title length
# content part: han-bigru-512
doc_len = 10                   # sentences per document
sent_len = 30                 # words per sentence
# other parameters
input_size = embedding_size = 256       # embedding vector length
n_class = 1999
hidden_size = 256    # hidden units per GRU direction
fc_hidden_size = 1024                   # units in the fc layer
title_attention_size = sent_attention_size = doc_attention_size = hidden_size * 2
n_layer = 1        # number of bi-gru layers
max_grad_norm = 1.0  # gradients with a larger global norm are clipped
global_step = 0
valid_num = 100000   # number of samples held out for validation
seed_num = 13        # seed for the train/valid shuffle
te_batch_size = 128 
tr_batch_size = 128 
print('Prepared, costed time %g s.' % (time.time() - time0))

Starting ...
Prepared, costed time 5.7184 s.


In [2]:
import tensorflow as tf
# Create the session with on-demand GPU memory allocation.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers

'''
HAN 模型，知乎问题多标签分类。
'''

# Scalar placeholders fed on every sess.run call.
print('Building model ...')
lr = tf.placeholder(tf.float32)                # learning rate
keep_prob = tf.placeholder(tf.float32, [])     # dropout keep probability
batch_size = tf.placeholder(tf.int32, [])  # note: the dtype must be tf.int32
tst = tf.placeholder(tf.bool)                  # True selects the moving averages in batchnorm
n_updates = tf.placeholder(tf.int32)      # training iteration, passed to the bn layer
update_emas = list()   # all moving-average update ops created by the BN layers


def weight_variable(shape, name):
    """Return a tf.Variable of `shape` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)

def bias_variable(shape, name):
    """Return a tf.Variable of `shape` with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape), name=name)

def batchnorm(Ylogits, is_test, num_updates, offset, convolutional=False):
    """Batch-normalise `Ylogits` with an offset and no learned scale.

    Args:
        Ylogits: tensor to normalise. Moments are reduced over axis 0, or over
            axes [0, 1, 2] when `convolutional` is true.
        is_test: boolean tensor. When true the exponential moving averages of
            the moments are used, otherwise the moments of the current batch.
        num_updates: iteration counter handed to ExponentialMovingAverage.
        offset: beta term passed to tf.nn.batch_normalization.
        convolutional: selects the reduction axes (see `Ylogits`).
    Returns:
        A pair (normalised tensor, op that updates the moving averages). The
        update op has to be run by the caller during training.
    """
    ema = tf.train.ExponentialMovingAverage(0.999, num_updates)
    moment_axes = [0, 1, 2] if convolutional else [0]
    batch_mean, batch_var = tf.nn.moments(Ylogits, moment_axes)
    ema_update = ema.apply([batch_mean, batch_var])
    used_mean = tf.cond(is_test, lambda: ema.average(batch_mean), lambda: batch_mean)
    used_var = tf.cond(is_test, lambda: ema.average(batch_var), lambda: batch_var)
    # scale is None; 1e-5 is the variance epsilon guarding against division by zero.
    normalised = tf.nn.batch_normalization(Ylogits, used_mean, used_var, offset, None, 1e-5)
    return normalised, ema_update

def gru_cell():
    """Build one GRU cell of `hidden_size` units with dropout applied to its output."""
    with tf.variable_scope('gru_cell'):
        reuse_flag = tf.get_variable_scope().reuse
        base_cell = rnn.GRUCell(hidden_size, reuse=reuse_flag)
    return rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
  
def bi_gru(inputs, seg_num=batch_size):
    """Encode `inputs` with a stack of `n_layer` bidirectional GRU layers.

    Args:
        inputs: sequence tensor fed to stack_bidirectional_dynamic_rnn.
        seg_num: number of sequences in `inputs`. Defaults to the batch_size
            placeholder; the caller passes batch_size * doc_len when each
            document has been unrolled into its sentences.
    Returns:
        The per-step outputs of the top layer. Per the original note these are
        shaped [seg_num, max_time, hidden_size * 2].
    """
    forward_cells = []
    for _ in range(n_layer):
        forward_cells.append(gru_cell())
    backward_cells = []
    for _ in range(n_layer):
        backward_cells.append(gru_cell())
    forward_init = [cell.zero_state(seg_num, tf.float32) for cell in forward_cells]
    backward_init = [cell.zero_state(seg_num, tf.float32) for cell in backward_cells]
    rnn_result = rnn.stack_bidirectional_dynamic_rnn(
        forward_cells, backward_cells, inputs,
        initial_states_fw=forward_init,
        initial_states_bw=backward_init,
        dtype=tf.float32)
    # Only the sequence outputs are needed; the final states are dropped.
    return rnn_result[0]
    
    
def task_specific_attention(inputs, output_size,
                            initializer=layers.xavier_initializer(),
                            activation_fn=tf.tanh, scope=None):
    """Reduce a sequence to one vector with a learned attention context vector.

    Args:
        inputs: rank-3 tensor [batch_size, units, input_size]; the last
            dimension must be statically known. The `units` axis is attended
            over and removed.
        output_size: width of the projection used to score each unit.
        initializer: initializer of the attention context vector.
        activation_fn: activation of the projection layer.
        scope: optional variable scope (defaults to 'attention').
    Returns:
        Tensor [batch_size, input_size]: the attention-weighted sum of
        `inputs` (the weights are applied to the raw inputs, not to the
        projection).
    """
    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
    with tf.variable_scope(scope or 'attention') as attn_scope:
        # u_w: the context vector every projected unit is scored against.
        context_vector = tf.get_variable(name='attention_context_vector',
                                         shape=[output_size],
                                         initializer=initializer,
                                         dtype=tf.float32)
        # h_i -> u_i: [batch_size, units, input_size] -> [batch_size, units, output_size]
        projected = layers.fully_connected(inputs, output_size,
                                           activation_fn=activation_fn,
                                           scope=attn_scope)
        # Dot product with the context vector, kept as [batch_size, units, 1].
        scores = tf.reduce_sum(tf.multiply(projected, context_vector), axis=2, keep_dims=True)
        # Normalise over the units axis.
        attention_weights = tf.nn.softmax(scores, dim=1)
        tf.summary.histogram('attention_weigths', attention_weights)
        # Weight the original h_i and sum over the units axis.
        weighted_inputs = tf.multiply(inputs, attention_weights)
        return tf.reduce_sum(weighted_inputs, axis=1)


# Data placeholders: title token ids, flattened content token ids
# (doc_len sentences of sent_len tokens each) and the label matrix.
with tf.name_scope('Inputs'):
    X1_inputs = tf.placeholder(tf.int64, [None, title_len], name='X1_inputs')
    X2_inputs = tf.placeholder(tf.int64, [None, doc_len*sent_len], name='X2_inputs')
    y_inputs = tf.placeholder(tf.float32, [None, n_class], name='y_input')    

# Embedding matrix, initialised from the pre-trained W_embedding.
# NOTE(review): the original comment said this is placed on the CPU, but the
# device string below pins it to /gpu:0.
with tf.device('/gpu:0'):
    with tf.variable_scope('embedding'):
        embedding = tf.get_variable(name="W_embedding", shape=W_embedding.shape, 
                        initializer=tf.constant_initializer(W_embedding), trainable=True)   # fine-tune
    
def bigru_reference(X_inputs):
    """Title branch: embed the tokens, run a bi-GRU and pool with attention.

    Args:
        X_inputs: token-id tensor, shape (batch_size, title_len).
    Returns:
        Attention-pooled title vector, shape (batch_size, hidden_size * 2).
    """
    embedded_title = tf.nn.embedding_lookup(embedding, X_inputs)
    with tf.variable_scope('title_encoder'):  # builds the title representation
        encoded_title = bi_gru(embedded_title)
        return task_specific_attention(encoded_title, title_attention_size)
    
def han_reference(X_inputs):
    """Content branch: hierarchical attention network (words -> sentences -> document).

    Args:
        X_inputs: token-id tensor, shape (batch_size, doc_len * sent_len).
    Returns:
        Attention-pooled document vector, shape (batch_size, hidden_size * 2).
    """
    # [batch_size, doc_len * sent_len, embedding_size]
    embedded_doc = tf.nn.embedding_lookup(embedding, X_inputs)
    # Unroll every document into its sentences:
    # [batch_size * doc_len, sent_len, embedding_size]
    word_level = tf.reshape(embedded_doc, [batch_size*doc_len, sent_len, embedding_size])
    with tf.variable_scope('sentence_encoder'):  # builds the sentence vectors
        word_states = bi_gru(word_level, seg_num=batch_size*doc_len)
        # [batch_size * doc_len, hidden_size * 2]
        sentence_vectors = task_specific_attention(word_states, sent_attention_size)
        with tf.variable_scope('dropout'):
            sentence_vectors = tf.nn.dropout(sentence_vectors, keep_prob)
    with tf.variable_scope('doc_encoder'):  # builds the document vector
        sentence_level = tf.reshape(sentence_vectors, [batch_size, doc_len, hidden_size*2])
        sentence_states = bi_gru(sentence_level)  # [batch_size, doc_len, hidden_size * 2]
        return task_specific_attention(sentence_states, doc_attention_size)
        

# Assemble the full graph: two encoders -> concat -> fc + batchnorm + relu
# -> output layer -> sigmoid cross-entropy loss -> two Adam train ops.
with tf.name_scope('title_bigru'):
    output_title = bigru_reference(X1_inputs)
with tf.name_scope('content_han'):
    output_content = han_reference(X2_inputs)   
    
with tf.name_scope('fc-bn-layer'):
    # Each branch yields hidden_size*2 features, hence hidden_size*4 after concat.
    output = tf.concat([output_title, output_content], axis=1)
    W_fc = weight_variable([hidden_size*4, fc_hidden_size], name='Weight_fc')
    tf.summary.histogram('W_fc', W_fc)
    h_fc = tf.matmul(output, W_fc, name='h_fc')
    # NOTE(review): name="beta_fc" is an argument of tf.constant, not of
    # tf.Variable, so the variable itself gets a default name.
    beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[fc_hidden_size], name="beta_fc"))
    tf.summary.histogram('beta_fc', beta_fc)
    fc_bn, update_ema_fc = batchnorm(h_fc, tst, n_updates, beta_fc, convolutional=False)
    update_emas.append(update_ema_fc)
    fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
    
with tf.name_scope('out_layer'):
    # Output weights start from the matrix loaded into _W_out.
    W_out = tf.Variable(_W_out, name='Weight_out') 
    tf.summary.histogram('Weight_out', W_out)
    b_out = bias_variable([n_class], name='bias_out') 
    tf.summary.histogram('bias_out', b_out)
    y_pred = tf.nn.xw_plus_b(fc_bn_relu, W_out, b_out, name='y_pred')  # per-class scores (logits)
    
with tf.name_scope('cost'):
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred, labels=y_inputs))
    tf.summary.scalar('cost', cost)
    
# ***** optimisation *******
# train_op1 updates every trainable variable, including the embedding.
with tf.name_scope('AdamOptimizer1'):
    tvars1 = tf.trainable_variables()
    grads1, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars1), max_grad_norm)
    optimizer1 = tf.train.AdamOptimizer(learning_rate=lr)
    train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
        global_step=tf.contrib.framework.get_or_create_global_step())
    
# train_op2 skips every variable whose name contains 'embedding'.
with tf.name_scope('AdamOptimizer2'):
    tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
    grads2, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars2), max_grad_norm)
    optimizer2 = tf.train.AdamOptimizer(learning_rate=lr)
    train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
        global_step=tf.contrib.framework.get_or_create_global_step())
update_op = tf.group(*update_emas)   # updates the BN moving averages

 
merged = tf.summary.merge_all() # all summaries defined above
train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
test_writer = tf.summary.FileWriter(summary_path + 'test')
print 'Finished creating the han-bi-gru model.'

Building model ...
Finished creating the han-bi-gru model.


## 导入数据

In [3]:
import sys
sys.path.append('..')
from data_helpers import BatchGenerator
from data_helpers import to_categorical
from evaluator import score_eval


# Load the training arrays; title and flattened content token ids are stacked
# side by side so each row of X is [title | content].
print('loading data...')
time0 = time.time()
X_title = np.load(save_path+'X_tr_title_30.npy') 
sample_num = X_title.shape[0]
X_content = np.load(save_path+'train_segs30_content.npy')
X_content = X_content.reshape([-1, doc_len*sent_len])
X = np.hstack([X_title, X_content])
y = np.load(save_path+'y_tr.npy')
print('finished loading data, time cost %g s' % (time.time() - time0))

# Split off the validation set: shuffle with a fixed seed, then take the
# first valid_num rows for validation and the rest for training.
np.random.seed(seed_num)
new_index = np.random.permutation(sample_num)
X = X[new_index]
y = y[new_index]
X_valid = X[:valid_num]
y_valid = y[:valid_num]
X_train = X[valid_num:]
y_train = y[valid_num:]
print('train_num=%d, valid_num=%d' % (X_train.shape[0], X_valid.shape[0]))

# Build the batch generators.
data_train = BatchGenerator(X_train, y_train, shuffle=True)
data_valid = BatchGenerator(X_valid, y_valid, shuffle=False)
print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

marked_labels_list = data_valid.y.tolist() # all ground-truth labels of the validation set
valid_data_size = data_valid.y.shape[0]

def valid_epoch():
    """Evaluate the model on the validation set.

    Runs int(valid_data_size / te_batch_size) full batches from `data_valid`
    (samples beyond the last full batch are not evaluated), takes the five
    highest-scoring classes of every sample and scores them with `score_eval`.

    Returns:
        (mean_cost, precision, recall, f1), where mean_cost is the average of
        the per-batch costs.
    """
    data_valid._index_in_epoch = 0  # rewind the generator to the first sample
    _batch_size = te_batch_size
    fetches = [cost, y_pred]
    batch_num = int(valid_data_size / _batch_size)
    _costs = 0.0
    predict_labels_list = list()  # top-5 predictions of every evaluated sample
    for _ in xrange(batch_num):
        X_batch, y_batch = data_valid.next_batch(_batch_size)
        X1_batch = X_batch[:, :title_len]
        X2_batch = X_batch[:, title_len:]
        y_batch = to_categorical(y_batch)
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch, lr: 1e-5,
                     batch_size: _batch_size, keep_prob: 1.0, tst: True, n_updates: global_step}
        _cost, batch_scores = sess.run(fetches, feed_dict)
        _costs += _cost
        # Indices of the 5 largest scores, best first.
        predict_labels_list.extend(scores.argsort()[-1:-6:-1] for scores in batch_scores)
    # zip stops at the shorter list, so labels of unevaluated samples are ignored.
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / batch_num
    return mean_cost, precision, recall, f1

# Elapsed time since time0 was reset at the start of this cell's data loading.
print('Finised loading data, time %g s' % (time.time() - time0))

loading data...
finished loading data, time cost 67.0299 s
train_num=2899952, valid_num=100000
('X_train.shape=', (2899952, 330), 'y_train.shape=', (2899952,))
('X_valid.shape=', (100000, 330), 'y_valid.shape=', (100000,))
Finised loading data, time 75.6208 s


## 训练模型

In [5]:
# Preview the learning rate after 8 decay steps and the batches per epoch.
print(0.0015 * (0.85**8))
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # number of batches in one epoch
print('tr_batch_num=%d' % tr_batch_num)

0.000408735787559
tr_batch_num=22655


In [6]:
# Settings for a quick smoke test
# max_epoch = 2
# max_max_epoch = 3  # in this example,
# decay = 0.85
# decay_step = 140
# valid_step = 70

# Settings for the real run
max_epoch = 2          # epoch index at which the training loop switches to train_op1
max_max_epoch = 8  # total number of epochs
decay = 0.85           # multiplicative learning-rate decay
decay_step = 10000     # decay the learning rate every decay_step steps
valid_step = 6000      # validate every valid_step steps

_lr = 1.5e-3
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # number of batches in one epoch
print('tr_batch_num=%d' % tr_batch_num)
saver = tf.train.Saver(max_to_keep=3)           # keep at most this many checkpoints
sess.run(tf.global_variables_initializer())
last_f1 = 0.40         # a checkpoint is saved only when validation f1 exceeds this
model_num = 0          # suffix of the next checkpoint
global_step = 0

tr_batch_num=22655


In [None]:
# Main training loop.
#
# The first `max_epoch` epochs use train_op2 (embedding frozen); from epoch
# index `max_epoch` on, train_op1 also updates the embedding. The learning
# rate is decayed every `decay_step` steps and the model is validated every
# `valid_step` steps; a checkpoint is written whenever validation f1 improves.
train_op = train_op2

for epoch in xrange(max_max_epoch):
    if epoch == max_epoch:
        train_op = train_op1
        if model_num == 0:
            # Nothing was saved while the embedding was frozen: keep a
            # snapshot before fine-tuning it.
            model_num += 1
            # The checkpoint path gets its own name so that the global
            # `save_path` (the data directory read by predict()) is not
            # overwritten.
            ckpt_path = saver.save(sess, model_path, global_step=model_num)
            print('the save path is ', ckpt_path)
        print('Begin updating embedding.')
    print('EPOCH %d, _lr= %g' % (epoch+1, _lr))
    time0 = time.time()
    for batch in xrange(tr_batch_num):
        global_step += 1
        if (global_step+1) % decay_step == 0:
            _lr = _lr * decay
        if (global_step+1) % valid_step == 0:    # run a validation pass
            valid_cost, precision, recall, f1 = valid_epoch()
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g ' % (
                    global_step, valid_cost, precision, recall, f1))
            if (f1 > last_f1):
                last_f1 = f1
                model_num += 1
                ckpt_path = saver.save(sess, model_path, global_step=model_num)
                print('the save path is ', ckpt_path)
        X_batch, y_batch = data_train.next_batch(tr_batch_size)
        X1_batch = X_batch[:, :title_len]
        X2_batch = X_batch[:, title_len:]
        y_batch = to_categorical(y_batch)
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch, lr: _lr,
                     batch_size: tr_batch_size, keep_prob: 0.5, tst: False, n_updates: global_step}
        fetches = [merged, cost, train_op, update_op]
        summary, _cost, _, _ = sess.run(fetches, feed_dict)  # the cost is the mean cost of one batch
        # Log summaries every 100 steps. The original test `global_step % 100`
        # was truthy on every step that is NOT a multiple of 100, so it logged
        # (and ran an extra validation batch) on 99% of the steps.
        if global_step % 100 == 0:
            train_writer.add_summary(summary, global_step)
            X_batch, y_batch = data_valid.next_batch(tr_batch_size)
            X1_batch = X_batch[:, :title_len]
            X2_batch = X_batch[:, title_len:]
            y_batch = to_categorical(y_batch)
            feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch, lr: 1e-5,
                         batch_size: tr_batch_size, keep_prob: 1.0, tst: True, n_updates: global_step}
            fetches = [merged, cost]
            summary, _cost = sess.run(fetches, feed_dict)
            test_writer.add_summary(summary, global_step)

# Final validation once all epochs are done.
# NOTE(review): this sits outside the epoch loop, so it runs a single time;
# the reported "s/epoch" is measured from the start of the last epoch only.
valid_cost, precision, recall, f1 = valid_epoch()
print('Global_step=%d; valid cost=%g; p=%g, r=%g, f1=%g; speed=%g s/epoch' % (
    global_step, valid_cost, precision, recall, f1, time.time()-time0) )
if (f1 > last_f1):
    last_f1 = f1
    model_num += 1
    ckpt_path = saver.save(sess, model_path, global_step=model_num)
    print('the save path is ', ckpt_path)

EPOCH 1, _lr= 0.0015


```
EPOCH 1, _lr= 0.0015
Global_step=5999: valid cost=0.00444487; p=1.20799, r=0.496638, f1=0.351944 
Global_step=11999: valid cost=0.00424074; p=1.27425, r=0.521651, f1=0.370129 
Global_step=17999: valid cost=0.00414744; p=1.29897, r=0.531801, f1=0.377323 
EPOCH 2, _lr= 0.00108375
Global_step=23999: valid cost=0.00411525; p=1.31289, r=0.537002, f1=0.381116 
Global_step=29999: valid cost=0.0040816; p=1.32376, r=0.541566, f1=0.384332 
Global_step=35999: valid cost=0.00404897; p=1.33708, r=0.546395, f1=0.387886 
Global_step=41999: valid cost=0.00403789; p=1.34181, r=0.54814, f1=0.389164 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-1')
Begin updating embedding.
EPOCH 3, _lr= 0.000783009
Global_step=47999: valid cost=0.00404364; p=1.35205, r=0.551544, f1=0.391741 
Global_step=53999: valid cost=0.00396575; p=1.37155, r=0.559419, f1=0.39735 
Global_step=59999: valid cost=0.00397487; p=1.37862, r=0.561797, f1=0.399143 
Global_step=65999: valid cost=0.00390154; p=1.3926, r=0.568243, f1=0.403568 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-2')
EPOCH 4, _lr= 0.000565724
```

In [4]:
# Restore a saved checkpoint and re-run validation on it.
saver = tf.train.Saver()
best_model_path = model_path + '-' + str(2)  # checkpoint suffix of the best model so far
saver.restore(sess, best_model_path)
print('Finished loading model.')

# NOTE(review): time0 is not reset in this cell, so the printed "speed" is the
# time elapsed since whichever earlier cell last assigned time0.
valid_cost, precision, recall, f1 = valid_epoch()  # one validation pass
print('valid cost=%g; p=%g, r=%g, f1=%g; speed=%g s/epoch' % (
     valid_cost, precision, recall, f1, time.time()-time0) )

INFO:tensorflow:Restoring parameters from ../ckpt/p4-1-han-bigru-256/model.ckpt-2
Finished loading model.
valid cost=0.00390154; p=1.3926, r=0.568243, f1=0.403568; speed=218.605 s/epoch


In [6]:
# Preview the learning rate after 6 decay steps and the batches per epoch.
print(0.0015 * (0.85**6))
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # number of batches in one epoch
print('tr_batch_num=%d' % tr_batch_num)

0.000565724273437
tr_batch_num=22655


## 若没有收敛，继续迭代

In [7]:
# Settings for continuing training from the restored checkpoint
add_epoch = 5  # number of additional epochs
decay = 0.85
decay_step = 10000
valid_step = 6000

_lr = 1.5e-3*(0.85**5)   # initial rate with five decay steps already applied
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # number of batches in one epoch
print('tr_batch_num=%d' % tr_batch_num)
saver = tf.train.Saver(max_to_keep=3)           # keep at most this many checkpoints
last_f1 = 0.403568       # best validation f1 reached by the previous run
# NOTE(review): model_num restarts at 0, so new checkpoints reuse the suffixes
# -1, -2, ... written by the previous run.
model_num = 0
global_step = 66000

tr_batch_num=22655


In [8]:
# Continue training for `add_epoch` more epochs with train_op1 (embedding
# included). Decay, validation and checkpointing follow the same schedule as
# the main training loop.
train_op = train_op1

for epoch in xrange(add_epoch):
    print('EPOCH %d, _lr= %g' % (epoch+1, _lr))
    time0 = time.time()
    for batch in xrange(tr_batch_num):
        global_step += 1
        if (global_step+1) % decay_step == 0:
            _lr = _lr * decay
        if (global_step+1) % valid_step == 0:    # run a validation pass
            valid_cost, precision, recall, f1 = valid_epoch()
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g ' % (
                    global_step, valid_cost, precision, recall, f1))
            if (f1 > last_f1):
                last_f1 = f1
                model_num += 1
                # The checkpoint path gets its own name so that the global
                # `save_path` (the data directory read by predict()) is not
                # overwritten.
                ckpt_path = saver.save(sess, model_path, global_step=model_num)
                print('the save path is ', ckpt_path)
        X_batch, y_batch = data_train.next_batch(tr_batch_size)
        X1_batch = X_batch[:, :title_len]
        X2_batch = X_batch[:, title_len:]
        y_batch = to_categorical(y_batch)
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch, lr: _lr,
                     batch_size: tr_batch_size, keep_prob: 0.5, tst: False, n_updates: global_step}
        fetches = [merged, cost, train_op, update_op]
        summary, _cost, _, _ = sess.run(fetches, feed_dict)  # the cost is the mean cost of one batch
        # Log summaries every 100 steps. The original test `global_step % 100`
        # was truthy on every step that is NOT a multiple of 100, so it logged
        # (and ran an extra validation batch) on 99% of the steps.
        if global_step % 100 == 0:
            train_writer.add_summary(summary, global_step)
            X_batch, y_batch = data_valid.next_batch(tr_batch_size)
            X1_batch = X_batch[:, :title_len]
            X2_batch = X_batch[:, title_len:]
            y_batch = to_categorical(y_batch)
            feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch, lr: 1e-5,
                         batch_size: tr_batch_size, keep_prob: 1.0, tst: True, n_updates: global_step}
            fetches = [merged, cost]
            summary, _cost = sess.run(fetches, feed_dict)
            test_writer.add_summary(summary, global_step)

# Final validation once all epochs are done.
# NOTE(review): this sits outside the epoch loop, so it runs a single time;
# the reported "s/epoch" is measured from the start of the last epoch only.
valid_cost, precision, recall, f1 = valid_epoch()
print('Global_step=%d; valid cost=%g; p=%g, r=%g, f1=%g; speed=%g s/epoch' % (
    global_step, valid_cost, precision, recall, f1, time.time()-time0) )
if (f1 > last_f1):
    last_f1 = f1
    model_num += 1
    ckpt_path = saver.save(sess, model_path, global_step=model_num)
    print('the save path is ', ckpt_path)

EPOCH 1, _lr= 0.000665558
Global_step=71999: valid cost=0.00393059; p=1.39609, r=0.569329, f1=0.40441 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-1')
Global_step=77999: valid cost=0.00390302; p=1.39862, r=0.570582, f1=0.405254 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-2')
Global_step=83999: valid cost=0.00391087; p=1.40086, r=0.571489, f1=0.4059 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-3')
EPOCH 2, _lr= 0.000480866
Global_step=89999: valid cost=0.00390977; p=1.40363, r=0.572246, f1=0.406514 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-4')
Global_step=95999: valid cost=0.00393081; p=1.40286, r=0.571853, f1=0.406251 
Global_step=101999: valid cost=0.00391683; p=1.40578, r=0.573226, f1=0.407188 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-5')
Global_step=107999: valid cost=0.00390987; p=1.40846, r=0.573996, f1=0.407802 
('the save path is ', '../ckpt/p4-1-han-bigru-256/model.ckpt-6')
EPOCH 3, _lr=

KeyboardInterrupt: 

## 本地测试

In [9]:
# sys.path.append('..')
# from evaluator import score_eval

# print('begin ...')
# # X_valid = np.load('../data/X_valid.npy')
# # 保存 本地测试的标注数据
# marked_labels_list = np.load('../data/marked_labels_list.npy')
# local_scores_path = '../local_scores/' + model_name + '.npy'

# Restore the best checkpoint (suffix 6) before running local evaluation.
saver = tf.train.Saver()
best_model_path = model_path + '-' + str(6)  # checkpoint suffix of the best model
saver.restore(sess, best_model_path)
print('Finished loading model.')

INFO:tensorflow:Restoring parameters from ../ckpt/p4-1-han-bigru-256/model.ckpt-6
Finished loading model.


In [10]:
# Where the scores of the local validation split are stored.
local_scores_path = '../local_scores/' + model_name + '.npy'
# Value fed to the n_updates placeholder during prediction.
global_step = 800000
# Predict on the local validation split.
def local_predict(scores_path=local_scores_path):
    """Predict on the local validation split and save the raw class scores.

    Every sample of `X_valid` is scored in batches of 500. The top-5 classes
    per sample are compared with `marked_labels_list` via `score_eval` and the
    metrics are printed. The scores are stacked into one
    (n_sample, n_class) array, the same layout `predict()` returns, and
    written with np.save.

    Args:
        scores_path: destination .npy file. Previously this argument was
            ignored and the global `local_scores_path` was always used.
    """
    print('local predicting ...')
    time0 = time.time()
    X_te = X_valid
    n_sample = X_te.shape[0]  # number of samples to score
    _batch_size = 500
    fetches = [y_pred]
    predict_labels_list = list()  # top-5 predictions of every sample
    predict_scores = list()       # one score matrix per batch
    for start in xrange(0, n_sample, _batch_size):
        end = min(start + _batch_size, n_sample)  # the last batch may be shorter
        X_batch = X_te[start:end]
        X1_batch = X_batch[:, :title_len]
        X2_batch = X_batch[:, title_len:]
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, lr: 1e-5,
                     batch_size: end - start, keep_prob: 1.0, tst: True, n_updates: global_step}
        batch_scores = sess.run(fetches, feed_dict)[0]
        predict_scores.append(batch_scores)
        # Indices of the 5 largest scores, best first.
        predict_labels_list.extend(scores.argsort()[-1:-6:-1] for scores in batch_scores)
    # Stack along the sample axis. np.asarray on the list of batches produced
    # a (n_batches, batch, n_class) array, or a ragged object array when the
    # last batch was shorter.
    predict_scores = np.vstack(predict_scores)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('local valid p=%g, r=%g, f1=%g; speed=%g s/epoch' % ( precision, recall, f1, time.time()-time0) )
    np.save(scores_path, predict_scores)
    print('Writed the scores into %s, time %g s' % (scores_path, time.time()-time0))
    
local_predict()

local predicting ...
local valid p=1.40846, r=0.573955, f1=0.407782; speed=88.0718 s/epoch
Writed the scores into ../local_scores/p4-1-han-bigru-256.npy, time 88.4801 s


## 对测试数据进行预测

In [15]:
# 导入保存好的模型
# saver = tf.train.Saver()
# best_model_path = model_path + '-' +str(6)
# saver.restore(sess, best_model_path)

INFO:tensorflow:Restoring parameters from ../ckpt/han-bigru-title-content-256/model.ckpt-4


In [11]:
# Make sure save_path points at the data directory before predict() reads from it.
save_path = '../data/'

In [13]:
# 导入测试数据
from tqdm import tqdm

def predict():
    """Score the evaluation set.

    Loads the evaluation title and content arrays from `save_path`, runs the
    model in batches of 1000 and collects the results.

    Returns:
        (predict_labels_list, predict_scores): the first holds, per sample,
        the indices of the 5 highest-scoring classes (best first); the second
        is the stacked score matrix of shape (n_sample, n_class).
    """
    X_title = np.load(save_path+'X_te_title_30.npy')
    X_content = np.load(save_path+'eval_segs30_content.npy')
    X_content = X_content.reshape([-1, doc_len*sent_len])
    X_te = np.hstack([X_title, X_content])
    n_sample = X_te.shape[0]  # number of evaluation samples
    _batch_size = 1000
    fetches = [y_pred]
    predict_labels_list = list()  # top-5 predictions of every sample
    predict_scores = list()       # one score matrix per batch
    for start in tqdm(xrange(0, n_sample, _batch_size)):
        end = min(start + _batch_size, n_sample)  # the last batch may be shorter
        X_batch = X_te[start:end]
        X1_batch = X_batch[:, :title_len]
        X2_batch = X_batch[:, title_len:]
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, lr: 1e-5,
                     batch_size: end - start, keep_prob: 1.0, tst: True, n_updates: global_step}
        batch_scores = sess.run(fetches, feed_dict)[0]
        predict_scores.append(batch_scores)
        # Indices of the 5 largest scores, best first.
        predict_labels_list.extend(scores.argsort()[-1:-6:-1] for scores in batch_scores)
    # Stack the batches directly. Converting the list with np.asarray first
    # builds a ragged object array whenever the last batch is shorter, which
    # recent NumPy versions reject.
    return predict_labels_list, np.vstack(predict_scores)

def write_result(predict_labels_list, result_path):
    """Map predicted class ids to topic ids and write them to `result_path`.

    Args:
        predict_labels_list: per-sample sequences of 5 predicted class ids.
        result_path: destination CSV, written without index and header.
    Returns:
        The DataFrame that was written (question id plus tid0..tid4).
    """
    question_ids = np.load('../data/eval_question.npy')
    with open('../data/sr_topic2id.pkl', 'rb') as pkl_file:
        # The file holds two pickled objects in a row; the first one has to
        # be read to reach the id -> topic mapping.
        _topic2id = pickle.load(pkl_file)
        sr_id2topic = pickle.load(pkl_file)
    flat_ids = np.asarray(predict_labels_list).reshape([-1])
    # Look up the topic of every predicted id, 5 topics per question.
    topic_matrix = sr_id2topic[flat_ids].values.reshape([-1, 5])
    frame_columns = {'question': question_ids}
    for rank in range(5):
        frame_columns['tid%d' % rank] = topic_matrix[:, rank]
    df_result = pd.DataFrame(frame_columns)
    df_result.to_csv(result_path, index=False, header=False)
    print('Finished writing the result')
    return df_result

In [14]:
# Predict on the evaluation set, write result.csv and save the raw scores.
%time predict_labels_list,predict_scores = predict()
df_result = write_result(predict_labels_list, result_path=result_path) 
print('len(df_result)=',len(df_result))  # expected to be 217360
print('Saving the predict_scores into %s' % scores_path)
print('predict_scores.shape=',predict_scores.shape)
np.save(scores_path, predict_scores)
print('Finished saving the result!')

100%|██████████| 218/218 [02:03<00:00,  2.08it/s]


CPU times: user 1min 19s, sys: 29.2 s, total: 1min 48s
Wall time: 2min 18s
Finished writing the result
('len(df_result)=', 217360)
Saving the predict_scores into ../scores/p4-1-han-bigru-256.npy
('predict_scores.shape=', (217360, 1999))
Finished saving the result!


In [15]:
df_result.head(15)

Unnamed: 0,question,tid0,tid1,tid2,tid3,tid4
0,6215603645409872328,4610596224687453206,-7506384235581390893,-6839713564940654454,-6306904715218704629,5695691654483839948
1,6649324930261961840,-240041917918953337,2858911571784840089,3418451812342379591,2382911985227044227,3383016985780045156
2,-4251899610700378615,2919247920214845195,-7358589937244777363,8570540777066461619,-5265476641576484497,5695691654483839948
3,6213817087034420233,5804619920623030604,-8655945395761165989,-4966205278807386328,248035954018101491,8804742921198031305
4,-8930652370334418373,-8963554618409314978,3972493657017129406,-1115593437686158905,6018641953300645757,-6925670792665757873
5,-2893445476547593888,-6206436693745657677,-8274522839089381384,2131451573312950491,-2696736445927423374,5707939385348112587
6,2614833994648160978,4482402820945758152,-6748914495015758455,-3856154743789177934,-3904210248050890128,1553849747662134917
7,1572988006266661060,-2627298052801704596,3738968195649774859,7739004195693774975,-839691564858654120,1160326435131345730
8,-3736249911643942320,-7653703019053330516,8690965822342756180,5502528845814007324,1127459907694805235,-129145008166137909
9,-976507019126932319,3195914392210930723,3804601920633030746,6718676536613592056,6940355838132160535,1757481363245650831


In [12]:
# 参考结果
df_result.head(5)

Unnamed: 0,question,tid0,tid1,tid2,tid3,tid4
0,6215603645409872328,4610596224687453206,-7506384235581390893,-6306904715218704629,-6839713564940654454,2919247920214845195
1,6649324930261961840,-240041917918953337,2858911571784840089,3418451812342379591,-7483543763655495143,2382911985227044227
2,-4251899610700378615,2919247920214845195,-7358589937244777363,2816249700493135244,-7270992690764838239,-5265476641576484497
3,6213817087034420233,-8655945395761165989,-4966205278807386328,5804619920623030604,7476760589625268543,244937959911721367
4,-8930652370334418373,3972493657017129406,-8963554618409314978,-1115593437686158905,6018641953300645757,7951349602759061249


## 在全部预测正确的情况下，理论值为：f1=0.713933
precision=2.50273, recall=0.998873, f1=0.713933

In [26]:
# Upper bound of f1: what the score would be if every prediction were correct.
def padding_label(labels, max_len=5, pad_value=-1):
    """Truncate or pad `labels` to exactly `max_len` entries.

    Args:
        labels: sequence of label ids.
        max_len: target length (default 5).
        pad_value: filler appended when `labels` is too short (default -1).
    Returns:
        `labels[:max_len]` when at least `max_len` labels are given (same type
        as the input); otherwise a numpy array of the labels followed by
        `pad_value` fillers.
    """
    label_len = len(labels)
    if label_len >= max_len:
        return labels[:max_len]
    filler = np.zeros(max_len - label_len, dtype=int) + pad_value
    return np.hstack([labels, filler])
    

# Use the ground-truth labels (padded or truncated to 5) as the "prediction"
# to see the best score score_eval can report.
marked_labels_list = data_valid.y.tolist() # all ground-truth labels of the validation set
predict_labels_list = map(padding_label, marked_labels_list)
predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
print '在全部预测正确的情况下，理论值为：'
print 'precision=%g, recall=%g, f1=%g' % (precision, recall, f1)

在全部预测正确的情况下，理论值为：
precision=2.50273, recall=0.998873, f1=0.713933
