把之前的训练数据划分为 3 份，每次训练把其中的一份丢弃。训练三个不同的模型。之后再用这些模型来进行融合。

参考代码： 

https://github.com/brightmart/text_classification/blob/master/a02_TextCNN/p7_TextCNN_model.py 

https://github.com/Qinbf/Tensorflow/blob/master/Tensorflow%E5%9F%BA%E7%A1%80%E4%BD%BF%E7%94%A8%E4%B8%8E%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E5%BA%94%E7%94%A8/%E7%A8%8B%E5%BA%8F/cnn.ipynb

- 每个模型保存到对应的模型位置
- 每个模型生成一个 scores 分数矩阵用于后面进行模型融合

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from gensim.models import KeyedVectors
import pickle
import os
import sys
import shutil
import time


time0 = time.time()
print('Starting ...')
model_name = 'textcnn-fc-drop-title-content-256-345-cross3'                    # model name (also used for summary/ckpt paths)
W_embedding = np.load('../data/W_embedding.npy')            # load the pre-trained embedding matrix

summary_path = '../summary/' + model_name + '/'             # where TensorBoard summaries are written
if os.path.exists(summary_path):   # remove summaries of a previous run so they do not overlap
    print('removed the existing summary files.')
    shutil.rmtree(summary_path)
os.makedirs(summary_path)          # then create the directory again
    
# ##################### config ######################
n_step1 = max_len1 = 50                   # title length (tokens)
n_step2= max_len2 = 150                   # content length (tokens)
input_size = embedding_size = 256       # embedding dimension
n_class = 1999                          # total number of classes
filter_sizes = [3,4,5]                  # convolution kernel heights
n_filter = 256                          # number of filters per kernel size
fc_hidden_size = 1024                   # number of units in the fc layer
n_filter_total = n_filter * len(filter_sizes)   # pooled features per TextCNN tower
summary_step = 0                        # running step index used when writing summaries
global_step = 0                         # running training step fed to the BN layers
valid_num = 100000                      # number of samples held out as the validation set
seed_num = 13                           # seed for the train/valid shuffle
tr_batch_size = 128
te_batch_size = 128
print('Prepared, costed time %g s.' % (time.time() - time0))

Starting ...
removed the existing summary files.
Prepared, costed time 6.85836 s.


In [2]:
import tensorflow as tf
# Create the session with allow_growth enabled on the GPU options.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers

# NOTE(review): the string below says "bidirectional GRU, Zhihu question
# multi-label classification", but the graph built in this file is a TextCNN.
'''
双端 GRU，知乎问题多标签分类。
'''
print('Building model ...')
lr = tf.placeholder(tf.float32)             # learning rate, fed on every step
keep_prob = tf.placeholder(tf.float32, [])  # dropout keep probability
batch_size = tf.placeholder(tf.int32, [])  # note: the dtype must be tf.int32
tst = tf.placeholder(tf.bool)               # True -> BN uses moving averages (test mode)
n_updates = tf.placeholder(tf.int32)      # training iteration, passed to the BN layers
update_emas = list()   # all moving-average update ops created by the BN layers


def weight_variable(shape, name):
    """Return a variable called ``name`` drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)

def bias_variable(shape, name):
    """Return a variable called ``name`` whose entries all start at 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape), name=name)

def batchnorm(Ylogits, is_test, num_updates, offset, convolutional=False):
    """Batch-normalise ``Ylogits`` with an offset but no scale.

    During training the statistics of the current batch are used; when
    ``is_test`` is true their exponential moving averages (decay 0.999)
    are used instead.

    Args:
        Ylogits: pre-activation tensor to normalise.
        is_test: boolean tensor; True selects the moving-average statistics.
        num_updates: training iteration handed to the moving average, which
            prevents averaging across iterations that never happened.
        offset: the beta term added after normalisation.
        convolutional: if True the moments are reduced over axes [0, 1, 2],
            otherwise over axis [0] only.

    Returns:
        A pair ``(Ybn, update_op)``: the normalised tensor and the op that
        updates the moving mean and variance (needed later for test mode).
    """
    ema = tf.train.ExponentialMovingAverage(0.999, num_updates)
    reduce_axes = [0, 1, 2] if convolutional else [0]
    batch_mean, batch_variance = tf.nn.moments(Ylogits, reduce_axes)
    update_moving_averages = ema.apply([batch_mean, batch_variance])
    used_mean = tf.cond(is_test, lambda: ema.average(batch_mean), lambda: batch_mean)
    used_variance = tf.cond(is_test, lambda: ema.average(batch_variance), lambda: batch_variance)
    # scale is None; the last argument (1e-5) is the epsilon that guards
    # against division by zero.
    normalised = tf.nn.batch_normalization(Ylogits, used_mean, used_variance, offset, None, 1e-5)
    return normalised, update_moving_averages


# Placeholders for one batch: X1 has n_step1 ids per row, X2 has n_step2 ids
# per row, and y has one column per class.
with tf.name_scope('Inputs'):
    X1_inputs = tf.placeholder(tf.int64, [None, n_step1], name='X1_input')
    X2_inputs = tf.placeholder(tf.int64, [None, n_step2], name='X2_input')
    y_inputs = tf.placeholder(tf.float32, [None, n_class], name='y_input')    

# Embedding matrix initialised from the pre-trained W_embedding and shared by
# every textcnn() tower.
with tf.name_scope('embedding_layer'):
    embedding = tf.get_variable(name="W_embedding", shape=W_embedding.shape, 
                        initializer=tf.constant_initializer(W_embedding), trainable=True)   # trainable, i.e. fine-tuned

def textcnn(X_inputs, n_step):
    """Build one TextCNN tower and return its flattened pooled features.

    For every size in ``filter_sizes`` the embedded input is convolved,
    batch-normalised, passed through ReLU and max-pooled over the whole
    sequence. The BN update op of each branch is appended to ``update_emas``.

    Args:
        X_inputs: integer id tensor of shape [batch, n_step].
        n_step: sequence length of ``X_inputs``.

    Returns:
        Tensor of shape [-1, n_filter_total].
    """
    # [batch, n_step] -> [batch, n_step, embedding_size] -> trailing channel axis
    embedded = tf.expand_dims(tf.nn.embedding_lookup(embedding, X_inputs), -1)
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution layer
            W_filter = tf.Variable(
                tf.truncated_normal([filter_size, embedding_size, 1, n_filter], stddev=0.1),
                name="W_filter")
            # NOTE(review): name="beta" is given to tf.constant, not to
            # tf.Variable; left untouched because renaming the variable would
            # presumably break restoring existing checkpoints.
            beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[n_filter], name="beta"))
            tf.summary.histogram('beta', beta)
            conv = tf.nn.conv2d(embedded, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
            # BN goes in front of the activation; its offset replaces a bias,
            # and BN scaling is not useful with ReLU.
            conv_bn, update_ema = batchnorm(conv, tst, n_updates, beta, convolutional=True)
            h = tf.nn.relu(conv_bn, name="relu")
            # Max-pool over every position the filter produced
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, n_step - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
        pooled_outputs.append(pooled)
        update_emas.append(update_ema)
    h_pool = tf.concat(pooled_outputs, 3)
    return tf.reshape(h_pool, [-1, n_filter_total])
    
    
# Two TextCNN towers: one over X1_inputs (length n_step1) and one over
# X2_inputs (length n_step2).
with tf.name_scope('cnn-title'):
    output_title = textcnn(X1_inputs, n_step1)
with tf.name_scope('cnn-content'):
    output_content = textcnn(X2_inputs, n_step2)
# Concatenate both towers, then fc (no bias) -> BN -> ReLU -> dropout.
with tf.name_scope('fc-bn-layer'):
    output = tf.concat([output_title, output_content], axis=1)
    W_fc = weight_variable([n_filter_total*2, fc_hidden_size], name='Weight_fc')
    tf.summary.histogram('W_fc', W_fc)
    h_fc = tf.matmul(output, W_fc, name='h_fc')
    # NOTE(review): name="beta_fc" is passed to tf.constant, not tf.Variable.
    beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[fc_hidden_size], name="beta_fc"))
    tf.summary.histogram('beta_fc', beta_fc)
    fc_bn, update_ema_fc = batchnorm(h_fc, tst, n_updates, beta_fc, convolutional=False)
    update_emas.append(update_ema_fc)
    fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
    fc_bn_drop = tf.nn.dropout(fc_bn_relu, keep_prob)

with tf.name_scope('out_layer'):
    W_out = weight_variable([fc_hidden_size, n_class], name='Weight_out') 
    tf.summary.histogram('Weight_out', W_out)
    b_out = bias_variable([n_class], name='bias_out') 
    tf.summary.histogram('bias_out', b_out)
    y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # scores (logits) for every class
    
# Mean sigmoid cross-entropy between the logits and y_inputs.
with tf.name_scope('cost'):
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred, labels=y_inputs))
    tf.summary.scalar('cost', cost)

# Optimizer
with tf.name_scope('AdamOptimizer'):
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(cost)
    update_op = tf.group(*update_emas)   # updates the BN moving averages

# Summaries: one writer for training batches, one for validation batches.
merged = tf.summary.merge_all() # every summary op defined above
train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
test_writer = tf.summary.FileWriter(summary_path + 'test')
print 'Finished creating the TextCNN model.'

Building model ...
Finished creating the TextCNN model.


## 导入数据

In [3]:
sys.path.append('..')
from data_helpers import BatchGenerator
from data_helpers import to_categorical
from evaluator import score_eval

save_path = '../data/'
print('loading data...')
time0 = time.time()
X_title = np.load(save_path+'X_tr_title_50.npy')
X_content = np.load(save_path+'X_tr_content_150.npy')
# Keep title and content side by side in one array; they are split again at
# column n_step1 when a batch is fed to the model.
X = np.hstack([X_title, X_content])
y = np.load(save_path+'y_tr.npy')
print('finished loading data, time cost %g s' % (time.time() - time0))
# Split off the validation set: shuffle with a fixed seed, then take the
# first valid_num samples for validation and the rest for training.
sample_num = X.shape[0]
np.random.seed(seed_num)
new_index = np.random.permutation(sample_num)

X = X[new_index]
y = y[new_index]
X_valid = X[:valid_num]
y_valid = y[:valid_num]
X_train = X[valid_num:]
y_train = y[valid_num:]   
np.random.seed(10)
new_tr_index = np.random.permutation(len(X_train)) # shuffle the training set once more
X_train = X_train[new_tr_index]
y_train = y_train[new_tr_index]
# Build the batch generator for the validation data.
data_valid = BatchGenerator(X_valid, y_valid, shuffle=False)
print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
print('X_valid.shape=', data_valid.X.shape, 'y_valid.shape=', data_valid.y.shape)

# Validation labels and size, used by valid_epoch().
marked_labels_list = data_valid.y.tolist() # all ground-truth labels
valid_data_size = data_valid.y.shape[0]
def valid_epoch():
    """Evaluate the current model on the validation set.

    The generator is rewound to its first sample, then
    ``int(valid_data_size / tr_batch_size)`` full batches are scored in test
    mode (no dropout, BN moving averages). The five highest-scoring class
    indices of every sample are compared with the marked labels by
    ``score_eval``.

    Returns:
        (mean_cost, precision, recall, f1)
    """
    data_valid._index_in_epoch = 0  # rewind to the first sample
    n_batches = int(valid_data_size / tr_batch_size)
    total_cost = 0.0
    top5_predictions = []  # top-5 class indices for every scored sample
    for _ in xrange(n_batches):
        X_batch, y_batch = data_valid.next_batch(tr_batch_size)
        feed_dict = {
            X1_inputs: X_batch[:, :n_step1],
            X2_inputs: X_batch[:, n_step1:],
            y_inputs: to_categorical(y_batch),
            lr: 1e-5,
            batch_size: tr_batch_size,
            keep_prob: 1.0,
            tst: True,
            n_updates: global_step,
        }
        batch_cost, scores = sess.run([cost, y_pred], feed_dict)
        total_cost += batch_cost
        # indices of the 5 largest scores, best first
        top5_predictions.extend([row.argsort()[-1:-6:-1] for row in scores])
    precision, recall, f1 = score_eval(zip(top5_predictions, marked_labels_list))
    return total_cost / n_batches, precision, recall, f1

# Time elapsed since time0 was set at the start of data loading.
print('Finised loading data, time %g s' % (time.time() - time0))

loading data...
finished loading data, time cost 61.1376 s
('X_train.shape=', (2899952, 200), 'y_train.shape=', (2899952,))
('X_valid.shape=', (100000, 200), 'y_valid.shape=', (100000,))
Finised loading data, time 76.744 s


##  模型训练

In [4]:
decay = 0.8             # factor the learning rate is multiplied by on each decay
valid_step = 6000        # run a full validation every valid_step batches
max_epoch = 1            # the learning rate starts decaying once epoch >= max_epoch
max_max_epoch = 6       # maximum number of epochs
save_epoch = 1           # save a checkpoint every save_epoch epochs
saver = tf.train.Saver(max_to_keep=18)           # maximum number of checkpoints kept
sample_num = len(X_train)         # number of training samples
K = 3   # the training data is divided into 3 slices
split_size = int(sample_num / K)  # size of each slice
summary_step = 0

In [5]:
def train(valid_k, max_max_epoch=7):
    """Train one cross-validation model from scratch.

    The ``valid_k``-th contiguous slice of the training data is left out and
    the model is trained on the remaining samples. All variables are
    re-initialised first. A checkpoint is written every ``save_epoch`` epochs
    to ``../ckpt/<model_name>/cross<valid_k>.ckpt-<epoch>``.

    The learning rate starts at 5e-4 and, once ``epoch >= max_epoch``, is
    multiplied by ``decay`` at the start of the epoch and after every
    in-epoch validation.

    Args:
        valid_k: index of the slice to leave out, one of 0, 1, ..., K-1.
        max_max_epoch: number of epochs to train for.
    """
    global summary_step   # both counters are reassigned below, hence ``global``
    global global_step
    global_step = 0
    te_start = valid_k * split_size
    te_end = min(te_start + split_size, sample_num)
    print('***Begin training, valid_k=%d, [%d,%d]' % (valid_k, te_start, te_end))
    # Indices of every sample outside the held-out slice [te_start, te_end).
    tr_indexs = np.concatenate([np.arange(te_start), np.arange(te_end, sample_num)])
    data_train = BatchGenerator(X_train[tr_indexs], y_train[tr_indexs], shuffle=True)
    model_path = '../ckpt/' + model_name    # directory the checkpoints go to
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model_path = model_path + '/' + 'cross' + str(valid_k) + '.ckpt'
    tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # batches per epoch

    def make_feed(X_batch, y_batch, learning_rate, keep, is_test):
        """Split a [title | content] batch and build the feed dict for it."""
        return {X1_inputs: X_batch[:, :n_step1], X2_inputs: X_batch[:, n_step1:],
                y_inputs: to_categorical(y_batch), lr: learning_rate,
                batch_size: tr_batch_size, keep_prob: keep, tst: is_test,
                n_updates: global_step}

    sess.run(tf.global_variables_initializer())
    _lr = 5e-4
    for epoch in range(max_max_epoch):
        if epoch >= max_epoch:
            _lr = _lr * decay
        print('EPOCH %d， lr=%g' % (epoch+1, _lr))
        time0 = time.time()
        _costs = 0.0
        for batch in range(tr_batch_num):
            global_step += 1
            summary_step += 1
            if (batch+1) % valid_step == 0:    # periodic full validation
                valid_cost, precision, recall, f1 = valid_epoch()
                print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g ' % (
                        global_step, valid_cost, precision, recall, f1))
                if epoch >= max_epoch:
                    _lr = _lr * decay
            X_batch, y_batch = data_train.next_batch(tr_batch_size)
            feed_dict = make_feed(X_batch, y_batch, _lr, 0.5, False)
            fetches = [merged, cost, train_op, update_op]
            summary, _cost, _, _ = sess.run(fetches, feed_dict)  # _cost is the mean cost of this batch
            _costs += _cost
            # Every 100 steps: log the training summary and the summary of one
            # validation batch. (The original tested ``global_step % 100``,
            # which is true on every step that is NOT a multiple of 100.)
            if global_step % 100 == 0:
                train_writer.add_summary(summary, summary_step)
                X_batch, y_batch = data_valid.next_batch(tr_batch_size)
                feed_dict = make_feed(X_batch, y_batch, 1e-5, 1.0, True)
                summary, _cost = sess.run([merged, cost], feed_dict)
                test_writer.add_summary(summary, summary_step)
        valid_cost, precision, recall, f1 = valid_epoch()  # full validation at the end of every epoch
        mean_cost = _costs / tr_batch_num
        print('Globel_step=%d. Training cost=%g; Valid cost=%g; p=%g, r=%g, f1=%g; Speed=%g s/epoch' % (
            global_step, mean_cost, valid_cost, precision, recall, f1, time.time()-time0) )
        if (epoch + 1) % save_epoch == 0:  # save a checkpoint every save_epoch epochs
            model_save_path = saver.save(sess, model_path, global_step=(epoch+1))
            print('CKPT path is %s ' % model_save_path)
    print('Finished training, valid_k=%d' % valid_k)


In [6]:
# Train the K cross-validation models one after another, leaving out slice
# valid_k each time.
for valid_k in xrange(K):
    print('MODEL %d/%d' % (valid_k,K-1))
    train(valid_k, max_max_epoch=6)

MODEL 0/2
***Begin training, valid_k=0, [0,966650]
EPOCH 1， lr=0.0005
Global_step=6000: valid cost=0.00508873; p=1.16231, r=0.476933, f1=0.338171 
Globel_step=7551. Training cost=0.00901025; Valid cost=0.00490458; p=1.21397, r=0.496749, f1=0.352506; Speed=2800.03 s/epoch
CKPT path is ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-1 
EPOCH 2， lr=0.0004
Global_step=13551: valid cost=0.00446955; p=1.31376, r=0.536772, f1=0.381074 
Globel_step=15102. Training cost=0.00435597; Valid cost=0.00432388; p=1.32351, r=0.541196, f1=0.384123; Speed=2721.76 s/epoch
CKPT path is ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-2 
EPOCH 3， lr=0.000256
Global_step=21102: valid cost=0.00441031; p=1.34665, r=0.550044, f1=0.390531 
Globel_step=22653. Training cost=0.00395624; Valid cost=0.00421323; p=1.35773, r=0.554391, f1=0.393654; Speed=2558.2 s/epoch
CKPT path is ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-3 
EPOCH 4， lr=0.00016384
Global_step=286

## 没有收敛，继续训练

In [4]:
valid_step = 6000        # run a full validation every valid_step batches
saver = tf.train.Saver(max_to_keep=45)           # maximum number of checkpoints kept
sample_num = len(X_train)         # number of training samples
K = 3   # the training data is divided into 3 slices
split_size = int(sample_num / K)  # size of each slice
summary_step = 0
global_step = 0
valid_step = 6000        # same value as assigned at the top of this cell
decay = 0.75             # learning-rate decay factor for the continued training

def add_train(valid_k, add_epoch=3, pre_model_num=6):
    """Continue training a cross-validation model from a saved checkpoint.

    Restores ``../ckpt/<model_name>/cross<valid_k>.ckpt-<pre_model_num>`` and
    trains for ``add_epoch`` more epochs on every sample outside the
    ``valid_k``-th slice. The learning rate starts at 1e-4 and is multiplied
    by ``decay`` at the start of every epoch and after every in-epoch
    validation. Whenever the validation f1 beats the best value seen so far,
    a checkpoint with the next free number is saved.

    Args:
        valid_k: index of the slice to leave out, one of 0, 1, ..., K-1.
        add_epoch: number of additional epochs.
        pre_model_num: number of the checkpoint to resume from; new
            checkpoints are numbered ``pre_model_num + 1`` onwards.
    """
    global summary_step   # both counters are reassigned below, hence ``global``
    global global_step
    # NOTE(review): hard-coded step count of the earlier run; presumably it
    # should follow from pre_model_num -- confirm before changing.
    global_step = 45306
    te_start = valid_k * split_size
    te_end = min(te_start + split_size, sample_num)
    print('***Begin training, valid_k=%d, [%d,%d]' % (valid_k, te_start, te_end))
    # Indices of every sample outside the held-out slice [te_start, te_end).
    tr_indexs = np.concatenate([np.arange(te_start), np.arange(te_end, sample_num)])
    data_train = BatchGenerator(X_train[tr_indexs], y_train[tr_indexs], shuffle=True)
    model_path = '../ckpt/' + model_name    # directory the checkpoints go to
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model_path = model_path + '/' + 'cross' + str(valid_k) + '.ckpt'
    tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # batches per epoch
    # Restore the previously trained model. The original ignored
    # ``pre_model_num`` and always used 6 (which is also the default).
    model_num = pre_model_num
    best_model_path = model_path + '-' + str(model_num)
    saver.restore(sess, best_model_path)
    # f1 a new checkpoint has to beat. NOTE(review): hard-coded, presumably
    # the f1 reached by the restored model.
    last_f1 = 0.402

    def make_feed(X_batch, y_batch, learning_rate, keep, is_test):
        """Split a [title | content] batch and build the feed dict for it."""
        return {X1_inputs: X_batch[:, :n_step1], X2_inputs: X_batch[:, n_step1:],
                y_inputs: to_categorical(y_batch), lr: learning_rate,
                batch_size: tr_batch_size, keep_prob: keep, tst: is_test,
                n_updates: global_step}

    print('Finished loading model.')
    _lr = 1e-4
    for epoch in range(add_epoch):
        _lr = _lr * decay
        print('EPOCH %d， lr=%g' % (epoch+1, _lr))
        time0 = time.time()
        _costs = 0.0
        for batch in range(tr_batch_num):
            global_step += 1
            summary_step += 1
            if (batch+1) % valid_step == 0:    # periodic full validation
                valid_cost, precision, recall, f1 = valid_epoch()
                print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g ' % (
                        global_step, valid_cost, precision, recall, f1))
                if f1 > last_f1:
                    last_f1 = f1
                    model_num += 1
                    save_path = saver.save(sess, model_path, global_step=model_num)
                    print('the save path is ', save_path)
                _lr = _lr * decay
                print('===>_lr=%g' % _lr)
            X_batch, y_batch = data_train.next_batch(tr_batch_size)
            feed_dict = make_feed(X_batch, y_batch, _lr, 0.5, False)
            fetches = [merged, cost, train_op, update_op]
            summary, _cost, _, _ = sess.run(fetches, feed_dict)  # _cost is the mean cost of this batch
            _costs += _cost
            # Every 100 steps: log the training summary and the summary of one
            # validation batch. (The original tested ``global_step % 100``,
            # which is true on every step that is NOT a multiple of 100.)
            if global_step % 100 == 0:
                train_writer.add_summary(summary, summary_step)
                X_batch, y_batch = data_valid.next_batch(tr_batch_size)
                feed_dict = make_feed(X_batch, y_batch, 1e-5, 1.0, True)
                summary, _cost = sess.run([merged, cost], feed_dict)
                test_writer.add_summary(summary, summary_step)
        valid_cost, precision, recall, f1 = valid_epoch()  # full validation at the end of every epoch
        mean_cost = _costs / tr_batch_num
        print('Globel_step=%d. Training cost=%g; Valid cost=%g; p=%g, r=%g, f1=%g; Speed=%g s/epoch' % (
            global_step, mean_cost, valid_cost, precision, recall, f1, time.time()-time0) )
        if f1 > last_f1:
            last_f1 = f1
            model_num += 1
            save_path = saver.save(sess, model_path, global_step=model_num)
            print('the save path is ', save_path)
    print('Finished training, valid_k=%d' % valid_k)


In [5]:
# Continue training each of the K cross-validation models in turn.
for valid_k in xrange(K):
    print('MODEL %d/%d' % (valid_k,K-1))
    add_train(valid_k)

MODEL 0/2
***Begin training, valid_k=0, [0,966650]
INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-6
Finished loading model.
EPOCH 1， lr=7.5e-05
Global_step=51306: valid cost=0.00417318; p=1.38143, r=0.563478, f1=0.400228 
===>_lr=5.625e-05
Global_step=57306: valid cost=0.00412471; p=1.38439, r=0.564188, f1=0.400834 
===>_lr=4.21875e-05
Globel_step=60409. Training cost=0.00350541; Valid cost=0.00409828; p=1.38915, r=0.566374, f1=0.402336; Speed=4837.94 s/epoch
('the save path is ', '../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-7')
EPOCH 2， lr=3.16406e-05
Global_step=66409: valid cost=0.00410749; p=1.38773, r=0.565415, f1=0.401734 
===>_lr=2.37305e-05
Global_step=72409: valid cost=0.00409201; p=1.38973, r=0.566048, f1=0.402221 
===>_lr=1.77979e-05
Globel_step=75512. Training cost=0.00340655; Valid cost=0.00408039; p=1.38975, r=0.566194, f1=0.402296; Speed=4882.3 s/epoch
EPOCH 3， lr=1.33484e-05
Global_step=815

KeyboardInterrupt: 

## 本地测试

In [7]:
sys.path.append('..')
from evaluator import score_eval

# NOTE(review): local_predict() below reads X_valid and marked_labels_list as
# globals; the two commented-out lines presumably load them from disk when
# the data-loading cell has not been run -- confirm the files exist.
# X_valid = np.load('../data/X_valid.npy')
# marked_labels_list = np.load('../data/marked_labels_list.npy')
saver = tf.train.Saver()


def local_predict(best_model, local_scores_path):
    """Score the validation set with a saved model and store the raw scores.

    Restores ``best_model``, runs ``X_valid`` through the network in batches
    of 256 in test mode, prints precision/recall/f1 of the top-5 predictions
    against ``marked_labels_list`` and saves the score matrix (one row per
    sample, one column per class) with ``np.save``.

    Args:
        best_model: checkpoint path to restore.
        local_scores_path: path of the ``.npy`` file the scores are saved to.
    """
    saver.restore(sess, best_model)
    time0 = time.time()
    n_sample = X_valid.shape[0]  # number of samples to score
    _batch_size = 256
    predict_labels_list = []   # top-5 class indices for every sample
    predict_scores = []        # one score matrix per batch
    for start in range(0, n_sample, _batch_size):
        # Slicing clamps at the end of the array, so the last batch may be
        # shorter than _batch_size.
        X_batch = X_valid[start:start + _batch_size]
        feed_dict = {X1_inputs: X_batch[:, :n_step1], X2_inputs: X_batch[:, n_step1:], lr: 1e-5,
                     batch_size: X_batch.shape[0], keep_prob: 1.0, tst: True, n_updates: global_step}
        scores = sess.run(y_pred, feed_dict)
        predict_scores.append(scores)
        # indices of the 5 largest scores, best first
        predict_labels_list.extend([row.argsort()[-1:-6:-1] for row in scores])
    # Stack the per-batch matrices row-wise. np.asarray on this list would
    # not give a 2-D matrix because the last batch can have fewer rows.
    predict_scores = np.vstack(predict_scores)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('local valid p=%g, r=%g, f1=%g; speed=%g s/epoch' % ( precision, recall, f1, time.time()-time0) )
    np.save(local_scores_path, predict_scores)
    print('Writed the scores into %s, time %g s' % (local_scores_path, time.time()-time0))
    
best_epochs = [6,6,6]    # checkpoint number to load for each cross-validation model
K = 3                    # there are three models in total
for k in xrange(K):
    time0 = time.time()
    print('***Local Predicting %d/%d' % (k,K))
    best_model = '../ckpt/' + model_name + '/cross' + str(k) +'.ckpt'+ '-' + str(best_epochs[k])  # checkpoint to restore
    local_scores_path = '../local_scores/' + model_name + 'cross' + str(k) + '.npy'
    local_predict(best_model, local_scores_path)
    print('Finished cross%d, costed time %g s' % (k, time.time()-time0))
print('**Finished ALL.')

***Local Predicting 0/3
INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-6
local valid p=1.38878, r=0.565999, f1=0.402116; speed=38.8857 s/epoch
Writed the scores into ../local_scores/textcnn-fc-drop-title-content-256-345-cross3cross0.npy, time 39.674 s
Finished cross0, costed time 54.488 s
***Local Predicting 1/3
INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross1.ckpt-6
local valid p=1.38853, r=0.56587, f1=0.40203; speed=41.8943 s/epoch
Writed the scores into ../local_scores/textcnn-fc-drop-title-content-256-345-cross3cross1.npy, time 49.899 s
Finished cross1, costed time 63.6007 s
***Local Predicting 2/3
INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross2.ckpt-6
local valid p=1.38803, r=0.56593, f1=0.402018; speed=37.5196 s/epoch
Writed the scores into ../local_scores/textcnn-fc-drop-title-content-256-345-cross3cross2.npy, time

## 对测试数据进行预测

In [4]:
# Saver used by predict() to restore the saved models.
from tqdm import tqdm
saver = tf.train.Saver()

# Score the test data.
def predict(cross_num):
    """Score the test set with one cross-validation model and save the scores.

    Restores checkpoint 6 of model ``cross_num``, loads the test titles and
    contents from ``../data``, runs them through the network in batches of
    500 in test mode and saves the stacked score matrix (one row per sample,
    one column per class) as an ``.npy`` file.

    Args:
        cross_num: index of the cross-validation model to use.
    """
    saver.restore(sess, '../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross'+str(cross_num)+'.ckpt-6')
    scores_path = '/home2/huangyongye/zhihu/scores/textcnn-fc-drop-title-content-256-345-cross3cross'+str(cross_num)+'.npy'
    X1_te = np.load('../data/X_te_title_50.npy')
    X2_te = np.load('../data/X_te_content_150.npy')
    X_te = np.hstack([X1_te, X2_te])
    n_sample = X_te.shape[0]  # number of test samples
    _batch_size = 500
    predict_scores = []       # one score matrix per batch
    for start in tqdm(range(0, n_sample, _batch_size)):
        # Slicing clamps at the end of the array, so the last batch may be
        # shorter than _batch_size.
        X_batch = X_te[start:start + _batch_size]
        feed_dict = {X1_inputs: X_batch[:, :n_step1], X2_inputs: X_batch[:, n_step1:], lr: 1e-5,
                     batch_size: X_batch.shape[0], keep_prob: 1.0, tst: True, n_updates: global_step}
        predict_scores.append(sess.run(y_pred, feed_dict))
    # Stack directly: an intermediate np.asarray on batches of unequal length
    # does not produce a regular array.
    predict_scores = np.vstack(predict_scores)
    np.save(scores_path, predict_scores)
    print('Finished saving the scores!predict_scores.shape=',predict_scores.shape)    

In [5]:
cross_nums = [0,1,2]    # indices of the cross-validation models to predict with
for cross_num in cross_nums:
    predict(cross_num)

INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross0.ckpt-6


100%|██████████| 435/435 [00:19<00:00, 22.79it/s]


('Finished saving the scores!predict_scores.shape=', (217360, 1999))
INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross1.ckpt-6


100%|██████████| 435/435 [00:18<00:00, 24.23it/s]


('Finished saving the scores!predict_scores.shape=', (217360, 1999))
INFO:tensorflow:Restoring parameters from ../ckpt/textcnn-fc-drop-title-content-256-345-cross3/cross2.ckpt-6


100%|██████████| 435/435 [00:18<00:00, 24.04it/s]


('Finished saving the scores!predict_scores.shape=', (217360, 1999))


## 在全部预测正确的情况下，理论值为：f1=0.713933
precision=2.50273, recall=0.998873, f1=0.713933

In [26]:
# How high can f1 get if every prediction is correct?
def padding_label(labels):
    """Return exactly five labels: truncate longer inputs, pad shorter ones with -1."""
    missing = 5 - len(labels)
    if missing <= 0:
        return labels[:5]
    filler = np.full(missing, -1, dtype=int)
    return np.hstack([labels, filler])
    

# Use the ground-truth labels themselves (cut or padded to 5 per sample) as
# the "predictions" and score them against the ground truth.
marked_labels_list = data_valid.y.tolist() # all ground-truth labels
predict_labels_list = map(padding_label, marked_labels_list)
predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
print '在全部预测正确的情况下，理论值为：'
print 'precision=%g, recall=%g, f1=%g' % (precision, recall, f1)

在全部预测正确的情况下，理论值为：
precision=2.50273, recall=0.998873, f1=0.713933
