FastText, embedding 求均值；在后面 +fc+bn+dropout+sigmoid分类。

参考代码： 

https://github.com/brightmart/text_classification/blob/master/a01_FastText/p5_fastTextB_model.py

- 每个模型保存到对应的模型位置
- 每个模型生成一个 scores 分数矩阵用于后面进行模型融合

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from gensim.models import KeyedVectors
import pickle
import os
import sys
import shutil
import time


time0 = time.time()
print('Starting ...')
model_name = 'm1-fasttext-512'                    # 模型名称
W_embedding = np.load('../data/W_embedding.npy')            # 导入预训练好的词向量
model_path = '../ckpt/' + model_name + '/'                  # 模型保存位置
summary_path = '../summary/' + model_name + '/'             # summary 位置
result_path = '../result/' + model_name + '.csv'            # result.csv 位置
scores_path = '../scores/' + model_name + '.npy'            # scores.npy 位置
local_scores_path = '../local_scores/' + model_name + '.npy'


if not os.path.exists(model_path):
    os.makedirs(model_path)         
model_path = model_path + 'model.ckpt'
if os.path.exists(summary_path):   # 删除原来的 summary 文件，避免重合
    print('removed the existing summary files.')
    shutil.rmtree(summary_path)
os.makedirs(summary_path)          # 然后再次创建
    
# ##################### config ######################
n_step1 = max_len1 = 30                   # title句子长度
n_step2= max_len2 = 150                   # content 长度
input_size = embedding_size = 256       # 字向量长度
n_class = 1999                          # 类别总数
fc_hidden_size = 512                   # fc 层节点数
global_step = 0
valid_num = 100000
seed_num = 13
tr_batch_size = 128      
te_batch_size = 128 
print('Prepared, costed time %g s.' % (time.time() - time0))

Starting ...
removed the existing summary files.
Prepared, costed time 0.208622 s.


In [2]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers

'''
FastText，知乎问题多标签分类。
'''
print('Building model ...')
lr = tf.placeholder(tf.float32)
keep_prob = tf.placeholder(tf.float32, [])
batch_size = tf.placeholder(tf.int32, [])  # 注意类型必须为 tf.int32
tst = tf.placeholder(tf.bool)
n_updates = tf.placeholder(tf.int32)      # training iteration,传入 bn 层
update_emas = list()   # BN 层中所有的更新操作


def weight_variable(shape, name):
    """Create a weight variable with appropriate initialization."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)

def bias_variable(shape, name):
    """Create a bias variable with appropriate initialization."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial, name=name)

def batchnorm(Ylogits, is_test, num_updates, offset, convolutional=False):
    """batchnormalization.
    Args:
        Ylogits: 1D向量或者是3D的卷积结果。
        num_updates: 迭代的global_step
        offset：表示beta，全局均值；在 RELU 激活中一般初始化为 0.1。
        scale：表示lambda，全局方差；在 sigmoid 激活中需要，这 RELU 激活中作用不大。
        m: 表示batch均值；v:表示batch方差。
        bnepsilon：一个很小的浮点数，防止除以 0.
    Returns:
        Ybn: 和 Ylogits 的维度一样，就是经过 Batch Normalization 处理的结果。
        update_moving_everages：更新mean和variance，主要是给最后的 test 使用。
    """
    exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, num_updates) # adding the iteration prevents from averaging across non-existing iterations
    bnepsilon = 1e-5
    if convolutional:
        mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
    else:
        mean, variance = tf.nn.moments(Ylogits, [0])
    update_moving_everages = exp_moving_avg.apply([mean, variance])
    m = tf.cond(is_test, lambda: exp_moving_avg.average(mean), lambda: mean)
    v = tf.cond(is_test, lambda: exp_moving_avg.average(variance), lambda: variance)
    Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
    return Ybn, update_moving_everages


with tf.name_scope('Inputs'):
    X1_inputs = tf.placeholder(tf.int64, [None, n_step1], name='X1_input')
    X2_inputs = tf.placeholder(tf.int64, [None, n_step2], name='X2_input')
    y_inputs = tf.placeholder(tf.float32, [None, n_class], name='y_input')    

with tf.name_scope('embedding_layer'):
    embedding = tf.get_variable(name="W_embedding", shape=W_embedding.shape, 
                        initializer=tf.constant_initializer(W_embedding), trainable=True)   # fine-tune

def fasttext(X_inputs):
    """ X_inputs->embedding->average->fc+bn+relu+dropout
    """
    # X_inputs.shape = [batchsize, n_step]  ->  inputs.shape = [batchsize, n_step, embedding_size]
    inputs = tf.nn.embedding_lookup(embedding, X_inputs)  
    with tf.name_scope('embedding_average'):
        inputs = tf.reduce_mean(inputs, axis=1)   # [batch_size, embedding_size]
    with tf.name_scope('fc_bn_relu'):
        W_fc = weight_variable([embedding_size, fc_hidden_size], name='Weight_fc')
        tf.summary.histogram('W_fc', W_fc)
        h_fc = tf.matmul(inputs, W_fc, name='h_fc')
        beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[fc_hidden_size], name="beta_fc"))
        tf.summary.histogram('beta_fc', beta_fc)
        fc_bn, update_ema_fc = batchnorm(h_fc, tst, n_updates, beta_fc, convolutional=False)
        update_emas.append(update_ema_fc)
        fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
        fc_bn_drop = tf.nn.dropout(fc_bn_relu, keep_prob)     
    return fc_bn_drop    # shape = [-1, fc_hidden_size]
    
    
with tf.name_scope('fast-title'):
    fast_title = fasttext(X1_inputs)
with tf.name_scope('fast-content'):
    fast_content = fasttext(X2_inputs)
with tf.name_scope('out_layer'):
    fast_output = tf.concat([fast_title, fast_content], axis=1)
    W_out = weight_variable([fc_hidden_size*2, n_class], name='Weight_out') 
    tf.summary.histogram('Weight_out', W_out)
    b_out = bias_variable([n_class], name='bias_out') 
    tf.summary.histogram('bias_out', b_out)
    y_pred = tf.nn.xw_plus_b(fast_output, W_out, b_out, name='y_pred')  #每个类别的分数 scores    
with tf.name_scope('cost'):
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred, labels=y_inputs))
    tf.summary.scalar('cost', cost)

# 优化器
with tf.name_scope('AdamOptimizer'):
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(cost)
    update_op = tf.group(*update_emas)   # 更新 BN 参数

# summary
merged = tf.summary.merge_all() # summary
train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
test_writer = tf.summary.FileWriter(summary_path + 'test')
print 'Finished creating the FastText model.'

Building model ...
Finished creating the FastText model.


## 导入数据

In [3]:
sys.path.append('..')
from data_helpers import BatchGenerator
from data_helpers import to_categorical
from evaluator import score_eval

save_path = '../data/'
print('loading data...')
time0 = time.time()
X_title = np.load(save_path+'X_tr_title_30.npy')
X_content = np.load(save_path+'X_tr_content_150.npy')
X = np.hstack([X_title, X_content])
y = np.load(save_path+'y_tr.npy')
print('finished loading data, time cost %g s' % (time.time() - time0))
# 划分验证集
sample_num = X.shape[0]
np.random.seed(seed_num)
new_index = np.random.permutation(sample_num)
X = X[new_index]
y = y[new_index]
X_valid = X[:valid_num]
y_valid = y[:valid_num]
X_train = X[valid_num:]
y_train = y[valid_num:]
print('train_num=%d, valid_num=%d' % (X_train.shape[0], X_valid.shape[0]))

# 构建数据生成器
data_train = BatchGenerator(X_train, y_train, shuffle=True)
data_valid = BatchGenerator(X_valid, y_valid, shuffle=False)
print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

del X
del y
del X_train
del y_train


# valid 数据及 验证集计算
marked_labels_list = data_valid.y.tolist() # 所有的标注结果
valid_data_size = data_valid.y.shape[0]
def valid_epoch():
    """Testing or valid."""
    data_valid._index_in_epoch = 0  # 先指向第一个值
    _batch_size = te_batch_size
    fetches = [merged, cost, y_pred]   
    batch_num = int(valid_data_size / _batch_size)
    start_time = time.time()
    _costs = 0.0
    predict_labels_list = list()  # 所有的预测结果
    for i in xrange(batch_num):
        X_batch, y_batch = data_valid.next_batch(_batch_size)
        X1_batch = X_batch[:, :n_step1]
        X2_batch = X_batch[:, n_step1:]
        y_batch = to_categorical(y_batch)
        feed_dict = {X1_inputs:X1_batch, X2_inputs:X2_batch,  y_inputs:y_batch, lr:1e-5,
                     batch_size:_batch_size, keep_prob:1.0, tst:True, n_updates:global_step}
        summary, _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost    
        predict_labels = map(lambda label: label.argsort()[-1:-6:-1], predict_labels) # 取最大的5个下标
        predict_labels_list.extend(predict_labels)
    test_writer.add_summary(summary, global_step)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / batch_num
    return mean_cost, precision, recall, f1

print('Finised loading data, time %g s' % (time.time() - time0))

loading data...
finished loading data, time cost 56.9227 s
train_num=2899952, valid_num=100000
('X_train.shape=', (2899952, 180), 'y_train.shape=', (2899952,))
('X_valid.shape=', (100000, 180), 'y_valid.shape=', (100000,))
Finised loading data, time 77.2545 s


In [None]:
# -*- coding:utf-8 -*- 

import tensorflow as tf
import numpy as np

"""m1-hcnn-2345-234"""

class Settings(object):
    def __init__(self):
        self.sent_len = 30
        self.doc_len = 10
        self.sent_filter_sizes = [2,3,4,5]
        self.doc_filter_sizes = [2,3,4]
        self.n_filter = 10
        self.keep_prob = 0.5
        self.n_class = 1999
        self.fc_hidden_size = 102


class HCNN:
    """
    title: inputs->textcnn->output_title
    content: inputs->hcnn->output_content
    concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
    """

    def __init__(self, is_training, W_embedding, settings):
        self.sent_len = sent_len = settings.sent_len
        self.doc_len = doc_len = settings.doc_len
        self.sent_filter_sizes = sent_filter_sizes = settings.sent_filter_sizes
        self.doc_filter_sizes = doc_filter_sizes = settings.doc_filter_sizes
        self.n_filter = n_filter = settings.n_filter
        self.keep_prob = keep_prob = settings.keep_prob
        if not is_training:
            self.keep_prob = keep_prob = 1.0
        self.n_class = n_class = settings.n_class
        self.fc_hidden_size = fc_hidden_size = settings.fc_hidden_size
        self.is_training = is_training
        self.tst = tf.placeholder(tf.bool)
        self.n_updates = tf.placeholder(tf.int32)
        self.update_emas = list()
        self.batch_size = batch_size = tf.placeholder(tf.int32, [])
        with tf.name_scope('Inputs'):
            self.X1_inputs = tf.placeholder(tf.int64, [None, sent_len], name='X1_inputs')
            self.X2_inputs = tf.placeholder(tf.int64, [None, doc_len*sent_len], name='X2_inputs')
            self.y_inputs = tf.placeholder(tf.float32, [None, n_class], name='y_input') 
        with tf.name_scope('embedding'):
            self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 
                                initializer=tf.constant_initializer(W_embedding), trainable=True)  
        self.embedding_size = embed_size = W_embedding.shape[1]								
        with tf.name_scope('cnn_text'):
            output_title = self.cnn_reference(self.X1_inputs)
        with tf.name_scope('hcnn_content'):
            output_content = self.han_reference(self.X2_inputs)   
        with tf.name_scope('fc-bn-layer'):
            output = tf.concat([output_title, output_content], axis=1)
            output_size = n_filter*(len(sent_filter_sizes) + len(doc_filter_sizes))
            W_fc = self.weight_variable([output_size, fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
            fc_bn_drop = tf.nn.dropout(fc_bn_relu, self.keep_prob)
        with tf.name_scope('out_layer'):
            W_out = self.weight_variable([fc_hidden_size, n_class], name='Weight_out') 
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([n_class], name='bias_out') 
            tf.summary.histogram('bias_out', b_out)
            self.y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  #每个类别的分数 scores
        with tf.name_scope('loss'):
            self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.y_pred, labels=self.y_inputs))
            tf.summary.scalar('loss', self.loss)

    def weight_variable(self, shape, name):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """batchnormalization.
        Args:
            Ylogits: 1D向量或者是3D的卷积结果。
            num_updates: 迭代的global_step
            offset：表示beta，全局均值；在 RELU 激活中一般初始化为 0.1。
            scale：表示lambda，全局方差；在 sigmoid 激活中需要，这 RELU 激活中作用不大。
            m: 表示batch均值；v:表示batch方差。
            bnepsilon：一个很小的浮点数，防止除以 0.
        Returns:
            Ybn: 和 Ylogits 的维度一样，就是经过 Batch Normalization 处理的结果。
            update_moving_everages：更新mean和variance，主要是给最后的 test 使用。
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self.n_updates) # adding the iteration prevents from averaging across non-existing iterations
        bnepsilon = 1e-5
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def textcnn(self, X_inputs, n_step, filter_sizes, embed_size):
        """build the TextCNN network.
        n_step: the sentence len."""
        inputs = tf.expand_dims(X_inputs, -1)
        pooled_outputs = list()
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embed_size, 1, self.n_filter]
                W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter")
                beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter], name="beta"))
                tf.summary.histogram('beta', beta)
                conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)    # 在激活层前面加 BN
                # Apply nonlinearity, batch norm scaling is not useful with relus
                # batch norm offsets are used instead of biases,使用 BN 层的 offset，不要 biases
                h = tf.nn.relu(conv_bn, name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h,ksize=[1, n_step - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],padding='VALID',name="pool")
                pooled_outputs.append(pooled)
                self.update_emas.append(update_ema)
        h_pool = tf.concat(pooled_outputs, 3)
        n_filter_total = self.n_filter * len(filter_sizes)
        h_pool_flat = tf.reshape(h_pool, [-1, n_filter_total])
        return h_pool_flat    # shape = [-1, n_filter_total]

    def cnn_reference(self, X_inputs):
        """TextCNN 模型。title部分。
        Args:
            X_inputs: tensor.shape=(batch_size, title_len)
        Returns:
            title_outputs: tensor.shape=(batch_size, n_filter*filter_num_sent)
        """
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
        with tf.variable_scope('title_encoder'):  # 生成 title 的向量表示
            title_outputs = self.textcnn(inputs, self.sent_len, self.sent_filter_sizes, embed_size=self.embedding_size)
        return title_outputs   # shape = [batch_size, n_filter*filter_num_sent]

    def han_reference(self, X_inputs):
        """分层 TextCNN 模型。content部分。
        Args:
            X_inputs: tensor.shape=(batch_size, doc_len*sent_len)
        Returns:
            doc_attn_outputs: tensor.shape=(batch_size, n_filter*filter_num_doc)
        """
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)    # inputs.shape=[batch_size, doc_len*sent_len, embedding_size]
        sent_inputs = tf.reshape(inputs,[self.batch_size*self.doc_len, self.sent_len, self.embedding_size]) # [batch_size*doc_len, sent_len, embedding_size]
        with tf.variable_scope('sentence_encoder'):  # 生成句向量
            sent_outputs = self.textcnn(sent_inputs, self.sent_len, self.sent_filter_sizes, self.embedding_size)
        with tf.variable_scope('doc_encoder'):  # 生成文档向量
            doc_inputs = tf.reshape(sent_outputs, [self.batch_size, self.doc_len, self.n_filter*len(self.sent_filter_sizes)]) # [batch_size, doc_len, n_filter*len(filter_sizes_sent)]
            doc_outputs = self.textcnn(doc_inputs, self.doc_len, self.doc_filter_sizes, self.n_filter*len(self.sent_filter_sizes))  # [batch_size, doc_len, n_filter*filter_num_doc]
        return doc_outputs    # [batch_size,  n_filter*len(doc_filter_sizes)]


# test the model 
# def test():
#     settings = Settings()
#     is_training = True
#     W_embedding = np.load('../../data/W_embedding.npy')
#     config = tf.ConfigProto()
#     config.gpu_options.allow_growth = True
#     batch_size = 128
#     summary_path = 'test_summary/'
#     with tf.Session(config=config) as sess:
#         hcnn = HCNN(is_training, W_embedding, settings)
#         optimizer = tf.train.AdamOptimizer(0.001)
#         train_op = optimizer.minimize(hcnn.loss)
#         update_op = tf.group(*hcnn.update_emas)
#         merged = tf.summary.merge_all() # summary
#         train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
#         test_writer = tf.summary.FileWriter(summary_path + 'test')
#         sess.run(tf.global_variables_initializer())
#         fetch = [hcnn.loss, hcnn.y_pred, train_op, update_op]
#         loss_list = list()
#         for i in xrange(100):
#             global_step = i
#             X1_batch = np.zeros((batch_size, 30), dtype=float)
#             X2_batch = np.zeros((batch_size, 10*30), dtype=float)
#             y_batch = np.zeros((batch_size, 1999), dtype=int)
#             _batch_size = len(y_batch)
#             feed_dict = {hcnn.X1_inputs: X1_batch, hcnn.X2_inputs: X2_batch, hcnn.y_inputs: y_batch,
#             hcnn.batch_size: _batch_size, hcnn.n_updates: global_step, hcnn.tst: not is_training}
#             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
#             loss_list.append(loss)
#             print(i, loss)

# test()





##  模型训练

In [4]:
_lr = 2e-4
decay = 0.65             # 每个 epoch lr降低的系数
valid_step = 8000        # 每 valid_step 就进行一次 valid 运算
max_epoch = 1
max_max_epoch = 6       # 最多迭代的次数
tr_batch_num = int(data_train.y.shape[0] / tr_batch_size)  # 每个 epoch 中包含的 batch 数
print('tr_batch_num=%d' % tr_batch_num)
saver = tf.train.Saver(max_to_keep=3)           # 最多保存的模型数量
sess.run(tf.global_variables_initializer())
last_f1 = 0.40
model_num = 0

for epoch in xrange(max_max_epoch):
    if epoch >= max_epoch:
        _lr = _lr * decay
    print 'EPOCH %d， lr=%g' % (epoch+1, _lr)
    time0 = time.time()
    start_time = time.time()
    _costs = 0.0
    for batch in xrange(tr_batch_num): 
        global_step += 1
        if (batch+1) % valid_step == 0:    # 进行 valid 计算
            valid_cost, precision, recall, f1 = valid_epoch()
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g ' % (
                    global_step, valid_cost, precision, recall, f1))
            if (f1 > last_f1):
                last_f1 = f1
                model_num += 1
                save_path = saver.save(sess, model_path, global_step=model_num)
                print('the save path is ', save_path) 
            if epoch >= max_epoch:
                _lr = _lr * decay
                print('===>_lr=%g' % _lr)
        X_batch, y_batch = data_train.next_batch(tr_batch_size)
        X1_batch = X_batch[:, :n_step1]
        X2_batch = X_batch[:, n_step1:]
        y_batch = to_categorical(y_batch)
        feed_dict = {X1_inputs:X1_batch, X2_inputs:X2_batch,  y_inputs:y_batch, lr:_lr,
                     batch_size:tr_batch_size, keep_prob:0.8, tst:False, n_updates:global_step}
        fetches = [merged, cost, train_op, update_op]
        summary, _cost, _, _ = sess.run(fetches, feed_dict) # the cost is the mean cost of one batch
        _costs += _cost
        if global_step % 100:  #　验证一次
            train_writer.add_summary(summary, global_step)
            X_batch, y_batch = data_valid.next_batch(tr_batch_size)
            X1_batch = X_batch[:, :n_step1]
            X2_batch = X_batch[:, n_step1:]
            y_batch = to_categorical(y_batch)
            feed_dict = {X1_inputs:X1_batch, X2_inputs:X2_batch,  y_inputs:y_batch, lr:1e-5,
                         batch_size:tr_batch_size, keep_prob:1.0, tst:True, n_updates:global_step}
            fetches = [merged, cost]
            summary, _cost = sess.run(fetches, feed_dict)
            test_writer.add_summary(summary, global_step)
    valid_cost, precision, recall, f1 = valid_epoch()  # #每个 epoch 结束后进行一次整体 valid
    mean_cost = _costs / tr_batch_num
    print('Global_step=%d; training cost=%g; valid cost=%g; p=%g, r=%g, f1=%g; speed=%g s/epoch' % (
        global_step, mean_cost, valid_cost, precision, recall, f1, time.time()-time0) )
    if (f1 > last_f1):
        last_f1 = f1
        model_num += 1
        save_path = saver.save(sess, model_path, global_step=model_num)
        print('the save path is ', save_path)  

tr_batch_num=22655
EPOCH 1， lr=0.0002
Global_step=8000: valid cost=0.00498839; p=1.09176, r=0.452515, f1=0.319915 
Global_step=16000: valid cost=0.00447502; p=1.24505, r=0.510586, f1=0.362094 
Global_step=22655; training cost=0.00709909; valid cost=0.00434789; p=1.29103, r=0.528798, f1=0.375142; speed=6902.36 s/epoch
EPOCH 2， lr=0.00013
Global_step=30655: valid cost=0.00429197; p=1.31786, r=0.539106, f1=0.382596 
===>_lr=8.45e-05
Global_step=38655: valid cost=0.00419256; p=1.33442, r=0.546142, f1=0.387535 
===>_lr=5.4925e-05
Global_step=45310; training cost=0.00421866; valid cost=0.00416467; p=1.34334, r=0.549342, f1=0.389898; speed=6962.6 s/epoch
EPOCH 3， lr=3.57013e-05
Global_step=53310: valid cost=0.00414769; p=1.34816, r=0.550959, f1=0.391118 
===>_lr=2.32058e-05


KeyboardInterrupt: 

In [5]:
valid_cost, precision, recall, f1 = valid_epoch()  # #每个 epoch 结束后进行一次整体 valid
print('valid cost=%g; p=%g, r=%g, f1=%g; speed=%g s/epoch' % (valid_cost, precision, recall, f1, time.time()-time0) )
save_path = saver.save(sess, model_path, global_step=(epoch+1))
print('the save path is %s ' % save_path) 

valid cost=0.00400637; p=1.42095, r=0.579586, f1=0.411671; speed=7431.4 s/epoch
the save path is ../ckpt/textcnn-fc-drop-title-content-256-23457/model.ckpt-6 


- Bi-LSTM 模型<br/>
**batch_size=256，迭代12个epoch，基本收敛。结果： 验证集 f1=0.38618； 提交结果 0.3873186**
- Bi-GRU 模型<br/>
**batch_size=256，迭代15个epoch。大概在 13 个 epoch 就收敛了。结果： 验证集 f1=0.390534； 提交结果 0.39198**
- attention-Bi-GRU 模型<br/>
**batch_size=256，迭代18个epoch。在16个epoch收敛。结果：验证集 f1=f1=0.391734 ；提交结果 0.39310**


- textcnn-256 lr=1e-3, decay=0.65, dropout=0.5, 迭代6次基本收敛， f1=0.388
- textcnn-256-bn lr=1.5E-3，decay=0.65, dropout=0.5, 6次基本收敛，8次好一点， f1=0.389
- textcnn-fc-drop 迭代 4.5 个epoch 收敛。

In [5]:
model_num += 1
save_path = saver.save(sess, model_path, global_step=model_num)
print('the save path is ', save_path)  

('the save path is ', '../ckpt/m1-fasttext-512/model.ckpt-1')


## 若没有收敛，继续迭代

In [None]:
_lr=2e-4
global_step=50185
add_epoch = 2   # 增加迭代的次数
for epoch in xrange(max_max_epoch,max_max_epoch+add_epoch):
    fetches = [merged, cost, train_op]
    _lr = _lr * decay
    print 'EPOCH %d， lr=%g' % (epoch+1, _lr)
    time0 = time.time()
    start_time = time.time()
    _costs = 0.0
    show_costs = 0.0
    for batch in xrange(tr_batch_num): 
        global_step += 1
        if (batch+1) % valid_step == 0:    # 进行 valid 计算
            valid_cost, precision, recall, f1 = valid_epoch()
            print('global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g ' % (
                    global_step+1, valid_cost, precision, recall, f1))
        X_batch, y_batch = data_train.next_batch(tr_batch_size)
        y_batch = to_categorical(y_batch)
        feed_dict = {X_inputs:X_batch, y_inputs:y_batch, lr:_lr, batch_size:tr_batch_size, keep_prob:0.5, tst:False, n_updates:global_step}
        summary, _cost, _ = sess.run(fetches, feed_dict) # the cost is the mean cost of one batch
        _costs += _cost
        if global_step % 100:
            train_writer.add_summary(summary, global_step)
    valid_cost, precision, recall, f1 = valid_epoch()  # # 每个 epoch 进行一次验证 valid
    mean_cost = _costs / tr_batch_num
    print('training cost=%g; valid cost=%g; p=%g, r=%g, f1=%g; speed=%g s/epoch' % (
        mean_cost, valid_cost, precision, recall, f1, time.time()-time0) )
    if (epoch + 1) % save_epoch == 0:  # 每 2 个 epoch 保存一次模型
        save_path = saver.save(sess, model_path, global_step=(epoch+1))
        print('the save path is ', save_path) 

## 本地测试
使用 seed13 的前 10万条数据.

In [17]:
np.save('../data/X_valid.npy', X_valid)

In [10]:
# 保存 本地测试的标注数据
np.save('../data/marked_labels_list.npy', marked_labels_list)

In [6]:
local_scores_path = '../local_scores/' + model_name + '.npy'

In [13]:
# 导入保存好的模型
# saver = tf.train.Saver()
# best_model_path = model_path + '-' + str(8)  # 导入最优模型
# saver.restore(sess, best_model_path)
# print('Finished loading model.')

# 导入测试数据
def local_predict(scores_path=local_scores_path):
    """预测  valid 结果，并保存预测概率 到  scores.csv 文件中。"""
    print('local predicting ...')
    time0 = time.time()
    X_te = X_valid
    n_sample = X_te.shape[0]  # 测试样本数量
    _batch_size = 200
    fetches = [y_pred]   
    predict_labels_list = list()  # 所有的预测结果
    predict_scores = list()
    for i in xrange(0, n_sample, _batch_size):
        start = i
        end = start+_batch_size
        if end > n_sample:
            end = n_sample
            _batch_size = end - start
        X_batch = X_te[start:end]
        X1_batch = X_batch[:, :n_step1]
        X2_batch = X_batch[:, n_step1:]
        feed_dict = {X1_inputs:X1_batch, X2_inputs:X2_batch, lr:1e-5,
                     batch_size:_batch_size, keep_prob:1.0, tst:True, n_updates:global_step}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
        predict_labels = map(lambda label: label.argsort()[-1:-6:-1], predict_labels) # 取最大的5个下标
        predict_labels_list.extend(predict_labels)
    predict_scores = np.asarray(predict_scores)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('local valid p=%g, r=%g, f1=%g; speed=%g s/epoch' % ( precision, recall, f1, time.time()-time0) )
    np.save(local_scores_path, predict_scores)
    print('Writed the scores into %s, time %g s' % (local_scores_path, time.time()-time0))
    
local_predict()

local predicting ...
local valid p=1.42094, r=0.579544, f1=0.411649; speed=57.3602 s/epoch
Writed the scores into ../local_scores/textcnn-fc-drop-title-content-256-23457.npy, time 58.9522


## 对测试数据进行预测

In [14]:
# 导入保存好的模型
# saver = tf.train.Saver()
# best_model_path = model_path + '-' + str(8)  # 导入最优模型
# saver.restore(sess, best_model_path)
# print('Finished loading model.')

# 导入测试数据
def predict():
    """预测测试集结果，并保存到  result.csv 文件中。"""
    X1_te = np.load('../data/X_te_title_50.npy')
    X2_te = np.load('../data/X_te_content_150.npy')
    X_te = np.hstack([X1_te, X2_te])
    n_sample = X_te.shape[0]  # 测试样本数量
    _batch_size = 200
    fetches = [y_pred]   
    predict_labels_list = list()  # 所有的预测结果
    predict_scores = list()
    for i in xrange(0, n_sample, _batch_size):
        start = i
        end = start+_batch_size
        if end > n_sample:
            end = n_sample
            _batch_size = end - start
        X_batch = X_te[start:end]
        X1_batch = X_batch[:, :n_step1]
        X2_batch = X_batch[:, n_step1:]
        feed_dict = {X1_inputs:X1_batch, X2_inputs:X2_batch, lr:1e-5,
                     batch_size:_batch_size, keep_prob:1.0, tst:True, n_updates:global_step}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
        predict_labels = map(lambda label: label.argsort()[-1:-6:-1], predict_labels) # 取最大的5个下标
        predict_labels_list.extend(predict_labels)
    predict_scores = np.asarray(predict_scores)
    return predict_labels_list, np.vstack(predict_scores)

def write_result(predict_labels_list, result_path):
    """把结果写到 result.csv 中"""
    eval_question = np.load('../data/eval_question.npy')
    with open('../data/sr_topic2id.pkl', 'rb') as inp:
        sr_topic2id = pickle.load(inp)
        sr_id2topic = pickle.load(inp)
    pred_labels = np.asarray(predict_labels_list).reshape([-1])
    pred_topics = sr_id2topic[pred_labels].values.reshape([-1, 5])   # 转为 topic
    df_result = pd.DataFrame({'question':eval_question, 'tid0': pred_topics[:,0], 'tid1':pred_topics[:, 1],
                         'tid2': pred_topics[:,2], 'tid3':pred_topics[:,3],'tid4': pred_topics[:,4]})
    df_result.to_csv(result_path, index=False, header=False)
    print('Finished writing the result')
    return df_result

In [15]:
%time predict_labels_list,predict_scores = predict()
df_result = write_result(predict_labels_list, result_path=result_path) 
print('len(df_result)=',len(df_result))  # 结果应该为 217360
print('Saving the predict_scores into %s' % scores_path)
print('predict_scores.shape=',predict_scores.shape)
np.save(scores_path, predict_scores)
print('Finished saving the result!')

CPU times: user 1min 51s, sys: 1min 17s, total: 3min 8s
Wall time: 3min 1s
Finished writing the result
('len(df_result)=', 217360)
Saving the predict_scores into ../scores/textcnn-fc-drop-title-content-256-23457.npy
('predict_scores.shape=', (217360, 1999))
Finished saving the result!


In [16]:
df_result.head(5)

Unnamed: 0,question,tid0,tid1,tid2,tid3,tid4
0,6215603645409872328,-7506384235581390893,4610596224687453206,2919247920214845195,-5932391056759866388,-6306904715218704629
1,6649324930261961840,-240041917918953337,2858911571784840089,3418451812342379591,2382911985227044227,3383016985780045156
2,-4251899610700378615,2919247920214845195,-5265476641576484497,-3315241959305847628,-7358589937244777363,-429636223750539488
3,6213817087034420233,-8655945395761165989,-4966205278807386328,5804619920623030604,7476760589625268543,244937959911721367
4,-8930652370334418373,3972493657017129406,-8963554618409314978,-1115593437686158905,7951349602759061249,1870872991887862017


In [9]:
# 参考结果
df_result.head(5)

Unnamed: 0,question,tid0,tid1,tid2,tid3,tid4
0,6215603645409872328,4610596224687453206,-6839713564940654454,-6306904715218704629,2919247920214845195,-8091907016971478715
1,6649324930261961840,3418451812342379591,2858911571784840089,2382911985227044227,-240041917918953337,3383016985780045156
2,-4251899610700378615,2919247920214845195,-7358589937244777363,2816249700493135244,-5265476641576484497,-3315241959305847628
3,6213817087034420233,-8655945395761165989,5804619920623030604,-4966205278807386328,7476760589625268543,-2523521411748733187
4,-8930652370334418373,3972493657017129406,-8963554618409314978,-1115593437686158905,1870872991887862017,6018641953300645757


## 在全部预测正确的情况下，理论值为：f1=0.713933
precision=2.50273, recall=0.998873, f1=0.713933

In [26]:
# 假设全部正确，f1 值最高能到多少
def padding_label(labels):
    """把所有的label补齐到长度为 5"""
    label_len = len(labels)
    if label_len >= 5:
        return labels[:5]
    return np.hstack([labels, np.zeros(5-label_len, dtype=int) - 1])
    

marked_labels_list = data_valid.y.tolist() # 所有的标注结果
predict_labels_list = map(padding_label, marked_labels_list)
predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
print '在全部预测正确的情况下，理论值为：'
print 'precision=%g, recall=%g, f1=%g' % (precision, recall, f1)

在全部预测正确的情况下，理论值为：
precision=2.50273, recall=0.998873, f1=0.713933
