### 多标签模型定义

In [2]:
import tensorflow as tf
import numpy as np

In [3]:
class TextCNN:
    def __init__(self, num_classes, batch_size, vocab_size, embed_size, sentence_len, 
                 learning_rate, decay_step, decay_rate, filter_num, filter_sizes):
        #1.定义超参数
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.sentence_len = sentence_len
        self.learning_rate = learning_rate
        self.filter_num = filter_num
        self.filter_sizes = filter_sizes #list，如[2,3,4],表示3个卷积核的长度（height）
        self.filter_num_total = filter_num * len(filter_sizes)
        self.initializer = tf.random_normal_initializer(stddev=0.1)
        
        #epoch信息
        self.global_epoch = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_epoch') #在指数衰减函数中会加一
        self.epoch_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='epoch_step')
        self.epoch_increment = tf.assign(self.epoch_step, self.epoch_step+tf.constant(1))
        self.decay_step = decay_step
        self.decay_rate = decay_rate
        
        #2.设置输入
        self.sentence = tf.placeholder(dtype=tf.int32, shape=[None, self.sentence_len], name='sentence')
        #self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
        self.label_l1999 = tf.placeholder(dtype=tf.float32, shape=[None, self.num_classes], name='label_l1999')
        self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
        #self.dropout_keep_prob = 0.5
        
        #3.参数初始化
        self.instantiate_weight()
        #4.定义图
        self.logits = self.inference()
        
        #5.定义loss和train_op
        self.loss_val = self.loss()
        self.train_op = self.train()
        
#         #6.预测，计算准确率
#         self.prediction = tf.argmax(self.logits, axis=1, name='prediction')
#         correct_pre = tf.equal(tf.cast(self.prediction, tf.int32), self.label)
#         self.accuracy = tf.reduce_mean(tf.cast(correct_pre, tf.float32))
    
    def instantiate_weight(self):
        self.Embedding = tf.get_variable('Embedding', [self.vocab_size, self.embed_size], tf.float32, initializer=self.initializer)
        self.W = tf.get_variable('weight', [self.filter_num_total, self.num_classes], tf.float32, initializer=self.initializer)
        self.b = tf.get_variable('b', [self.num_classes], dtype=tf.float32)
        
    def inference(self):
        #embedding -- 卷积 -- 线性分类器
        self.sentece_embedding = tf.nn.embedding_lookup(self.Embedding, self.sentence)
        h = self.cnn_single_layer()
        logits = tf.matmul(h, self.W) + self.b
        return logits
    
    def cnn_single_layer(self):
        #conv2d -- BN -- ReLU -- max_pooling -- dropout -- dense
        #conv2d的输入与卷积核都要求是4维的，具体查看文档
        sentece_embedding_4d = tf.expand_dims(self.sentece_embedding, -1) #增加一维，[batch_size, sentence_len, embed_size, 1]
        pool_output = []
        for filter_size in self.filter_sizes:
            with tf.variable_scope('convolution-pooling-%d'%filter_size):
                ft = tf.get_variable('filter%d'%filter_size, [filter_size, self.embed_size, 1, self.filter_num], 
                                     tf.float32, initializer=self.initializer)
                conv = tf.nn.conv2d(sentece_embedding_4d, ft, strides=[1,1,1,1], padding='VALID')
                conv = tf.contrib.layers.batch_norm(conv) #[batch_size, sentence_len-filter_size+1, 1, filter_num]
                activation = tf.nn.relu(conv)
                
                pooled = tf.nn.max_pool(activation, ksize=[1,self.sentence_len-filter_size+1,1,1], strides=[1,1,1,1], padding='VALID')
                pool_output.append(pooled) #若干个shape=[batch_size, 1, 1, filter_num]
        pool_concat = tf.concat(pool_output, axis=3) #在第三维拼接
        flatten_pool = tf.reshape(pool_concat, [-1, self.filter_num_total])
        
        dropouted = tf.nn.dropout(flatten_pool, keep_prob=self.dropout_keep_prob)
        h = tf.layers.dense(dropouted, self.filter_num_total, activation=tf.nn.tanh)
        return h
        
    def loss(self, l2_lambda=0.001):
        loss1 = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.label_l1999, logits=self.logits)
        loss1 = tf.reduce_mean(tf.reduce_sum(loss1, axis=1))
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name], name='l2_loss') * l2_lambda
        loss = loss1 + l2_loss
        return loss
                
    def train(self):
        learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_epoch, 
                                                   self.decay_step, self.decay_rate, staircase=True)
        train_op = tf.contrib.layers.optimize_loss(self.loss_val, self.global_epoch, learning_rate, optimizer='Adam')
        return train_op

In [4]:
def test():
    num_classes=10
    learning_rate=0.01
    batch_size=5
    decay_step=1000
    decay_rate=0.9
    sequence_length=5
    vocab_size=10000
    embed_size=100
    is_training=True
    dropout_keep_prob=0.5
    
    model = TextCNN(num_classes, batch_size, vocab_size, embed_size, sequence_length,
                     learning_rate, decay_step, decay_rate, 50, [2,3,4])
    print(tf.trainable_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        input_x = np.random.randint(0,100,size=(batch_size, sequence_length),dtype=np.int32)
        input_y = np.random.randint(0, 2,size=(batch_size, num_classes), dtype=np.int32)
        for i in range(20):
            #input_x = np.zeros((batch_size, sequence_length), dtype=np.int32)
            #input_y = np.array([1,0,1,1,1,2,1,1], dtype=np.int32)
            loss, logits, _ = sess.run([model.loss_val, model.logits, model.train_op],
                                            feed_dict={model.sentence: input_x, model.label_l1999: input_y,
                                                       model.dropout_keep_prob: dropout_keep_prob})
            logits = np.argsort(logits)
            print('loss:',loss, 'label:', input_y, 'pre:', logits)
            
    

In [5]:
tf.reset_default_graph()
test()

[<tf.Variable 'Embedding:0' shape=(10000, 100) dtype=float32_ref>, <tf.Variable 'weight:0' shape=(150, 10) dtype=float32_ref>, <tf.Variable 'b:0' shape=(10,) dtype=float32_ref>, <tf.Variable 'convolution-pooling-2/filter2:0' shape=(2, 100, 1, 50) dtype=float32_ref>, <tf.Variable 'convolution-pooling-2/BatchNorm/beta:0' shape=(50,) dtype=float32_ref>, <tf.Variable 'convolution-pooling-3/filter3:0' shape=(3, 100, 1, 50) dtype=float32_ref>, <tf.Variable 'convolution-pooling-3/BatchNorm/beta:0' shape=(50,) dtype=float32_ref>, <tf.Variable 'convolution-pooling-4/filter4:0' shape=(4, 100, 1, 50) dtype=float32_ref>, <tf.Variable 'convolution-pooling-4/BatchNorm/beta:0' shape=(50,) dtype=float32_ref>, <tf.Variable 'dense/kernel:0' shape=(150, 150) dtype=float32_ref>, <tf.Variable 'dense/bias:0' shape=(150,) dtype=float32_ref>]
loss: 13.15637 label: [[0 0 0 0 0 1 1 0 1 0]
 [1 0 0 0 1 1 1 0 1 1]
 [1 1 1 0 1 0 0 1 0 1]
 [1 0 1 0 1 1 0 1 0 1]
 [0 1 1 0 1 1 0 1 1 1]] pre: [[5 0 2 6 9 3 7 1 8 4]
 [0

### 训练

In [6]:
import sys
import tensorflow as tf
import numpy as np
from tflearn.data_utils import to_categorical, pad_sequences
import os
import pickle
import h5py

In [7]:
tf.reset_default_graph()
#define hyperparameter
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('label_size', 1999, 'number of label')
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size for training')

tf.app.flags.DEFINE_integer('sentence_len', 200, 'length of each sentence')
tf.app.flags.DEFINE_integer('embed_size', 100, 'embedding size')
tf.app.flags.DEFINE_float('learning_rate', 0.0003, '')
tf.app.flags.DEFINE_float('decay_rate', 1, '')
tf.app.flags.DEFINE_integer('decay_steps', 1000, 'number of steps before decay learning rate')
tf.app.flags.DEFINE_bool('is_training', True, '')

tf.app.flags.DEFINE_integer('num_epoch', 15, '')
tf.app.flags.DEFINE_integer('validation_every', 1, 'Validate every validate_every epochs.')
tf.app.flags.DEFINE_string("ckpt_dir","textcnn_multilabel_checkpoint/","checkpoint location for the model")
tf.app.flags.DEFINE_string("cache_path","textcnn_multilabel_checkpoint/data_cache.pik","data chche for the model")

tf.app.flags.DEFINE_integer("num_filters", 128, "number of filters") #256--->512

In [11]:
import time
def log(str):
    t = time.localtime()
    print("[%4d/%02d/%02d %02d:%02d:%02d]"%(t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec), end=' ')
    print(str)

In [12]:
#define main

#process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data & training (4.validation) 

def main(_):
    #1.加载数据
    base_path = '/data/chenhy/data/ieee_zhihu_cup/'
    cache_file_h5py = base_path + 'data.h5'
    cache_file_pickle = base_path + 'vocab_label.pik'
    word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y = load_data(cache_file_h5py, cache_file_pickle)
    
    index2word = {index: word for word, index in word2index.items()}
    index2label = {index: label for label, index in label2index.items()}
    vocab_size = len(word2index)

    #print("train_X.shape:", np.array(train_X).shape)
    #print("train_y.shape:", np.array(train_y).shape)
    print("test_X.shape:", np.array(test_X).shape)  # 每个list代表一句话
    print("test_y.shape:", np.array(test_y).shape)  
    #print("test_X[0]:", test_X[0])  
    #print("test_X[1]:", test_X[1])
    #print("test_y[0]:", test_y[0])  

    #2.创建session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = TextCNN(FLAGS.label_size, FLAGS.batch_size, vocab_size, 
                        FLAGS.embed_size, FLAGS.sentence_len, FLAGS.learning_rate, 
                        FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_filters, [2,3,4])
        saver = tf.train.Saver()
        batch_size = FLAGS.batch_size
        CONTINUE_TRAIN = False
        if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
            print('restore model from checkpoint')
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            print('CONTINUE_TRAIN=', CONTINUE_TRAIN)
            sess.run(model.epoch_increment)
            print('Continue at Epoch:', sess.run(model.epoch_step))
        if not os.path.exists(FLAGS.ckpt_dir + 'checkpoint') or CONTINUE_TRAIN:
            if not os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
                print('initialize variables')
                sess.run(tf.global_variables_initializer())
                #print('assign pre-trained embedding')
                #embedding_assign = tf.assign(model.Embedding, tf.constant(np.array(embedding_final))) #为model.Embedding赋值
                #sess.run(embedding_assign)

            #3.训练
            num_of_data = len(train_y)
            for _ in range(FLAGS.num_epoch):
                epoch = sess.run(model.epoch_step)
                loss, counter = 0.0, 0
                for start, end in zip(range(0, num_of_data, batch_size), range(batch_size, num_of_data, batch_size)):
                    if (epoch == 0 and counter == 0):
                        print('train_X[start, end]:', train_X[start:end])
                        print('train_y[start, end]:', train_y[start:end])
                    l,_ = sess.run([model.loss_val, model.train_op], 
                                feed_dict={model.sentence: train_X[start:end], model.label_l1999: train_y[start:end],
                                           model.dropout_keep_prob: 0.5})
                    loss, counter = loss+l, counter+1

                    if (counter % 100 == 0):
                        log("Epoch %d\Batch %d\ Train Loss:%.3f"%(epoch, counter, loss/float(counter)))

                    if counter % 3000 == 0:
                        print('run model on validation data...')
                        loss_valid, f1_score, precision, recall = do_eval(sess, model, vaild_X, valid_y)
                        log("Epoch %d/ Validation Loss:%.3f/ F1_score:%.3f/ Precision:%.3f/ Recall:%.3f"%(epoch, loss_valid, f1_score, precision, recall))
                        #save the checkpoint
                        save_path = FLAGS.ckpt_dir + 'model.ckpt'
                        saver.save(sess, save_path, global_step=model.epoch_step)
                sess.run(model.epoch_increment)
        loss_valid, f1_score, precision, recall = do_eval(sess, model, vaild_X, valid_y)
        log("Epoch %d/ Validation Loss:%.3f/ F1_score:%.3f/ Precision:%.3f/ Recall:%.3f"%(epoch, loss_valid, f1_score, precision, recall))
                        

def load_data(h5_file_path, pik_file_path):
    if not os.path.exists(h5_file_path) or not os.path.exists(pik_file_path):
        raise RuntimeError('No such file!!')

    print('cache files exist, going to load in...')
    print('loading h5_file...')
    h5_file = h5py.File(h5_file_path, 'r')
    print('h5_file.keys:', h5_file.keys())
    train_X, train_y = h5_file['train_X'], h5_file['train_Y']
    vaild_X, valid_y = h5_file['vaild_X'], h5_file['valid_Y']
    test_X,  test_y  = h5_file['test_X'],  h5_file['test_Y']
    #embedding_final = h5_file['embedding']

    print('loading pickle file')
    word2index, label2index = None, None
    with open(pik_file_path, 'rb') as pkl:
        word2index,label2index = pickle.load(pkl)
    print('cache files load successful!')
    return word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y

def do_eval(sess, model, test_X, test_y):
    test_X, text_y = test_X[:3000], test_y[:3000]
    num_of_data = len(test_y)
    batch_size = 1
    loss, F1, p, r = 0., 0., 0., 0.
    label_dict_confuse = {'TP':0., 'FN':0., 'FP':0.}
    for start in range(num_of_data):
        end = start + 1
        l,logits = sess.run([model.loss_val, model.logits], 
                        feed_dict={model.sentence: test_X[start:end], model.label_l1999: test_y[start:end],
                                   model.dropout_keep_prob:1.0})
        loss += l
        pre = np.argsort(logits[0])[-5:]
        label = [i for i in range(len(test_y[start])) if test_y[start][i] > 0]
        if start == 0: print('label:',label, 'predict:', pre)
        inter = len([x for x in pre if x in label])
        label_dict_confuse['TP'] += inter
        label_dict_confuse['FN'] += len(label) - inter
        label_dict_confuse['FP'] += len(pre) - inter
    p = float(label_dict_confuse['TP'])/(label_dict_confuse['TP']+label_dict_confuse['FP'])
    r = float(label_dict_confuse['TP'])/(label_dict_confuse['TP']+label_dict_confuse['FN'])
    if p + r == 0: return loss/num_of_data, 0, 0, 0
    F1 = (2 * p * r)/(p + r)
    return loss/num_of_data, F1/num_of_data, p/num_of_data, r/num_of_data


In [13]:
tf.reset_default_graph()
tf.app.run()

cache files exist, going to load in...
loading h5_file...
h5_file.keys: KeysView(<HDF5 file "data.h5" (mode r)>)
loading pickle file
cache files load successful!
test_X.shape: (20000, 200)
test_y.shape: (20000, 1999)
initialize variables
train_X[start, end]: [[832  60 256 ...   0   0   0]
 [270 154 166 ...   0   0   0]
 [186 163 284 ...   0   0   0]
 ...
 [ 96 138 117 ...   0   0   0]
 [ 56 109  96 ...   0   0   0]
 [ 32 127 420 ...   0   0   0]]
train_y[start, end]: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[2019/02/28 17:25:04] Epoch 0\Batch 100\ Train Loss:349.865
[2019/02/28 17:25:25] Epoch 0\Batch 200\ Train Loss:191.907
[2019/02/28 17:25:46] Epoch 0\Batch 300\ Train Loss:137.243
[2019/02/28 17:26:07] Epoch 0\Batch 400\ Train Loss:109.365
[2019/02/28 17:26:27] Epoch 0\Batch 500\ Train Loss:92.445
[2019/02/28 17:26:47] Epoch 0\Batch 600\ Train Loss:81.074
[2019/02/28 17

IndexError: index 0 is out of bounds for axis 0 with size 0