### 单标签模型定义

In [1]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [28]:
class TextCNN:
    def __init__(self, num_classes, batch_size, vocab_size, embed_size, sentence_len, 
                 learning_rate, decay_step, decay_rate, filter_num, filter_sizes):
        #1.定义超参数
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.sentence_len = sentence_len
        self.learning_rate = learning_rate
        self.filter_num = filter_num
        self.filter_sizes = filter_sizes #list，如[2,3,4],表示3个卷积核的长度（height）
        self.filter_num_total = filter_num * len(filter_sizes)
        self.initializer = tf.random_normal_initializer(stddev=0.1)
        
        #epoch信息
        self.global_epoch = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_epoch') #在指数衰减函数中会加一
        self.epoch_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='epoch_step')
        self.epoch_increment = tf.assign(self.epoch_step, self.epoch_step+tf.constant(1))
        self.decay_step = decay_step
        self.decay_rate = decay_rate
        
        #2.设置输入
        self.sentence = tf.placeholder(dtype=tf.int32, shape=[None, self.sentence_len], name='sentence')
        self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
        self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
        #self.dropout_keep_prob = 0.5
        
        #3.参数初始化
        self.instantiate_weight()
        #4.定义图
        self.logits = self.inference()
        
        #5.定义loss和train_op
        self.loss_val = self.loss()
        self.train_op = self.train()
        
        #6.预测，计算准确率
        self.prediction = tf.argmax(self.logits, axis=1, name='prediction')
        correct_pre = tf.equal(tf.cast(self.prediction, tf.int32), self.label)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pre, tf.float32))
    
    def instantiate_weight(self):
        self.Embedding = tf.get_variable('Embedding', [self.vocab_size, self.embed_size], tf.float32, initializer=self.initializer)
        self.W = tf.get_variable('weight', [self.filter_num_total, self.num_classes], tf.float32, initializer=self.initializer)
        self.b = tf.get_variable('b', [self.num_classes], dtype=tf.float32)
        
    def inference(self):
        #embedding -- 卷积 -- 线性分类器
        self.sentece_embedding = tf.nn.embedding_lookup(self.Embedding, self.sentence)
        h = self.cnn_single_layer()
        logits = tf.matmul(h, self.W) + self.b
        return logits
    
    def cnn_single_layer(self):
        #conv2d -- BN -- ReLU -- max_pooling -- dropout -- dense
        #conv2d的输入与卷积核都要求是4维的，具体查看文档
        sentece_embedding_4d = tf.expand_dims(self.sentece_embedding, -1) #增加一维，[batch_size, sentence_len, embed_size, 1]
        pool_output = []
        for filter_size in self.filter_sizes:
            with tf.variable_scope('convolution-pooling-%d'%filter_size):
                ft = tf.get_variable('filter%d'%filter_size, [filter_size, self.embed_size, 1, self.filter_num], 
                                     tf.float32, initializer=self.initializer)
                conv = tf.nn.conv2d(sentece_embedding_4d, ft, strides=[1,1,1,1], padding='VALID')
                conv = tf.contrib.layers.batch_norm(conv) #[batch_size, sentence_len-filter_size+1, 1, filter_num]
                activation = tf.nn.relu(conv)
                
                pooled = tf.nn.max_pool(activation, ksize=[1,self.sentence_len-filter_size+1,1,1], strides=[1,1,1,1], padding='VALID')
                pool_output.append(pooled) #若干个shape=[batch_size, 1, 1, filter_num]
        pool_concat = tf.concat(pool_output, axis=3) #在第三维拼接
        flatten_pool = tf.reshape(pool_concat, [-1, self.filter_num_total])
        
        dropouted = tf.nn.dropout(flatten_pool, keep_prob=self.dropout_keep_prob)
        h = tf.layers.dense(dropouted, self.filter_num_total, activation=tf.nn.tanh)
        return h
        
    def loss(self, l2_lambda=0.001):
        loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.label, logits=self.logits)
        #先将labels转化为one-hot，再计算softmax交叉熵
        loss1 = tf.reduce_mean(loss1)
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name], name='l2_loss') * l2_lambda
        loss = loss1 + l2_loss
        return loss
                
    def train(self):
        learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_epoch, 
                                                   self.decay_step, self.decay_rate, staircase=True)
        train_op = tf.contrib.layers.optimize_loss(self.loss_val, self.global_epoch, learning_rate, optimizer='Adam')
        return train_op

In [35]:
def test():
    num_classes=19
    learning_rate=0.01
    batch_size=15
    decay_step=1000
    decay_rate=0.9
    sequence_length=5
    vocab_size=10000
    embed_size=100
    is_training=True
    dropout_keep_prob=0.5
    
    model = TextCNN(num_classes, batch_size, vocab_size, embed_size, sequence_length,
                     learning_rate, decay_step, decay_rate, 50, [2,3,4])
    print(tf.trainable_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        input_x = np.random.randint(0,100,size=(batch_size, sequence_length),dtype=np.int32)
        input_y = np.random.randint(0, 19,size=(batch_size), dtype=np.int32)
        for i in range(20):
            #input_x = np.zeros((batch_size, sequence_length), dtype=np.int32)
            #input_y = np.array([1,0,1,1,1,2,1,1], dtype=np.int32)
            loss, acc, predict, _ = sess.run([model.loss_val, model.accuracy, model.prediction, model.train_op],
                                            feed_dict={model.sentence: input_x, model.label: input_y,
                                                       model.dropout_keep_prob: dropout_keep_prob})
            print('loss:',loss, 'acc:', acc, 'label:', input_y, 'predict:', predict)

In [36]:
tf.reset_default_graph()
test()

[<tf.Variable 'Embedding:0' shape=(10000, 100) dtype=float32_ref>, <tf.Variable 'weight:0' shape=(150, 19) dtype=float32_ref>, <tf.Variable 'b:0' shape=(19,) dtype=float32_ref>, <tf.Variable 'convolution-pooling-2/filter2:0' shape=(2, 100, 1, 50) dtype=float32_ref>, <tf.Variable 'convolution-pooling-2/BatchNorm/beta:0' shape=(50,) dtype=float32_ref>, <tf.Variable 'convolution-pooling-3/filter3:0' shape=(3, 100, 1, 50) dtype=float32_ref>, <tf.Variable 'convolution-pooling-3/BatchNorm/beta:0' shape=(50,) dtype=float32_ref>, <tf.Variable 'convolution-pooling-4/filter4:0' shape=(4, 100, 1, 50) dtype=float32_ref>, <tf.Variable 'convolution-pooling-4/BatchNorm/beta:0' shape=(50,) dtype=float32_ref>, <tf.Variable 'dense/kernel:0' shape=(150, 150) dtype=float32_ref>, <tf.Variable 'dense/bias:0' shape=(150,) dtype=float32_ref>]
loss: 8.474031 acc: 0.06666667 label: [ 4 14 10 16  2  2  1 14 18  0 15  2 13 11  2] predict: [16 14  6 14  7  7  0  7 14 16  7  0 14  9 14]
loss: 6.410391 acc: 0.333333

### 训练

In [37]:
import sys
import tensorflow as tf
import numpy as np
from tflearn.data_utils import to_categorical, pad_sequences
import os
import pickle
import h5py

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [38]:
tf.reset_default_graph()
#define hyperparameter
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('label_size', 1999, 'number of label')
tf.app.flags.DEFINE_integer('batch_size', 128, 'batch size for training')

tf.app.flags.DEFINE_integer('sentence_len', 200, 'length of each sentence')
tf.app.flags.DEFINE_integer('embed_size', 100, 'embedding size')
tf.app.flags.DEFINE_float('learning_rate', 0.01, '')
tf.app.flags.DEFINE_float('decay_rate', 0.8, '')
tf.app.flags.DEFINE_integer('decay_steps', 20000, 'number of steps before decay learning rate')
tf.app.flags.DEFINE_bool('is_training', True, '')

tf.app.flags.DEFINE_integer('num_epoch', 15, '')
tf.app.flags.DEFINE_integer('validation_every', 1, 'Validate every validate_every epochs.')
tf.app.flags.DEFINE_string("ckpt_dir","D:/zhihu_data/data/ieee_zhihu_cup2/textcnn_checkpoint/","checkpoint location for the model")
tf.app.flags.DEFINE_string("cache_path","D:/zhihu_data/data/ieee_zhihu_cup2/textcnn_checkpoint/data_cache.pik","data chche for the model")

tf.app.flags.DEFINE_integer("num_filters", 128, "number of filters") #256--->512

In [40]:
import time
def log(str):
    t = time.localtime()
    print("[%4d/%02d/%02d %02d:%02d:%02d]"%(t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec), end=' ')
    print(str)

In [41]:
#define main

#process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data & training (4.validation) 

def main(_):
    #1.加载数据
    base_path = 'D:/zhihu_data/data/ieee_zhihu_cup2/'
    cache_file_h5py = base_path + 'data.h5'
    cache_file_pickle = base_path + 'vocab_label.pik'
    word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y,embedding_final = load_data(cache_file_h5py, cache_file_pickle)
    
    index2word = {index: word for word, index in word2index.items()}
    index2label = {index: label for label, index in label2index.items()}
    vocab_size = len(word2index)

    print("train_X.shape:", np.array(train_X).shape)
    print("train_y.shape:", np.array(train_y).shape)
    print("test_X.shape:", np.array(test_X).shape)  # 每个list代表一句话
    print("test_y.shape:", np.array(test_y).shape)  
    print("test_X[0]:", test_X[0])  
    print("test_X[1]:", test_X[1])
    print("test_y[0]:", test_y[0])  

    #2.创建session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = TextCNN(FLAGS.label_size, FLAGS.batch_size, vocab_size, 
                        FLAGS.embed_size, FLAGS.sentence_len, FLAGS.learning_rate, 
                        FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_filters, [2,3,4])
        saver = tf.train.Saver()
        batch_size = FLAGS.batch_size
        CONTINUE_TRAIN = False
        if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
            print('restore model from checkpoint')
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            print('CONTINUE_TRAIN=', CONTINUE_TRAIN)
            sess.run(model.epoch_increment)
            print('Continue at Epoch:', sess.run(model.epoch_step))
        if not os.path.exists(FLAGS.ckpt_dir + 'checkpoint') or CONTINUE_TRAIN:
            if not os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
                print('initialize variables')
                sess.run(tf.global_variables_initializer())
                print('assign pre-trained embedding')
                embedding_assign = tf.assign(model.Embedding, tf.constant(np.array(embedding_final))) #为model.Embedding赋值
                sess.run(embedding_assign)

            #3.训练
            num_of_data = len(train_y)
            for _ in range(FLAGS.num_epoch):
                epoch = model.epoch_step
                loss, acc, counter = 0.0, 0.0, 0
                for start, end in zip(range(0, num_of_data, batch_size), range(batch_size, num_of_data, batch_size)):
                    if (epoch == 0 and counter == 0):
                        print('train_X[start, end]:', train_X[start:end])
                        print('train_y[start, end]:', train_y[start:end])
                    l,a,_ = sess.run([model.loss_val, model.accuracy, model.train_op], 
                                feed_dict={model.sentence: train_X[start:end], model.label: train_y[start:end],
                                           model.dropout_keep_prob: 0.5})
                    loss, acc, counter = loss+l, acc+a, counter+1

                    if (counter % 500 == 0):
                        log("Epoch %d\Batch %d\ Train Loss:%.3f\ Train Accuracy:%.3f"%(epoch, 
                                                                                         counter, loss/float(counter), acc/float(counter)))

                #4.验证，每迭代完FLAGS.validation_every轮，在验证集上跑一次
                print(epoch,FLAGS.validation_every,(epoch % FLAGS.validation_every==0))
                if epoch % FLAGS.validation_every == 0:
                    print('run model on validation data...')
                    loss_valid, acc_valid = do_eval(sess, model, vaild_X, valid_y, batch_size)
                    log("Epoch %d\ Validation Loss:%.3f/ Validation Accuracy:%.3f"%(epoch, loss_valid, acc_valid))
                    #save the checkpoint
                    save_path = FLAGS.ckpt_dir + 'model.ckpt'
                    saver.save(sess, save_path, global_step=model.epoch_step)
                sess.run(model.epoch_increment)
        loss_valid, acc_valid = do_eval(sess, model, vaild_X, valid_y, batch_size)
        log("Validation Loss:%.3f\ Validation Accuracy:%.3f"%(loss_valid, acc_valid))

def load_data(h5_file_path, pik_file_path):
    if not os.path.exists(h5_file_path) or not os.path.exists(pik_file_path):
        raise RuntimeError('No such file!!')

    print('cache files exist, going to load in...')
    print('loading h5_file...')
    h5_file = h5py.File(h5_file_path, 'r')
    print('h5_file.keys:', h5_file.keys())
    train_X, train_y = h5_file['train_X'], h5_file['train_Y']
    vaild_X, valid_y = h5_file['vaild_X'], h5_file['valid_Y']
    test_X,  test_y  = h5_file['test_X'],  h5_file['test_Y']
    embedding_final = h5_file['embedding']

    print('loading pickle file')
    word2index, label2index = None, None
    with open(pik_file_path, 'rb') as pkl:
        word2index,label2index = pickle.load(pkl)
    print('cache files load successful!')
    return word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y, embedding_final

def do_eval(sess, model, test_X, test_y, batch_size):
    num_of_data = len(test_y)
    loss, acc, counter = 0.0, 0.0, 0
    for start, end in zip(range(0, num_of_data, batch_size), range(batch_size, num_of_data, batch_size)):
        l,a = sess.run([model.loss_val, model.accuracy], 
                        feed_dict={model.sentence: test_X[start:end], model.labels: test_y[start:end]})
        loss, acc, counter = loss+l, acc+a, counter+1
    return loss/float(counter), acc/float(counter)


In [42]:
tf.app.run()

cache files exist, going to load in...
loading h5_file...
h5_file.keys: KeysView(<HDF5 file "data.h5" (mode r)>)
loading pickle file
cache files load successful!
train_X.shape: (2959966, 200)
train_y.shape: (2959966,)
test_X.shape: (20000, 200)
test_y.shape: (20000,)
test_X[0]: [ 579  343 1173 1843    5  583  292 1173 1843    5 1180 1299  989   10
    2   68  153  168  531  109  260  217  277   81   59   81  116  514
    6  221  253  224  154  718  553    4  806  538  732  264   74    6
  221  224  154  326   11  167  136    4  257  145   37   74  175  214
   11   57  110  221    6  364   89   20 4050 2344    4  257   78    9
  991  326  221   89  699  133   11  597  679 1957  824  884  871 1957
  824    4  178   87   87   78  196   52  552   69   47   20   12   37
 1371   89    6  755  779   81  667  597    4  586  878    6   35   93
    7  719  285  937   35  162   13   11    7 1371   89   35    4  201
   68   81   97 1533   81  667  597    9  991  326   35  343  704   16
    5   99 

TypeError: %d format: a number is required, not Variable