In [1]:
import time
def log(str):
    t = time.localtime()
    print("[%4d/%02d/%02d %02d:%02d:%02d]"%(t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec), end=' ')
    print(str)

In [2]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


### 定义多目标分类FastText模型

In [3]:
class FastText:
    def __init__(self, label_size, batch_size, num_sampled, sentence_len, vocab_size, 
                 embed_size, learning_rate, decay_rate, decay_steps, is_training):
        #init all hyperparameter
        self.label_size = label_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.sentence_len = sentence_len
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.learning_rate = learning_rate
        self.is_training = is_training
        self.initializer = tf.random_normal_initializer(stddev=0.1)
        
        #add placeholder 
        self.sentence = tf.placeholder(dtype=tf.int32, shape=[None, sentence_len], name='sentence') #x
        #self.labels = tf.placeholder(dtype=tf.int32, shape=[None], name='label') #y
        self.label_l1999 = tf.placeholder(dtype=tf.float32, shape=[None, self.label_size])
        
        self.global_step = tf.Variable(0, name='Global_step', trainable=False)
        self.epoch_step = tf.Variable(0, name='Epoch_step', trainable=False)
        self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.decay_rate, self.decay_steps = decay_rate, decay_steps
        
        self.instantiate_weights()
        self.logits = self.inference()
        self.loss_val = self.loss()
        self.train_op = self.train()
        
        #self.predictions = tf.argmax(self.logits, axis=1, name='predictions')
        #correct_prediction = tf.equal(tf.cast(self.predictions, tf.int32), self.labels)
        #self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='Accuracy')
        
    def instantiate_weights(self):
        self.Embedding = tf.get_variable(name='Embedding', shape=[self.vocab_size, self.embed_size], initializer=self.initializer)
        self.W = tf.get_variable(name='W', shape=[self.embed_size, self.label_size], initializer=self.initializer)
        self.b = tf.get_variable(name='b', shape=[self.label_size])
        
    def inference(self):
        sentence_embeddings = tf.nn.embedding_lookup(self.Embedding, self.sentence) #每个单词查表得到词向量，[None, sentence_len, embed_size]
        self.sentence_embeddings = tf.reduce_mean(sentence_embeddings, axis=1) #求平均，[None, embed_size]
        logits = tf.matmul(self.sentence_embeddings, self.W) + self.b #线性分类器
        return logits
    
    def loss(self, l2_lambda=0.01):
        #nce loss
#         if self.is_training:
#             labels = tf.reshape(self.labels, shape=[-1])
#             labels = tf.expand_dims(labels, dim=1)
#             loss = tf.reduce_mean(tf.nn.nce_loss(weights=tf.transpose(self.W), 
#                                                  biases=self.b, 
#                                                  labels=labels, 
#                                                  inputs=self.sentence_embeddings, 
#                                                  num_sampled=self.num_sampled, 
#                                                  num_classes=self.label_size, 
#                                                  partition_strategy='div'))
            
#         else:
#             pass
        '''多目标分类，使用sigmoid_cross_entropy_with_logits做损失函数'''
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.label_l1999, logits=self.logits)
        loss = tf.reduce_mean(tf.reduce_sum(loss, axis=1))
        #增加l2正则项
        print(tf.trainable_variables())
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda
        loss = loss + self.l2_loss
        return loss
    
    def train(self):
        learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps, self.decay_rate, staircase=True)
        train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="Adam")
        return train_op

In [36]:
def test():
    num_classes=19
    learning_rate=0.01
    batch_size=8
    decay_steps=1000
    decay_rate=0.9
    sequence_length=5
    vocab_size=10000
    embed_size=100
    is_training=True
    dropout_keep_prob=1
    
    model = FastText(num_classes, batch_size, 5, sequence_length, vocab_size, 
                     embed_size, learning_rate, decay_rate, decay_steps, is_training)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        input_x = np.random.randint(0,100,size=(batch_size, sequence_length),dtype=np.int32)
        input_y = np.random.randint(0, 2,size=(batch_size, num_classes), dtype=np.int32)
        for i in range(1):
            #input_x = np.zeros((batch_size, sequence_length), dtype=np.int32)
            #input_y = np.array([1,0,1,1,1,2,1,1], dtype=np.int32)
            loss, logit, _ = sess.run([model.loss_val, model.logits, model.train_op],
                                            feed_dict={model.sentence: input_x, model.label_l1999: input_y})
            tmp = np.zeros((batch_size, num_classes),dtype=np.int32)
            for i in range(batch_size):
                top_num = int(input_y[i].sum())
                tmp[i][np.argsort(logit[i])[-top_num:]] = 1
            print('loss:',loss, 'label:', input_y,'**\n', 'logits:', tmp, 'acc:', (tmp==input_y).sum())

In [37]:
tf.reset_default_graph()
test()

[<tf.Variable 'Embedding:0' shape=(10000, 100) dtype=float32_ref>, <tf.Variable 'W:0' shape=(100, 19) dtype=float32_ref>, <tf.Variable 'b:0' shape=(19,) dtype=float32_ref>]
loss: 63.44268 label: [[1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 0 1 1 1]
 [1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 1 1]
 [1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0 1]
 [1 0 0 1 0 0 1 1 1 0 1 0 1 0 1 0 1 0 0]
 [0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 1]
 [0 0 1 0 0 1 0 0 0 1 1 1 1 0 1 0 0 0 0]
 [0 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 0 0]
 [0 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 0]] **
 logits: [[1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1]
 [1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0]
 [0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0]
 [1 1 0 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1 0]
 [1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0]
 [1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0]
 [1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 0 1 1 0]
 [0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0]] acc: 78


### 训练

In [24]:
import sys
import tensorflow as tf
import numpy as np
from tflearn.data_utils import to_categorical, pad_sequences
import os
import word2vec
import pickle
import h5py

In [25]:
tf.reset_default_graph()
#define hyperparameter
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('label_size', 1999, 'number of label')
tf.app.flags.DEFINE_integer('batch_size', 128, 'batch size for training')
tf.app.flags.DEFINE_integer('num_sampled', 50, 'number of noise sample')
tf.app.flags.DEFINE_integer('sentence_len', 200, 'length of each sentence')
tf.app.flags.DEFINE_integer('embed_size', 100, 'embedding size')
tf.app.flags.DEFINE_float('learning_rate', 0.01, '')
tf.app.flags.DEFINE_float('decay_rate', 0.8, '')
tf.app.flags.DEFINE_integer('decay_steps', 20000, 'number of steps before decay learning rate')
tf.app.flags.DEFINE_bool('is_training', True, '')

tf.app.flags.DEFINE_integer('num_epoch', 15, '')
tf.app.flags.DEFINE_integer('validation_every', 1, 'Validate every validate_every epochs.')
tf.app.flags.DEFINE_string("ckpt_dir","D:/zhihu_data/data/ieee_zhihu_cup/fast_text_multilabel_checkpoint/","checkpoint location for the model")
tf.app.flags.DEFINE_string("cache_path","D:/zhihu_data/data/ieee_zhihu_cup/fast_textmultilabel__checkpoint/data_cache.pik","data chche for the model")

tf.app.flags.DEFINE_bool('use_embedding', False, '')

DuplicateFlagError: The flag 'label_size' is defined twice. First from D:\Anaconda\lib\site-packages\ipykernel_launcher.py, Second from D:\Anaconda\lib\site-packages\ipykernel_launcher.py.  Description from first occurrence: number of label

In [58]:
#define main

#process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data & training (4.validation) 

def main(_):
    #1.加载数据
    base_path = 'D:/zhihu_data/data/ieee_zhihu_cup/'
    cache_file_h5py = base_path + 'data.h5'
    cache_file_pickle = base_path + 'vocab_label.pik'
    word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y = load_data(cache_file_h5py, cache_file_pickle)
    
    index2word = {index: word for word, index in word2index.items()}
    index2label = {index: label for label, index in label2index.items()}
    vocab_size = len(word2index)
    '''
    print("train_X.shape:", np.array(train_X).shape)
    print("train_y.shape:", np.array(train_y).shape)
    '''
    print("test_X.shape:", np.array(test_X).shape)  # 每个list代表一句话
    print("test_y.shape:", np.array(test_y).shape)  
    
    print("test_X[0]:", test_X[0])  
    print("test_X[1]:", test_X[1])
    print("test_y[0]:", test_y[0])  

    #2.创建session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = FastText(FLAGS.label_size, FLAGS.batch_size, FLAGS.num_sampled, FLAGS.sentence_len, 
                        vocab_size, FLAGS.embed_size, FLAGS.learning_rate, FLAGS.decay_rate, FLAGS.decay_steps, FLAGS.is_training)
        saver = tf.train.Saver()
        batch_size = FLAGS.batch_size
        CONTINUE_TRAIN = False
        if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
            print('restore model from checkpoint')
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            print('CONTINUE_TRAIN=', CONTINUE_TRAIN)
            sess.run(model.epoch_increment)
            print('Continue at Epoch:', sess.run(model.epoch_step))
        if not os.path.exists(FLAGS.ckpt_dir + 'checkpoint') or CONTINUE_TRAIN:
            if not os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
                log('initialize variables')
                sess.run(tf.global_variables_initializer())
                if FLAGS.use_embedding:
                    print('assign pre-trained embedding')
                    #embedding_assign = tf.assign(model.Embedding, tf.constant(np.array(embedding_final))) #为model.Embedding赋值
                    #sess.run(embedding_assign)

            #3.训练
            num_of_data = len(train_X)
            for _ in range(FLAGS.num_epoch):
                curr_epoch = sess.run(model.epoch_step)
                loss, acc, counter = 0.0, 0.0, 0
                for start, end in zip(range(0, num_of_data, batch_size), range(batch_size, num_of_data, batch_size)):
                    if (curr_epoch == 0 and counter == 0):
                        print('train_X[start, end]:', train_X[start:end])
                        print('train_y[start, end]:', train_y[start:end])
                    loss_tmp, l2, _ = sess.run([model.loss_val, model.l2_loss, model.train_op], 
                                feed_dict={model.sentence: train_X[start:end], model.label_l1999: train_y[start:end]})
                    loss, counter = loss+loss_tmp, counter+1

                    if (counter % 200 == 0):
                        print("Epoch %d\Batch %d\ Train Loss:%.3f"%(curr_epoch, counter, loss/float(counter)))

                #4.验证，每迭代完FLAGS.validation_every轮，在验证集上跑一次
                print(curr_epoch,FLAGS.validation_every,(curr_epoch % FLAGS.validation_every==0))
                if curr_epoch % FLAGS.validation_every == 0:
                    log('run model on validation data...')
                    loss_valid, acc_valid = do_eval(sess, model, vaild_X, valid_y)
                    log("Epoch %d\ Validation Loss:%.3f/ Validation Accuracy:%.3f"%(curr_epoch, loss_valid, acc_valid))
                    #save the checkpoint
                    save_path = FLAGS.ckpt_dir + 'model.ckpt'
                    saver.save(sess, save_path, global_step=model.epoch_step)
                sess.run(model.epoch_increment)
        loss_valid, acc_valid = do_eval(sess, model, vaild_X, valid_y, batch_size)
        print("Validation Loss:%.3f\ Validation Accuracy:%.3f"%(loss_valid, acc_valid))

def load_data(h5_file_path, pik_file_path):
    if not os.path.exists(h5_file_path) or not os.path.exists(pik_file_path):
        raise RuntimeError('No such file!!')

    log('cache files exist, going to load in...')
    log('loading h5_file...')
    h5_file = h5py.File(h5_file_path, 'r')
    print('h5_file.keys:', h5_file.keys())
    train_X, train_y = h5_file['train_X'], h5_file['train_Y']
    vaild_X, valid_y = h5_file['vaild_X'], h5_file['valid_Y']
    test_X,  test_y  = h5_file['test_X'],  h5_file['test_Y']
    #embedding_final = h5_file['embedding']

    log('loading pickle file')
    word2index, label2index = None, None
    with open(pik_file_path, 'rb') as pkl:
        word2index,label2index = pickle.load(pkl)
    log('cache files load successful!')
    return word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y

def do_eval(sess, model, test_X, test_y):
    loss, acc = 0.0, 0.0
    batch_size = 1
    for start, end in zip(range(0, len(test_X), batch_size), range(batch_size, len(test_X), batch_size)):
        l,pre = sess.run([model.loss_val, model.logits], 
                        feed_dict={model.sentence: test_X[start:end], model.label_l1999: test_y[start:end]})
        loss += l
        acc += calc_accuracy(pre[0], test_y[start])
    return loss/float(len(test_y)), acc/float(len(test_y))

def calc_accuracy(pre_row, label_row):
    label_row = np.array(label_row)
    top_num = int(np.sum(label_row))
    index_list=np.array(np.argsort(pre_row))[-top_num:]
    return label_row[index_list].sum()/float(top_num)

In [None]:
tf.reset_default_graph()
tf.app.run()

[2019/02/25 19:06:23] cache files exist, going to load in...
[2019/02/25 19:06:23] loading h5_file...
h5_file.keys: KeysView(<HDF5 file "data.h5" (mode r)>)
[2019/02/25 19:06:23] loading pickle file
[2019/02/25 19:06:23] cache files load successful!
test_X.shape: (20000, 200)
test_y.shape: (20000, 1999)
test_X[0]: [ 937  716  934  376  104  652  304  934  376   19  240  221  136   68
  188   96  130  130   96  209  505   12  703  143   12  652  304  934
  376   10   13  408   89   74   32  110  558  909 2519   12   80  181
   10  134  204  471  462  562   16    3  937  716  934  376   13   78
  180  531  937  307   78  245  157  937  716  934  376   10  105   13
   61  245  157  652  304  934  376   10  168   78   13  937  716  143
  109   10   63 1191  369   99   13  937  716  934  376   16    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0

Epoch 1\Batch 4400\ Train Loss:49.571
Epoch 1\Batch 4600\ Train Loss:49.571
Epoch 1\Batch 4800\ Train Loss:49.568
Epoch 1\Batch 5000\ Train Loss:49.569
Epoch 1\Batch 5200\ Train Loss:49.568
Epoch 1\Batch 5400\ Train Loss:49.563
Epoch 1\Batch 5600\ Train Loss:49.556
Epoch 1\Batch 5800\ Train Loss:49.556
Epoch 1\Batch 6000\ Train Loss:49.558
Epoch 1\Batch 6200\ Train Loss:49.561
Epoch 1\Batch 6400\ Train Loss:49.557
Epoch 1\Batch 6600\ Train Loss:49.556
Epoch 1\Batch 6800\ Train Loss:49.561
Epoch 1\Batch 7000\ Train Loss:49.561
Epoch 1\Batch 7200\ Train Loss:49.563
Epoch 1\Batch 7400\ Train Loss:49.561
Epoch 1\Batch 7600\ Train Loss:49.559
Epoch 1\Batch 7800\ Train Loss:49.560
Epoch 1\Batch 8000\ Train Loss:49.561
Epoch 1\Batch 8200\ Train Loss:49.559
Epoch 1\Batch 8400\ Train Loss:49.562
Epoch 1\Batch 8600\ Train Loss:49.567
Epoch 1\Batch 8800\ Train Loss:49.565
Epoch 1\Batch 9000\ Train Loss:49.565
Epoch 1\Batch 9200\ Train Loss:49.564
Epoch 1\Batch 9400\ Train Loss:49.564
Epoch 1\Batc

2 1 True
[2019/02/25 20:18:10] run model on validation data...
[2019/02/25 20:18:31] Epoch 2\ Validation Loss:49.552/ Validation Accuracy:0.017
Epoch 3\Batch 200\ Train Loss:49.602
Epoch 3\Batch 400\ Train Loss:49.587
Epoch 3\Batch 600\ Train Loss:49.500
Epoch 3\Batch 800\ Train Loss:49.482
Epoch 3\Batch 1000\ Train Loss:49.475
Epoch 3\Batch 1200\ Train Loss:49.497
Epoch 3\Batch 1400\ Train Loss:49.502
Epoch 3\Batch 1600\ Train Loss:49.502
Epoch 3\Batch 1800\ Train Loss:49.506
Epoch 3\Batch 2000\ Train Loss:49.507
Epoch 3\Batch 2200\ Train Loss:49.508
Epoch 3\Batch 2400\ Train Loss:49.509
Epoch 3\Batch 2600\ Train Loss:49.508
Epoch 3\Batch 2800\ Train Loss:49.502
Epoch 3\Batch 3000\ Train Loss:49.509
Epoch 3\Batch 3200\ Train Loss:49.506
Epoch 3\Batch 3400\ Train Loss:49.511
Epoch 3\Batch 3600\ Train Loss:49.509
Epoch 3\Batch 3800\ Train Loss:49.506
Epoch 3\Batch 4000\ Train Loss:49.501
Epoch 3\Batch 4200\ Train Loss:49.501
Epoch 3\Batch 4400\ Train Loss:49.511
Epoch 3\Batch 4600\ Trai