### 定义FastText模型

In [1]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
class FastText:
    """FastText-style classifier: average the word embeddings, then one linear layer.

    Constructing an instance adds the placeholders, variables, loss, training op
    and prediction/accuracy ops to the current default TensorFlow graph.

    Args:
        label_size: number of target classes.
        batch_size: stored on the instance only; the placeholders use a dynamic
            batch dimension.
        num_sampled: number of negative classes drawn by the NCE loss.
        sentence_len: fixed number of token ids per input row.
        vocab_size: number of rows in the embedding table.
        embed_size: width of each embedding vector.
        learning_rate: initial learning rate, before exponential decay.
        decay_rate: multiplicative factor applied by the learning-rate decay.
        decay_steps: number of global steps between two decays.
        is_training: Python bool evaluated at graph-construction time; selects
            the sampled NCE loss (True) or the full softmax cross-entropy (False).
    """

    def __init__(self, label_size, batch_size, num_sampled, sentence_len, vocab_size, 
                 embed_size, learning_rate, decay_rate, decay_steps, is_training):
        # Keep every hyperparameter on the instance.
        self.label_size = label_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.sentence_len = sentence_len
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.learning_rate = learning_rate
        self.is_training = is_training
        
        # Inputs: token ids (x) and one integer class id per row (y).
        self.sentence = tf.placeholder(dtype=tf.int32, shape=[None, sentence_len], name='sentence') #x
        self.labels = tf.placeholder(dtype=tf.int32, shape=[None], name='label') #y
        
        # Step counters are bookkeeping, not model parameters, so they are kept
        # out of the trainable-variables collection.
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='Global_step')
        self.epoch_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='Epoch_step')
        self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.decay_rate, self.decay_steps = decay_rate, decay_steps
        
        # Order matters: loss() reads tensors created by inference(), and
        # train() reads self.loss_val.
        self.instantiate_weights()
        self.logits = self.inference()
        self.loss_val = self.loss()
        self.train_op = self.train()
        
        self.predictions = tf.argmax(self.logits, axis=1, name='predictions')
        # argmax yields int64; cast so it can be compared with the int32 labels.
        correct_prediction = tf.equal(tf.cast(self.predictions, tf.int32), self.labels)
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='Accuracy')
        
    def instantiate_weights(self):
        """Create the embedding table and the linear classifier parameters."""
        self.Embedding = tf.get_variable(name='Embedding', shape=[self.vocab_size, self.embed_size])
        self.W = tf.get_variable(name='W', shape=[self.embed_size, self.label_size])
        self.b = tf.get_variable(name='b', shape=[self.label_size])
        
    def inference(self):
        """Return the class logits, shape [None, label_size].

        Also stores the averaged sentence vector in self.sentence_embeddings,
        which loss() reuses as the NCE input.
        """
        # Look up one vector per token id: [None, sentence_len, embed_size].
        sentence_embeddings = tf.nn.embedding_lookup(self.Embedding, self.sentence)
        # Average over the token axis: [None, embed_size].
        self.sentence_embeddings = tf.reduce_mean(sentence_embeddings, axis=1)
        # Linear classifier on top of the averaged vector.
        logits = tf.matmul(self.sentence_embeddings, self.W) + self.b
        return logits
    
    def loss(self, l2_lambda=0.01):
        """Return the scalar loss tensor.

        Training graphs use the sampled NCE loss; non-training graphs use the
        full softmax cross-entropy over self.logits. Previously the non-training
        branch defined no loss at all and raised UnboundLocalError.

        Args:
            l2_lambda: currently unused; kept so existing callers that pass it
                keep working.
        """
        if self.is_training:
            # nce_loss expects labels shaped [batch, num_true] (num_true == 1 here).
            labels = tf.reshape(self.labels, shape=[-1])
            labels = tf.expand_dims(labels, axis=1)
            # nce_loss expects weights shaped [num_classes, dim]; W is stored as
            # [embed_size, label_size], hence the transpose.
            loss = tf.reduce_mean(tf.nn.nce_loss(weights=tf.transpose(self.W), 
                                                 biases=self.b, 
                                                 labels=labels, 
                                                 inputs=self.sentence_embeddings, 
                                                 num_sampled=self.num_sampled, 
                                                 num_classes=self.label_size, 
                                                 partition_strategy='div'))
        else:
            # No sampling outside training: score against every class.
            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, 
                                                                                 logits=self.logits))
        return loss
    
    def train(self):
        """Return the Adam training op with a staircase-decayed learning rate."""
        learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps, self.decay_rate, staircase=True)
        train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="Adam")
        return train_op

In [48]:
def test():
    """Smoke-test FastText by repeatedly fitting one fixed random batch.

    Builds a small model in the current default graph, trains it for 150 steps
    on the same random inputs and prints loss, accuracy, labels and predictions
    after every step.
    """
    num_classes = 19
    learning_rate = 0.01
    batch_size = 8
    decay_steps = 1000
    decay_rate = 0.9
    sequence_length = 5
    vocab_size = 10000
    embed_size = 100
    is_training = True
    num_sampled = 5  # negative classes drawn per step by the NCE loss

    model = FastText(num_classes, batch_size, num_sampled, sequence_length, vocab_size, 
                     embed_size, learning_rate, decay_rate, decay_steps, is_training)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # One fixed batch, reused for every step.
        input_x = np.random.randint(0, 100, size=(batch_size, sequence_length), dtype=np.int32)
        # Label ids are drawn from the model's own class range.
        input_y = np.random.randint(0, num_classes, size=batch_size, dtype=np.int32)
        for i in range(150):
            loss, acc, predict, _ = sess.run([model.loss_val, model.accuracy, model.predictions, model.train_op],
                                            feed_dict={model.sentence: input_x, model.labels: input_y})
            print('loss:',loss, 'acc:', acc, 'label:', input_y, 'predict:', predict)

In [49]:
# Reset the default graph; the smoke test below is left disabled.
tf.reset_default_graph()
#test()

loss: 7.61115 acc: 0.0 label: [15 18 12 17 13  6 14  3] predict: [16 16 16 16 16 16 16 16]
loss: 5.94749 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [16 14 14 16 14 14 14 16]
loss: 6.67814 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [14 14 14 14 14 14 14 14]
loss: 7.99756 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [14 14 14 14 14 14 14 14]
loss: 7.34867 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [14 14 14 14 14 14 14 14]
loss: 7.05527 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [14 14 14 14 14 14 14 14]
loss: 5.87476 acc: 0.25 label: [15 18 12 17 13  6 14  3] predict: [14 14 14 17 14 17 14 14]
loss: 4.77102 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [17 17 17 17 17 17 17 17]
loss: 4.19791 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [17 17 17 17 17 17 17 17]
loss: 4.71784 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [17 17 17 17 17 17 17 17]
loss: 7.00724 acc: 0.125 label: [15 18 12 17 13  6 14  3] predict: [17 17

loss: 0.575029 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 0.783192 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 1.29666 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 0.377832 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 1.11085 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 0.320394 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 1.09144 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 0.28393 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 0.327157 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 1.38753 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 14  3]
loss: 1.29629 acc: 1.0 label: [15 18 12 17 13  6 14  3] predict: [15 18 12 17 13  6 1

In [4]:
# Reset the default graph before the training cells below build a new model.
tf.reset_default_graph()

In [3]:
import sys
import tensorflow as tf
import numpy as np
from tflearn.data_utils import to_categorical, pad_sequences
import os
import word2vec
import pickle
import h5py

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


### 训练

In [4]:
# Reset the default graph before the training model is built.
tf.reset_default_graph()
# Flag container holding the model and training hyperparameters.
FLAGS = tf.app.flags.FLAGS

# Model / optimisation settings; main() passes all of these to FastText.
tf.app.flags.DEFINE_integer('label_size', 1999, 'number of label')
tf.app.flags.DEFINE_integer('batch_size', 128, 'batch size for training')
tf.app.flags.DEFINE_integer('num_sampled', 50, 'number of noise sample')
tf.app.flags.DEFINE_integer('sentence_len', 200, 'length of each sentence')
tf.app.flags.DEFINE_integer('embed_size', 100, 'embedding size')
tf.app.flags.DEFINE_float('learning_rate', 0.01, '')
tf.app.flags.DEFINE_float('decay_rate', 0.8, '')
tf.app.flags.DEFINE_integer('decay_steps', 20000, 'number of steps before decay learning rate')
tf.app.flags.DEFINE_bool('is_training', True, '')

# Training-loop settings read by main(): epoch count, validation cadence and
# checkpoint directory.
# NOTE(review): cache_path is not read by any code visible in this notebook, and
# its help text contains the typo "chche" (cache).
tf.app.flags.DEFINE_integer('num_epoch', 15, '')
tf.app.flags.DEFINE_integer('validation_every', 1, 'Validate every validate_every epochs.')
tf.app.flags.DEFINE_string("ckpt_dir","D:/zhihu_data/data/ieee_zhihu_cup2/fast_text_checkpoint/","checkpoint location for the model")
tf.app.flags.DEFINE_string("cache_path","D:/zhihu_data/data/ieee_zhihu_cup2/fast_text_checkpoint/data_cache.pik","data chche for the model")

In [5]:
#define main

# Process: 1. load data (X: list of int, y: int). 2. create session. 3. feed data & train. 4. validate.

def main(_):
	"""Train FastText on the cached dataset, validating and checkpointing per epoch.

	Steps: load the cached data, build the model in a session, restore the
	latest checkpoint or initialise from the pre-trained embedding, train from
	the epoch recorded in the model's epoch counter, then run a final validation.

	Args:
		_: unused; presumably the argv list supplied by tf.app.run() (confirm).
	"""
	# 1. Load data
	base_path = 'D:/zhihu_data/data/ieee_zhihu_cup2/'
	cache_file_h5py = base_path + 'data.h5'
	cache_file_pickle = base_path + 'vocab_label.pik'
	word2index,label2index,train_X,train_y,vaild_X,valid_y,test_X,test_y,embedding_final = load_data(cache_file_h5py, cache_file_pickle)
	
	vocab_size = len(word2index)
	
	# np.shape reads an existing .shape attribute instead of copying the whole
	# dataset into memory just to print its dimensions.
	print("train_X.shape:", np.shape(train_X))
	print("train_y.shape:", np.shape(train_y))
	print("test_X.shape:", np.shape(test_X))  # each row is one sentence
	print("test_y.shape:", np.shape(test_y))  
	print("test_X[0]:", test_X[0])  
	print("test_X[1]:", test_X[1])
	print("test_y[0]:", test_y[0])  

	# 2. Create session
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	batch_size = FLAGS.batch_size
	with tf.Session(config=config) as sess:
		model = FastText(FLAGS.label_size, FLAGS.batch_size, FLAGS.num_sampled, FLAGS.sentence_len, 
						vocab_size, FLAGS.embed_size, FLAGS.learning_rate, FLAGS.decay_rate, FLAGS.decay_steps, FLAGS.is_training)
		saver = tf.train.Saver()
		if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
			print('restore model from checkpoint')
			saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
		else:
			print('initialize variables')
			sess.run(tf.global_variables_initializer())
			print('assign pre-trained embedding')
			# Overwrite the randomly initialised embedding table with the pre-trained one.
			embedding_assign = tf.assign(model.Embedding, tf.constant(np.array(embedding_final)))
			sess.run(embedding_assign)

		# Training runs for both branches above: a restored model resumes from
		# the epoch stored in its checkpoint, a fresh one starts at 0.
		curr_epoch = int(sess.run(model.epoch_step))
		last_epoch = curr_epoch

		# 3. Train
		num_of_data = len(train_y)
		for epoch in range(curr_epoch, FLAGS.num_epoch):
			last_epoch = epoch
			loss, acc, counter = 0.0, 0.0, 0
			# Step through every batch, including the final (possibly shorter) one.
			for start in range(0, num_of_data, batch_size):
				end = min(start + batch_size, num_of_data)
				if (epoch == 0 and counter == 0):
					print('train_X[start, end]:', train_X[start:end])
					print('train_y[start, end]:', train_y[start:end])
				l,a,_ = sess.run([model.loss_val, model.accuracy, model.train_op], 
							feed_dict={model.sentence: train_X[start:end], model.labels: train_y[start:end]})
				loss, acc, counter = loss+l, acc+a, counter+1
				
				if (counter % 500 == 0):
					print("Epoch %d\\Batch %d\\ Train Loss:%.3f\\ Train Accuracy:%.3f"%(epoch, counter, loss/float(counter), acc/float(counter)))
			
			# Count the epoch as finished before saving, so a restored checkpoint
			# resumes with the next epoch instead of repeating this one.
			sess.run(model.epoch_increment)

			# 4. Validate every FLAGS.validation_every epochs, then checkpoint.
			print(epoch,FLAGS.validation_every,(epoch % FLAGS.validation_every==0))
			if epoch % FLAGS.validation_every == 0:
				print('run model on validation data...')
				loss_valid, acc_valid = do_eval(sess, model, vaild_X, valid_y, batch_size)
				print("Epoch %d\\ Validation Loss:%.3f/ Validation Accuracy:%.3f"%(epoch, loss_valid, acc_valid))
				#save the checkpoint
				save_path = FLAGS.ckpt_dir + 'model.ckpt'
				saver.save(sess, save_path, global_step=model.epoch_step)

		# Final validation; last_epoch is defined even if the loop never ran.
		loss_valid, acc_valid = do_eval(sess, model, vaild_X, valid_y, batch_size)
		print("Epoch %d\\ Validation Loss:%.3f/ Validation Accuracy:%.3f"%(last_epoch, loss_valid, acc_valid))
	
def load_data(h5_file_path, pik_file_path):
	"""Load the cached datasets and vocabularies.

	Args:
		h5_file_path: path of the HDF5 cache holding the train/valid/test
			splits and the embedding matrix.
		pik_file_path: path of the pickle holding (word2index, label2index).

	Returns:
		Tuple (word2index, label2index, train_X, train_y, vaild_X, valid_y,
		test_X, test_y, embedding_final). The HDF5 file is not closed here;
		the seven data entries are whatever indexing the open file returns.

	Raises:
		RuntimeError: if either path does not exist.
	"""
	both_present = os.path.exists(h5_file_path) and os.path.exists(pik_file_path)
	if not both_present:
		raise RuntimeError('No such file!!')

	print('cache files exist, going to load in...')
	print('loading h5_file...')
	store = h5py.File(h5_file_path, 'r')
	print('h5_file.keys:', store.keys())
	# Key names are looked up exactly as stored (including the 'vaild_X' spelling).
	dataset_names = ('train_X', 'train_Y', 'vaild_X', 'valid_Y', 'test_X', 'test_Y', 'embedding')
	train_X, train_y, vaild_X, valid_y, test_X, test_y, embedding_final = [store[name] for name in dataset_names]

	print('loading pickle file')
	# NOTE(review): pickle.load executes arbitrary code from the file; only use
	# it on cache files produced by a trusted source.
	with open(pik_file_path, 'rb') as handle:
		vocab_pair = pickle.load(handle)
	word2index, label2index = vocab_pair
	print('cache files load successful!')
	return (word2index, label2index, train_X, train_y, vaild_X, valid_y,
			test_X, test_y, embedding_final)
	
def do_eval(sess, model, test_X, test_y, batch_size):
	"""Return (mean loss, mean accuracy) of `model` over the whole dataset.

	Every example is evaluated, including a final batch shorter than
	`batch_size`. Per-batch results are weighted by batch size, so the returned
	values are per-example means. (The previous version skipped the last batch
	and divided by zero when the dataset fitted in a single batch.)

	Args:
		sess: session used to run the model's ops.
		model: object exposing loss_val, accuracy, sentence and labels.
		test_X: sliceable inputs, one row per example.
		test_y: sliceable labels; its length defines the dataset size.
		batch_size: maximum number of examples per sess.run call.

	Raises:
		ValueError: if `test_y` is empty.
	"""
	num_of_data = len(test_y)
	if num_of_data == 0:
		raise ValueError('do_eval needs at least one example')

	loss_sum, acc_sum = 0.0, 0.0
	for start in range(0, num_of_data, batch_size):
		end = min(start + batch_size, num_of_data)
		l, a = sess.run([model.loss_val, model.accuracy],
						feed_dict={model.sentence: test_X[start:end], model.labels: test_y[start:end]})
		# Weight each batch by its row count so a short tail batch is not over-counted.
		rows = end - start
		loss_sum += l * rows
		acc_sum += a * rows
	return loss_sum / float(num_of_data), acc_sum / float(num_of_data)
		

In [None]:
# Entry point: hand control to TensorFlow's app runner, which is expected to
# parse the flags defined above and then call main().
tf.app.run()

cache files exist, going to load in...
loading h5_file...
h5_file.keys: KeysView(<HDF5 file "data.h5" (mode r)>)
loading pickle file
cache files load successful!
train_X.shape: (2959966, 200)
train_y.shape: (2959966,)
test_X.shape: (20000, 200)
test_y.shape: (20000,)
test_X[0]: [ 579  343 1173 1843    5  583  292 1173 1843    5 1180 1299  989   10
    2   68  153  168  531  109  260  217  277   81   59   81  116  514
    6  221  253  224  154  718  553    4  806  538  732  264   74    6
  221  224  154  326   11  167  136    4  257  145   37   74  175  214
   11   57  110  221    6  364   89   20 4050 2344    4  257   78    9
  991  326  221   89  699  133   11  597  679 1957  824  884  871 1957
  824    4  178   87   87   78  196   52  552   69   47   20   12   37
 1371   89    6  755  779   81  667  597    4  586  878    6   35   93
    7  719  285  937   35  162   13   11    7 1371   89   35    4  201
   68   81   97 1533   81  667  597    9  991  326   35  343  704   16
    5   99 

Epoch 1\Batch 18000\ Train Loss:2.216\ Train Accuracy:0.179
Epoch 1\Batch 18500\ Train Loss:2.212\ Train Accuracy:0.179
Epoch 1\Batch 19000\ Train Loss:2.209\ Train Accuracy:0.180
Epoch 1\Batch 19500\ Train Loss:2.207\ Train Accuracy:0.180
Epoch 1\Batch 20000\ Train Loss:2.206\ Train Accuracy:0.181
Epoch 1\Batch 20500\ Train Loss:2.204\ Train Accuracy:0.181
Epoch 1\Batch 21000\ Train Loss:2.201\ Train Accuracy:0.181
Epoch 1\Batch 21500\ Train Loss:2.199\ Train Accuracy:0.182
Epoch 1\Batch 22000\ Train Loss:2.198\ Train Accuracy:0.182
Epoch 1\Batch 22500\ Train Loss:2.196\ Train Accuracy:0.183
Epoch 1\Batch 23000\ Train Loss:2.194\ Train Accuracy:0.183
1 1 True
run model on validation data...
Epoch 1\ Validation Loss:2.161/ Validation Accuracy:0.199
Epoch 2\Batch 500\ Train Loss:2.109\ Train Accuracy:0.206
Epoch 2\Batch 1000\ Train Loss:2.113\ Train Accuracy:0.205
Epoch 2\Batch 1500\ Train Loss:2.108\ Train Accuracy:0.205
Epoch 2\Batch 2000\ Train Loss:2.104\ Train Accuracy:0.206
Epoch 

In [22]:
# NOTE(review): bare reference to the function object; nothing is called here.
# Looks like leftover scratch work.
os.path.exists

Unnamed: 0,0,1,2,3
0,8,4,1,1
1,0,2,2,4
2,4,4,1,3


In [23]:
# Print each (index, row) pair yielded by b.iterrows().
# NOTE(review): `b` is not defined anywhere in the visible notebook; presumably
# a pandas DataFrame created in a deleted cell, so this fails on a fresh kernel.
for idx, row in b.iterrows():
    print(idx, row)

0 0    8
1    4
2    1
3    1
Name: 0, dtype: int32
1 0    0
1    2
2    2
3    4
Name: 1, dtype: int32
2 0    4
1    4
2    1
3    3
Name: 2, dtype: int32
