In [1]:

import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from modelOneConv import *
from dataOwn import *
import pickle


In [2]:
class CnnKMaxPoolModel:
    def __init__(self):
        
        self.config = TCNN_K_Config()
        self.vocabulary_word2index, self.vocabulary_index2word  = create_voabulary(self.config.word2vec_model_path)
        self.config.vocab_size = len(self.vocabulary_word2index)+1
        #这里通过实际的word2vec模型统计词典中词的数量，赋值到config中，然后加载RNN模型
        self.model = CNN_K_MAXPOOL_DISEASE(self.config)
        print(self.config.vocab_size)
        self.word2vecModel = Word2Vec.load(self.config.word2vec_model_path)
    
        self.save_dir = 'checkpoints/textrnn'
        self.save_path = os.path.join(self.save_dir, 'best_validation')  # 最佳验证结果保存路径
        
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        #if not os.path.exists(save_dir):
        #    os.makedirs(save_dir)
        #saver.restore(sess=self.session, save_path=save_path)  # 读取保存的模型
        
    def get_time_dif(self,start_time):
        """获取已使用时间"""
        end_time = time.time()
        time_dif = end_time - start_time
        return timedelta(seconds=int(round(time_dif)))


    def feed_data(self,x_batch, y_batch, keep_prob):
        feed_dict = {
            self.model.input_x: x_batch,
            self.model.input_y: y_batch,
            self.model.dropout_keep_prob: keep_prob
        }
        return feed_dict


    def evaluate(self,sess, x, y):
        """评估在某一数据上的准确率和损失"""
        #batch_eval = batch_iter(x_, y_, config.batch_size)
        batch_size = self.config.batch_size
        total_loss = 0.0
        total_acc = 0.0
        eval_out = []
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1

        indices = np.random.permutation(np.arange(data_len))
        print("data_len=", data_len, " num_batch=", num_batch)
        x = np.array(x)
        y = np.array(y)
        x_shuffle = x[indices]
        y_shuffle = y[indices]

        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            x_batch, y_batch =  x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

            batch_len = len(x_batch)
            feed_dict = self.feed_data(x_batch, y_batch, 1.0)
            loss, acc, y_pred_cls = sess.run([self.model.loss, self.model.acc, self.model.y_pred_cls], feed_dict=feed_dict)
            total_loss += loss * batch_len
            total_acc += acc * batch_len
            eval_out = np.concatenate([eval_out, y_pred_cls])

        return total_loss / data_len, total_acc / data_len, eval_out, y_shuffle
    
    def train(self, x_train, y_train, x_val, y_val):
        print("Configuring TensorBoard and Saver...")
        # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
        tensorboard_dir = 'tensorboard/textcnn-k-maxpool'
        if not os.path.exists(tensorboard_dir):
            os.makedirs(tensorboard_dir)

        tf.summary.scalar("loss", self.model.loss)
        tf.summary.scalar("accuracy", self.model.acc)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir)

        # 配置 Saver
        #saver = tf.train.Saver()
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        print("Loading training and validation data...")
        # 载入训练集与验证集
        start_time = time.time()
        #x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
        #x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
        #time_dif = get_time_dif(start_time)
        #print("Time usage:", time_dif)

        # 创建session
        session = tf.Session()
        session.run(tf.global_variables_initializer())
        writer.add_graph(session.graph)

        #y_real = session.run(tf.argmax(y_val, 1, name="predictions_real"))

        if self.config.use_embedding:  # load pre-trained word embedding
            assign_pretrained_word_embedding(session, self.model, self.word2vecModel,self.config.embed_size )

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # 总批次
        best_acc_val = 0.0  # 最佳验证集准确率
        last_improved = 0  # 记录上一次提升批次
        require_improvement = 1000  # 如果超过1000轮未提升，提前结束训练

        flag = False
        for epoch in range(self.config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, self.config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = self.feed_data(x_batch, y_batch, self.config.dropout_keep_prob)
                #print("total_batch=", total_batch,"config.save_per_batch=",config.save_per_batch)
                #print("total_batch=",total_batch,config.print_per_batch )
                if total_batch % self.config.print_per_batch == 1:
                    # 每多少轮次输出在训练集和验证集上的性能
                    total_loss, total_acc, eval_out, y_real = self.evaluate(session, x_val, y_val)
                    #y_realNew = session.run(y_real)
                    print("total_batch=", total_batch, " total_loss=", total_loss, " total_acc=", total_acc)
                    print(y_real[0:10])
                    print(eval_out[0:10])
                    print(metrics.classification_report(y_real,eval_out))

                    if total_acc > best_acc_val:
                        # 保存最好结果
                        best_acc_val = total_acc
                        last_improved = total_batch
                        self.saver.save(sess=session, save_path=self.save_path)
                        improved_str = '*'
                        print("==========best==========")

                        print(metrics.classification_report(y_real,eval_out))
                    else:
                        improved_str = ''
                if total_batch % self.config.save_per_batch == 1:
                    # 每多少轮次将训练结果写入tensorboard scalar
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)


                    time_dif = self.get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                          + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    #print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

                _ = session.run(self.model.train_op, feed_dict=feed_dict)  # 运行优化
                #logits = session.run(model.logits, feed_dict = feed_dict)
                #print(logits[0:10])
                #y_pred_cls = session.run(model.y_pred_cls, feed_dict = feed_dict)
                #print(y_pred_cls[0:10])
                total_batch += 1

                #if total_batch - last_improved > require_improvement:
                #    # 验证集正确率长期不提升，提前结束训练
                #    print("No optimization for a long time, auto-stopping...")
                #    flag = True
                #    break  # 跳出循环

            if flag:  # 同上
                break

In [3]:
modelCNN = CnnKMaxPoolModel()

weight shape: [7, 100, 1, 6]
weight shape: [1200, 100]
weight shape: [100, 14]
input shape0= (?, 50)
sent_embed shape= Tensor("inference/embedding_lookup:0", shape=(?, 50, 100), dtype=float32)
input shape: (?, 50, 100, 1)
input_unstack shape: 100
conv1-con shape= (?, 50, 100, 6)
conv1-kemax-pool shape= (?, 50, 100, 6)
trained shape= (?, 1200)
out shape= (?, 14)
232015


In [4]:
print("load datasets ...")
x_train, y_train = loadTrainOrTest_data_oneLabel(modelCNN.config.train_data_path, modelCNN.config.train_label_path,modelCNN.vocabulary_word2index)
x_val, y_val = loadTrainOrTest_data_oneLabel(modelCNN.config.test_data_path, modelCNN.config.test_label_path, modelCNN.vocabulary_word2index)

load datasets ...


In [5]:
print("start padding ...")
x_train = pad_sequences(x_train, modelCNN.config.sentence_length)  # padding to max length
x_val = pad_sequences(x_val, modelCNN.config.sentence_length)  # padding to max length

start padding ...


In [6]:
modelCNN.train(x_train, y_train, x_val, y_val)

Configuring TensorBoard and Saver...
Loading training and validation data...
vocab_size= 232014
word. exists embedding: 232014  ;word not exist embedding: 0
using pre-trained word emebedding.ended...
Training and evaluating...
Epoch: 1
data_len= 86095  num_batch= 1346
data_len= 21167  num_batch= 331
total_batch= 1  total_loss= 70.4453667204  total_acc= 0.0746444937924
[ 6 10  7  2  7 12  8  4  6  4]
[ 8.  8.  8.  8.  8.  8.  8.  8.  8.  8.]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1577
          1       0.00      0.00      0.00      1589
          2       0.00      0.00      0.00      1590
          3       0.00      0.00      0.00      1583
          4       0.00      0.00      0.00      1598
          5       0.00      0.00      0.00      1599
          6       0.00      0.00      0.00      1591
          7       0.00      0.00      0.00      1587
          8       0.07      1.00      0.14      1580
          9       0.00  

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1577
          1       0.00      0.00      0.00      1589
          2       0.00      0.00      0.00      1590
          3       0.00      0.00      0.00      1583
          4       0.00      0.00      0.00      1598
          5       0.00      0.00      0.00      1599
          6       0.00      0.00      0.00      1591
          7       0.00      0.00      0.00      1587
          8       0.07      1.00      0.14      1580
          9       0.00      0.00      0.00      1585
         10       0.00      0.00      0.00      1599
         11       0.00      0.00      0.00      1145
         12       0.00      0.00      0.00      1254
         13       0.00      0.00      0.00      1290

avg / total       0.01      0.07      0.01     21167

data_len= 21167  num_batch= 331
total_batch= 1001  total_loss= 1.33301044649  total_acc= 0.713469079162
[ 6  1  6  5  8 10  0  5  3 11]
[  6.   1.   

In [8]:
print("end")

end
