In [1]:
# -*- coding: utf8 -*-
import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics
from gensim.models import Word2Vec

from CNNDiseaseModel import CNNDisease,TCNNConfig
from dataOwn import *
import pickle


In [2]:
word2vec_model_path = "../modelKey/word2VecModelsh.bin15_100_1e-05_15"
vocabulary_word2index, vocabulary_index2word  = create_voabulary(word2vec_model_path)

In [3]:
def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.dropout_keep_prob: keep_prob
    }
    return feed_dict


def evaluate(sess, x, y):
    """评估在某一数据上的准确率和损失"""
    #batch_eval = batch_iter(x_, y_, config.batch_size)
    batch_size = config.batch_size
    total_loss = 0.0
    total_acc = 0.0
    eval_out = []
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    print("data_len=", data_len, " num_batch=", num_batch)
    x = np.array(x)
    y = np.array(y)
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        x_batch, y_batch =  x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc, y_pred_cls = sess.run([model.loss, model.acc, model.y_pred_cls], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len
        eval_out = np.concatenate([eval_out, y_pred_cls])

    return total_loss / data_len, total_acc / data_len, eval_out, y_shuffle
                                 

In [4]:
def test():
    print("Loading test data...")
    start_time = time.time()
    #x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = loadTrainOrTest_data_oneLabel(test_data_path, test_label_path, vocabulary_word2index,dictPath)
    x_val = pad_sequences(x_val, sequence_length)  # padding to max length
    
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # 读取保存的模型
    
    print('Testing...')
    total_loss, total_acc, eval_out, y_real = evaluate(session, x_val, y_val)
    print("total_loss=", total_loss, " total_acc=", total_acc)
    print(y_real[0:10])
    print(eval_out[0:10])
    print(classification_report(y_real,eval_out))
    
    cm = metrics.confusion_matrix(y_real, eval_out)
    print("cm====\n", cm)
    
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)          

In [5]:
print('Configuring RNN model...')
config = TCNNConfig()


Configuring RNN model...


In [6]:

config.vocab_size = len(vocabulary_word2index)+1
model = CNNDisease(config)

In [7]:
# Load data
print("Loading data...")
x_train, y_train = loadTrainOrTest_data_oneLabel(config.train_data_path, config.train_label_path,vocabulary_word2index)
x_val, y_val = loadTrainOrTest_data_oneLabel(config.test_data_path, config.test_label_path, vocabulary_word2index)

Loading data...


In [8]:
# 2.Data preprocessing.Sequence padding
print("start padding ...")
x_train = pad_sequences(x_train, config.sequence_length)  # padding to max length
x_val = pad_sequences(x_val, config.sequence_length)  # padding to max length

start padding ...


In [9]:
word2vecModel = Word2Vec.load(config.word2vec_model_path)

In [10]:

save_dir = 'checkpoints/textrnn'
save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径

In [11]:
from sklearn.metrics import classification_report
def train():
    print("Configuring TensorBoard and Saver...")
    # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # 配置 Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # 载入训练集与验证集
    start_time = time.time()
    #x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    #x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # 创建session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)
    
    #y_real = session.run(tf.argmax(y_val, 1, name="predictions_real"))
    
    if config.use_embedding:  # load pre-trained word embedding
        assign_pretrained_word_embedding(session, model, word2vecModel,config.embed_size )

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # 总批次
    best_acc_val = 0.0  # 最佳验证集准确率
    last_improved = 0  # 记录上一次提升批次
    require_improvement = 1000  # 如果超过1000轮未提升，提前结束训练

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
            #print("total_batch=", total_batch,"config.save_per_batch=",config.save_per_batch)
            #print("total_batch=",total_batch,config.print_per_batch )
            if total_batch % config.print_per_batch == 1:
                # 每多少轮次输出在训练集和验证集上的性能
                total_loss, total_acc, eval_out, y_real = evaluate(session, x_val, y_val)
                #y_realNew = session.run(y_real)
                print("total_batch=", total_batch, " total_loss=", total_loss, " total_acc=", total_acc)
                print(y_real[0:10])
                print(eval_out[0:10])
                print(classification_report(y_real,eval_out))
    
                if total_acc > best_acc_val:
                    # 保存最好结果
                    best_acc_val = total_acc
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                    print("==========best==========")

                    print(classification_report(y_real,eval_out))
                else:
                    improved_str = ''
            if total_batch % config.save_per_batch == 1:
                # 每多少轮次将训练结果写入tensorboard scalar
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)


                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                #print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.train_op, feed_dict=feed_dict)  # 运行优化
            #logits = session.run(model.logits, feed_dict = feed_dict)
            #print(logits[0:10])
            #y_pred_cls = session.run(model.y_pred_cls, feed_dict = feed_dict)
            #print(y_pred_cls[0:10])
            total_batch += 1

            #if total_batch - last_improved > require_improvement:
            #    # 验证集正确率长期不提升，提前结束训练
            #    print("No optimization for a long time, auto-stopping...")
            #    flag = True
            #    break  # 跳出循环
        
        if flag:  # 同上
            break
            

In [12]:
train()

Configuring TensorBoard and Saver...
Loading training and validation data...
Time usage: 0:00:00
vocab_size= 232014
word. exists embedding: 232014  ;word not exist embedding: 0
using pre-trained word emebedding.ended...
Training and evaluating...
Epoch: 1
data_len= 86095  num_batch= 1346
data_len= 21167  num_batch= 331
total_batch= 1  total_loss= 70.9931519089  total_acc= 0.0884867954776
[ 1 11 11  0  0  0 10 12 10 13]
[ 1.  1.  1.  4.  1.  1.  1.  1.  1.  1.]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1577
          1       0.08      0.88      0.15      1589
          2       0.05      0.01      0.01      1590
          3       0.00      0.00      0.00      1583
          4       0.13      0.25      0.17      1598
          5       0.00      0.00      0.00      1599
          6       0.05      0.00      0.00      1591
          7       0.00      0.00      0.00      1587
          8       0.17      0.00      0.00      1580
    

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1577
          1       0.08      0.88      0.15      1589
          2       0.05      0.01      0.01      1590
          3       0.00      0.00      0.00      1583
          4       0.13      0.25      0.17      1598
          5       0.00      0.00      0.00      1599
          6       0.05      0.00      0.00      1591
          7       0.00      0.00      0.00      1587
          8       0.17      0.00      0.00      1580
          9       0.00      0.00      0.00      1585
         10       0.08      0.04      0.05      1599
         11       0.00      0.00      0.00      1145
         12       0.00      0.00      0.00      1254
         13       0.00      0.00      0.00      1290

avg / total       0.04      0.09      0.03     21167

data_len= 21167  num_batch= 331
total_batch= 1001  total_loss= 2.01986750473  total_acc= 0.665564321796
[13  7 11  2  2  0  3  0  3  1]
[  2.   2.  1