In [1]:
# -*- coding: utf8 -*-
from CNNDiseaseModel import CNNDisease
from CNNdata_helpers import *

In [2]:
from gensim.models import Word2Vec
import pickle
import tensorflow as tf
import numpy as np
import datetime

In [3]:
# ---------------------------------------------------------------------------
# Notebook configuration. Every value is registered as a TensorFlow flag and
# read by the cells below through the module-level ``FLAGS`` object.
# ``tf.flags`` and ``tf.app.flags`` name the same flags module; one spelling
# is used throughout.
# ---------------------------------------------------------------------------

# Model / optimisation hyper-parameters.
tf.flags.DEFINE_integer("num_filters", 32, "number of filter each filter_size")
tf.flags.DEFINE_integer("num_classes", 22, "number of labels")
tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
tf.flags.DEFINE_integer("batch_size", 64, "batch size for train or evaluate")
tf.flags.DEFINE_integer("sequence_length", 81, "max sequence_length")
tf.flags.DEFINE_integer("embed_size", 100, "embedding size")
tf.flags.DEFINE_integer("num_epochs", 20, "number of epochs to train.")
tf.flags.DEFINE_boolean("is_training", True, "if is train step")
tf.flags.DEFINE_integer("decay_steps", 3500, "how many steps before decay learning rate.")
tf.flags.DEFINE_float("decay_rate", 0.65, "Rate of decay for learning rate.")

# Misc parameters (device placement).
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# Input data and pre-trained embedding locations.
tf.flags.DEFINE_string(
    "train_data_path", "./cnnModel/datasets/trainall.feature", "path of traning data.")
tf.flags.DEFINE_string(
    "train_label_path", "./cnnModel/datasets/trainall.label", "path of labels of traning data.")
tf.flags.DEFINE_string(
    "test_data_path", "./cnnModel/datasets/testall.feature", "Test data source")
tf.flags.DEFINE_string(
    "test_label_path", "./cnnModel/datasets/testall.label", "Label for test data")
tf.flags.DEFINE_string(
    "word2vec_model_path", "./model/word2VecModelsh.bin5_100_1e-05_15",
    "word2vec's vocabulary and vectors")

# Training-loop behaviour and checkpointing.
tf.flags.DEFINE_integer("num_checkpoints", 5, "number of checkpoints")
tf.flags.DEFINE_boolean("use_embedding", True, "if use pre trained word2vec embedding")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "dropout_keep_prob")
tf.flags.DEFINE_integer("validate_every", 5, "Validate every validate_every epochs.")
tf.flags.DEFINE_string(
    "ckpt_dir", "./runs/cnn_disease_checkpoint3in/", "checkpoint location for the model")

FLAGS = tf.flags.FLAGS


In [4]:
def create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope=''):
    """Build (or load from cache) the word <-> index mappings of a word2vec model.

    Index 0 is reserved for the literal token ``"0"``; every word of the
    model's vocabulary is then numbered from 1 in the iteration order of
    ``model.wv.vocab``. The two dictionaries are pickled to
    ``./cache_vocabulary_label_pik3in/<name_scope>_word_voabulary.pik`` and
    reused on later calls.

    Args:
        word2vec_model_path: path handed to ``Word2Vec.load`` when no cache
            file exists.
        name_scope: prefix of the cache file name.

    Returns:
        ``(vocabulary_word2index, vocabulary_index2word)``.

    Note:
        The cache file name depends only on ``name_scope``; a cache written
        for a different word2vec model is reused as-is. Delete the cache file
        when the model changes.
    """
    cache_path = './cache_vocabulary_label_pik3in/' + name_scope + "_word_voabulary.pik"

    # Fast path: reuse the previously pickled mappings.
    # NOTE(review): pickle.load can execute arbitrary code; this is only
    # acceptable because the cache is produced locally by this function.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as data_f:
            vocabulary_word2index, vocabulary_index2word = pickle.load(data_f)
        return vocabulary_word2index, vocabulary_index2word

    model = Word2Vec.load(word2vec_model_path)
    print("vocabulary:", len(model.wv.vocab))

    vocabulary_word2index = {"0": 0}
    vocabulary_index2word = {0: "0"}
    for i, vocab in enumerate(model.wv.vocab):
        vocabulary_word2index[vocab] = i + 1
        vocabulary_index2word[i + 1] = vocab
    print(len(vocabulary_word2index))

    # Make sure the cache directory exists; on a fresh checkout the original
    # code failed here because open(..., 'wb') does not create directories.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'wb') as data_f:
        pickle.dump((vocabulary_word2index, vocabulary_index2word), data_f)
    return vocabulary_word2index, vocabulary_index2word

In [5]:
def assign_pretrained_word_embedding(sess, cnnDisease, word2vec_model):
    """Overwrite ``cnnDisease.Embedding`` with vectors from ``word2vec_model``.

    Row 0 and any word whose vector cannot be looked up receive a random
    uniform vector in ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. Row ``i + 1`` holds the vector of
    the ``i``-th word in the iteration order of ``word2vec_model.wv.vocab``,
    the same numbering ``create_voabulary`` uses.

    Args:
        sess: session used to run the assign op.
        cnnDisease: model object exposing the ``Embedding`` variable.
        word2vec_model: loaded gensim word2vec model supplying the vectors.
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:", FLAGS.word2vec_model_path)
    vocab_size = len(word2vec_model.wv.index2word)
    print("vocab_size=", vocab_size)

    bound = np.sqrt(6.0) / np.sqrt(vocab_size)
    # Start with every row randomly initialised; rows with a pre-trained
    # vector are overwritten below.
    word_embedding_final = np.random.uniform(-bound, bound, (vocab_size + 1, FLAGS.embed_size))
    count_exist = 0
    count_not_exist = 0
    # Iterate over the model passed in. The original looped over the notebook
    # global `model`, silently ignoring the `word2vec_model` argument.
    for i, word in enumerate(word2vec_model.wv.vocab):
        try:
            embedding = word2vec_model.wv[word]
        except KeyError:
            # Word has no vector: keep the random initialisation for this row.
            count_not_exist += 1
            continue
        word_embedding_final[i + 1] = embedding
        count_exist += 1

    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)  # convert to tensor
    # Assign the matrix to the embedding variable of the model.
    t_assign_embedding = tf.assign(cnnDisease.Embedding, word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")

In [6]:
def load_data(train_data_path, train_label_path, test_data_path, test_label_path, vocabulary_word2index):
    """Load the training and test splits.

    Each split is read with ``loadTrainOrTest_data(feature_path, label_path,
    vocabulary_word2index)``, training split first.

    Returns:
        ``(train, test)`` where each element is an ``(x, y)`` tuple.
    """
    print("Loading data...")
    split_sources = (
        (train_data_path, train_label_path),
        (test_data_path, test_label_path),
    )
    loaded = []
    for feature_path, label_path in split_sources:
        features, labels = loadTrainOrTest_data(feature_path, label_path, vocabulary_word2index)
        loaded.append((features, labels))
    train, test = loaded
    return train, test

In [7]:
# Make evaluation on test set
def do_eval(sess, cnnDisease, evalX, evalY, batch_size):
    """Evaluate the model on ``(evalX, evalY)`` in full batches.

    Dropout is disabled by feeding ``dropout_keep_prob = 1.0``. Only complete
    batches of ``batch_size`` examples are evaluated; a trailing partial batch
    is skipped, as in the original implementation.

    Args:
        sess: session used to run the evaluation ops.
        cnnDisease: model exposing ``input_x``, ``input_y``,
            ``dropout_keep_prob``, ``loss_val`` and ``accuracy``.
        evalX: evaluation inputs, sliceable by ``[start:end]``.
        evalY: evaluation labels aligned with ``evalX``.
        batch_size: number of examples per evaluation batch.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the evaluated batches.

    Raises:
        ValueError: if ``evalX`` holds fewer than ``batch_size`` examples, so
            no batch can be evaluated (previously a ``ZeroDivisionError``).
    """
    number_examples = len(evalX)
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    # The stop bound admits a batch that ends exactly at number_examples; the
    # original zip-of-ranges dropped that last full batch.
    for start in range(0, number_examples - batch_size + 1, batch_size):
        end = start + batch_size
        feed_dict = {
            cnnDisease.input_x: evalX[start:end],
            cnnDisease.input_y: evalY[start:end],
            cnnDisease.dropout_keep_prob: 1.0,
        }
        curr_eval_loss, curr_eval_acc = sess.run([cnnDisease.loss_val, cnnDisease.accuracy], feed_dict)
        eval_loss += curr_eval_loss
        eval_acc += curr_eval_acc
        eval_counter += 1

    if eval_counter == 0:
        raise ValueError(
            "do_eval needs at least batch_size=%d examples, got %d" % (batch_size, number_examples))
    return eval_loss / float(eval_counter), eval_acc / float(eval_counter)

In [8]:
# Count the lines of the training feature file. The `with` block closes the
# file handle when the loop finishes; the original left it open.
count = 0
with open(FLAGS.train_data_path, "r") as f:
    for line in f:
        count += 1

In [9]:
    # Step 1: load the word2vec model, build (or load from cache) the
    # vocabulary mappings, then read the train and test splits.
    # Per the original note: X is a list of ints, y is an int.
    model = Word2Vec.load(FLAGS.word2vec_model_path)
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn"
    )

    # Report both size measures; vocab_size ends up holding the word2vec one.
    vocab_size = len(vocabulary_word2index)
    print("vocabulary_word2index.vocab_size:", vocab_size)
    vocab_size = len(model.wv.index2word)
    print("cnn_model.vocab_size:", vocab_size)

    trainX = trainY = testX = testY = None
    train, test = load_data(
        FLAGS.train_data_path,
        FLAGS.train_label_path,
        FLAGS.test_data_path,
        FLAGS.test_label_path,
        vocabulary_word2index,
    )
    (trainX, trainY), (testX, testY) = train, test

vocabulary_word2index.vocab_size: 192413
cnn_model.vocab_size: 192413
Loading data...


In [10]:
    # Step 2: sequence padding. Both splits go through pad_sequences with
    # maxlen=FLAGS.sequence_length, training split first.
    # NOTE(review): the message mentions one-hot encoding, but this cell only pads.
    print("start padding & transform to one hot...")
    trainX, testX = [
        pad_sequences(split, maxlen=FLAGS.sequence_length)
        for split in (trainX, testX)
    ]

start padding & transform to one hot...
maxlen= 81
maxlen= 81


In [11]:
import random

# Scratch demo: random.shuffle reorders the list in place.
x = list(range(10))
print(x)
random.shuffle(x)
print(x)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[7, 2, 1, 0, 3, 4, 6, 8, 5, 9]


In [12]:
    import random
    # 3. Create the session, build the model and train it.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate the model.
        filter_sizes = [3, 4, 5]
        cnnDisease = CNNDisease(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate,
                                FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                vocab_size, FLAGS.embed_size, FLAGS.is_training)
        # Saver that keeps at most FLAGS.num_checkpoints checkpoints.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
        print('Initializing Variables')
        sess.run(tf.global_variables_initializer())

        if FLAGS.use_embedding:  # load pre-trained word embedding
            assign_pretrained_word_embedding(sess, cnnDisease, model)
        curr_epoch = sess.run(cnnDisease.epoch_step)

        print("curr_epoch=", curr_epoch)
        number_of_training_data = len(trainX)
        print("number_of_training_data=", number_of_training_data)

        batch_size = FLAGS.batch_size
        print("batch_size=", batch_size)
        # 4. Feed data. The range is inclusive of FLAGS.num_epochs.
        for epoch in range(curr_epoch, FLAGS.num_epochs + 1):
            loss, acc, counter = 0.0, 0.0, 0

            # Shuffle the data at the start of every epoch.
            # NOTE(review): the seed is reset to 10 each epoch, so the same
            # permutation is drawn every time and applied to the arrays left
            # over from the previous epoch.
            np.random.seed(10)
            shuffle_indices = np.random.permutation(np.arange(number_of_training_data))
            trainX = trainX[shuffle_indices]
            trainY = trainY[shuffle_indices]

            # Full batches only; a trailing partial batch is skipped. The stop
            # bound admits a batch ending exactly at number_of_training_data,
            # which the original zip-of-ranges dropped.
            for start in range(0, number_of_training_data - batch_size + 1, batch_size):
                end = start + batch_size
                if counter == 0:
                    # Debug print of the first two rows of the epoch.
                    print("trainX[start:end]:", trainX[start:2])
                # Use the word indices as the model input.
                feed_dict = {
                    cnnDisease.input_x: trainX[start:end],
                    cnnDisease.input_y: trainY[start:end],
                    cnnDisease.dropout_keep_prob: FLAGS.dropout_keep_prob,
                }
                # 5. Run one training step.
                curr_loss, curr_acc, _ = sess.run([cnnDisease.loss_val, cnnDisease.accuracy, cnnDisease.train_op],
                                                  feed_dict)

                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 100 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (
                        epoch, counter, loss / float(counter), acc / float(counter)))

            # Epoch increment.
            print("going to increment epoch counter....")
            sess.run(cnnDisease.epoch_increment)

            # 6. Validation and checkpointing every FLAGS.validate_every epochs.
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, cnnDisease, testX, testY, batch_size)
                time_str = datetime.datetime.now().isoformat()
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f\t time: %s" % (
                    epoch, eval_loss, eval_acc, time_str))
                # Save the model. exist_ok avoids the crash the original hit
                # when the directory existed but held no "checkpoint" file.
                os.makedirs(FLAGS.ckpt_dir, exist_ok=True)
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)


learing rate: <tf.Variable 'learning_rate:0' shape=() dtype=float32_ref>
global_step: <tf.Variable 'global_step:0' shape=() dtype=int32_ref>
decay_steps: 3500
decay_rate: 0.65
decay_steps=3500 decay_rate=0.650000
Initializing Variables
using pre-trained word emebedding.started.word2vec_model_path: ./model/word2VecModelsh.bin5_100_1e-05_15
vocab_size= 192413
word. exists embedding: 192413  ;word not exist embedding: 0
using pre-trained word emebedding.ended...
curr_epoch= 0
number_of_training_data= 46239
batch_size= 64
trainX[start:end]: [[169772  13919 121828  98928 117592  61202 100656  49430  10186 192114
  172887  35237  98333 113032 149534  29235 153913 189587 154865  18223
   31658 120780 147987  52378 103837   4155  23245 106036  25200 112186
   33199  76013 121102 147540  23453   7027 172435  54476   4382  33088
    7487 117592   6272 138917  76121 169772  17923  56255  49430  68699
   68699  68699  68699  68699  68699  68699  68699  68699  68699  68699
   68699 117592   6272 13