In [1]:
import os
import os.path
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf

import input_data_vw as input_data
import VGG
import tools

# Dataset locations: training images and the held-out validation/test images.
data_dir = "/home/gps/HDD/dataset_awr/RTM_v4_LargeScale/trains/"
val_dir = "/home/gps/HDD/dataset_awr/RTM_v4_LargeScale/test/"

# Pre-trained VGG16 weights stored as a .npy file.
pre_trained_weights = '../vgg16_pretrain/vgg16.npy'

# Output directories for the training and validation logs
# (train() also writes its checkpoints under train_log_dir).
val_log_dir = './logs//val//'
train_log_dir = './logs/train//'

# 训练
def train():
    """Fine-tune a VGG16 network on the dataset under ``data_dir``.

    Builds the input queues, the network, the loss/accuracy/optimizer ops and
    then runs the training loop for ``MAX_STEP`` iterations:

    * every 20 steps the training loss/accuracy are printed and a summary is
      written to ``train_log_dir``;
    * every 100 steps one batch from ``val_dir`` is scored and a summary is
      written to ``val_log_dir``;
    * every 500 steps a checkpoint is saved under ``train_log_dir`` and
      ``evaluate`` is run on it.

    The network is built on placeholders, so whatever batch is passed through
    ``feed_dict`` (training or validation) is the batch that is actually
    scored.  Returns nothing; all results go to stdout and the log directories.
    """
    IS_PRETRAIN = True
    # Hyper-parameters.
    N_CLASSES = 5
    IMG_W = 208  # images are resized; large inputs make training very slow
    IMG_H = 208
    BATCH_SIZE = 16
    CAPACITY = 2000
    learning_rate = 0.003
    MAX_STEP = 15000  # total number of training iterations

    # Queue-based input pipelines for the training and validation sets.
    with tf.name_scope('input'):
        tra_image_batch, tra_label_batch = input_data.get_batch(data_dir,
                                                                IMG_W,
                                                                IMG_H,
                                                                BATCH_SIZE,
                                                                CAPACITY,
                                                                N_CLASSES)
        val_image_batch, val_label_batch = input_data.get_batch(val_dir,
                                                                IMG_W,
                                                                IMG_H,
                                                                BATCH_SIZE,
                                                                CAPACITY,
                                                                N_CLASSES)

    # Placeholders the network is built on.  Their dtypes are taken from the
    # queue tensors so the model ops see exactly the dtypes the input pipeline
    # produces.
    data_ph = tf.placeholder(tra_image_batch.dtype,
                             shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    label_ph = tf.placeholder(tra_label_batch.dtype,
                              shape=[BATCH_SIZE, N_CLASSES])

    # Build the model on the placeholders (NOT on the training queue tensors):
    # otherwise every feed_dict is ignored and the "validation" metrics are
    # silently computed on training data.
    logits = VGG.VGG16N(data_ph, N_CLASSES, IS_PRETRAIN)
    loss = tools.loss(logits, label_ph)
    accuracy = tools.accuracy(logits, label_ph)
    my_global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tools.optimize(loss, learning_rate, my_global_step)
    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()
    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    # Load the pre-trained VGG weights, skipping the listed layers.
    tools.load_with_skip(pre_trained_weights, sess, ['fc6','fc7','fc8'])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    tra_summary_writer = tf.summary.FileWriter(train_log_dir, sess.graph)
    val_summary_writer = tf.summary.FileWriter(val_log_dir, sess.graph)

    try:
        for step in range(MAX_STEP):
            if coord.should_stop():
                break
            is_last_step = (step + 1) == MAX_STEP

            # Pull one training batch out of the queue as numpy arrays and
            # feed it to the placeholders the model is built on.
            tra_images, tra_labels = sess.run([tra_image_batch, tra_label_batch])
            train_feed = {data_ph: tra_images, label_ph: tra_labels}

            if step % 20 == 0 or is_last_step:
                # Fetch the summary in the same run so it describes the batch
                # that was just trained on.
                _, tra_loss, tra_acc, summary_str = sess.run(
                    [train_op, loss, accuracy, summary_op], feed_dict=train_feed)
                print ('Step: %d, loss: %.4f, accuracy: %.4f%%' % (step, tra_loss, tra_acc))
                tra_summary_writer.add_summary(summary_str, step)
            else:
                sess.run(train_op, feed_dict=train_feed)

            # Every 100 steps, score one validation batch.
            if step % 100 == 0 or is_last_step:
                val_images, val_labels = sess.run([val_image_batch, val_label_batch])
                val_loss, val_acc, summary_str = sess.run(
                    [loss, accuracy, summary_op],
                    feed_dict={data_ph: val_images, label_ph: val_labels})
                print('**  Step %d, val loss = %.2f, val accuracy = %.2f%%  **' %(step, val_loss, val_acc))
                val_summary_writer.add_summary(summary_str, step)

            # Every 500 steps, checkpoint the model and evaluate that checkpoint.
            if step % 500 == 0 or is_last_step:
                checkpoint_path = os.path.join(train_log_dir,"model")
                saver.save(sess, checkpoint_path, global_step=step)
                evaluate(step)
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # Always stop the queue threads and release resources, even when an
        # unexpected exception is propagating.
        coord.request_stop()
        try:
            coord.join(threads)
        finally:
            tra_summary_writer.close()
            val_summary_writer.close()
            sess.close()





    
#%%   Test the accuracy on test dataset. got about 85.69% accuracy.
import math
def evaluate(train_step, n_classes=5, img_w=208, img_h=208,
             batch_size=16, capacity=2000):
    """Score the checkpoint saved at ``train_step`` on the data in ``val_dir``.

    A fresh graph is built, the checkpoint ``model-<train_step>`` under
    ``train_log_dir`` is restored, and as many full batches as fit in the test
    set are run through the network.  The sample count, the number of correct
    predictions and the average accuracy are printed.

    Args:
        train_step: global step of the checkpoint to restore.
        n_classes, img_w, img_h, batch_size, capacity: input/model settings
            forwarded to ``input_data.get_batch`` and ``VGG.VGG16N``.  The
            defaults mirror the values hard-coded in ``train()``, which were
            previously referenced here as undefined globals (NameError).

    Returns:
        None.
    """
    IS_PRETRAIN = False
    with tf.Graph().as_default():
        test_dir = val_dir
        # NOTE(review): this counts the entries directly inside test_dir; it is
        # only the number of test samples if the images sit directly in that
        # directory -- confirm against input_data.get_batch.
        n_test = len(os.listdir(test_dir))
        images, labels = input_data.get_batch(test_dir, img_w, img_h,
                                              batch_size, capacity, n_classes)

        logits = VGG.VGG16N(images, n_classes, IS_PRETRAIN)
        correct = tools.num_correct_prediction(logits, labels)
        saver = tf.train.Saver(tf.global_variables())

        # Only full batches are evaluated.
        num_step = n_test // batch_size
        num_sample = num_step * batch_size
        if num_sample == 0:
            # Avoid a ZeroDivisionError in the accuracy computation below.
            print('Not enough test samples (%d) for one batch of %d' % (n_test, batch_size))
            return

        with tf.Session() as sess:
            print("Reading checkpoints...")
            # Same prefix that train() passes to saver.save().
            model = os.path.join(train_log_dir, "model-%d" % train_step)
            saver.restore(sess, model)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess = sess, coord = coord)

            try:
                print('\nEvaluating......')
                step = 0
                total_correct = 0
                while step < num_step and not coord.should_stop():
                    batch_correct = sess.run(correct)
                    total_correct += np.sum(batch_correct)
                    step += 1
                print('Total testing samples: %d' %num_sample)
                print('Total correct predictions: %d' %total_correct)
                print('Average accuracy: %.2f%%' %(100*total_correct/num_sample))
            except Exception as e:
                # Hand the error to the coordinator; coord.join() below
                # re-raises it once the queue threads have stopped.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)
#%%

In [2]:
# Launch the full training run: builds the graph, trains for up to MAX_STEP
# steps, and writes logs and checkpoints under ./logs/.
train()

Step: 0, loss: 1.4936, accuracy: 12.5000%


NameError: name 'x' is not defined