# 迁移学习---使用google Inception-v3预训练模型实现花的分类

## 1. 下载数据集

```
curl http://download.tensorflow.org/example_images/flower_photos.tgz
tar xzf flower_photos.tgz
```

## 2.下载预训练的Inception-v3 模型

```
wget https://storage.googleapis.com/download.tensorflow.org/models/inception_dec_2015.zip
unzip tensorflow/examples/label_image/data/inception_dec_2015.zip
```

## 3. 实现迁移学习代码

In [2]:
import glob
import os.path
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.platform import gfile

In [3]:
BOTTLENECK_TENSOR_SIZE = 2048
BOTTLENECK_TENSOR_NAME = 'pool_3/_reshape:0'

JPEG_DATA_TENSOR_NAME = 'DecodeJpeg/contents:0'

#下载的Inception-v3 模型文件目录和模型文件名
MODEL_DIR = './models/'
MODEL_FILE = 'tensorflow_inception_graph.pb'

CACHE_DIR = '/tmp/bottleneck'

# 图片数据文件夹,文件夹下没一个子文件夹代表一个需要区分的类别
INPUT_DATA = './flower_photos/'
# 验证集和测试集的数据百分比
VALIDATION_PERCENTAGE = 10
TEST_PERCENTAGE = 10

# 神经网络的设置
LEARNING_RATE = 0.01
STEPS = 4000
BATCH = 100

In [4]:
# 从数据集中读取所有图片列表，并按train, validation, test 分开
def create_image_lists(testing_percentage, validation_percentage):
    result = {}
    #获取当前目录下所有子目录
    sub_dirs = [x[0] for x in os.walk(INPUT_DATA)]
    is_root_dir = True
    for sub_dir in sub_dirs:
        if is_root_dir:
            is_root_dir = False
            continue
        # 获取当前目录下所有有效图片文件
        extensions = ['jpg', 'jpeg', 'JPG','JPEG']
        file_list =[]
        dir_name = os.path.basename(sub_dir)
        for extension in extensions:
            file_glob =  os.path.join(INPUT_DATA, dir_name, '*.'+extension)
            file_list.extend(glob.glob(file_glob))
        if not file_list:
            continue
        
        # 通过目录名获取类别名
        label_name = dir_name.lower()
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name= os.path.basename(file_name)
            chance = np.random.randint(100)
            if chance< validation_percentage:
                validation_images.append(base_name)
            elif chance<(validation_percentage+testing_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)
                
        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images,
        }
    return result

In [5]:
image_lists = create_image_lists(TEST_PERCENTAGE, VALIDATION_PERCENTAGE)
image_lists['daisy']

{'dir': 'daisy',
 'testing': ['8383753520_8391dd80ee_m.jpg',
  '433837534_1dbf798b73.jpg',
  '9346508462_f0af3163f4.jpg',
  '3758221664_b19116d61f.jpg',
  '512177035_70afc925c8.jpg',
  '5577555349_2e8490259b.jpg',
  '8694909523_3ca25d449d_n.jpg',
  '8489463746_a9839bf7e4.jpg',
  '2001380507_19488ff96a_n.jpg',
  '9595857626_979c45e5bf_n.jpg',
  '5722473541_ffac1ae67e_n.jpg',
  '5058708968_8bdcd29e63_n.jpg',
  '6596277835_9f86da54bb.jpg',
  '20182559506_40a112f762.jpg',
  '5561775629_a2b709b3a4_n.jpg',
  '54377391_15648e8d18.jpg',
  '8887005939_b19e8305ee.jpg',
  '16161045294_70c76ce846_n.jpg',
  '433037739_6a030e5912.jpg',
  '4432271543_01c56ca3a9.jpg',
  '2590291468_2635d3e4e0_n.jpg',
  '799952628_bf836677fa_n.jpg',
  '1441939151_b271408c8d_n.jpg',
  '20329326505_a777c71cc2.jpg',
  '3379332157_04724f6480.jpg',
  '3415180846_d7b5cced14_m.jpg',
  '10993710036_2033222c91.jpg',
  '14621687774_ec52811acd_n.jpg',
  '4683997791_56e7d3c03c_n.jpg',
  '4727955343_0bb23ac4ae.jpg',
  '5110107234_1

In [6]:
#获取图片地址
def get_image_path(image_lists, image_dir, label_name,index, category):
    label_lists = image_lists[label_name]
    category_list = label_lists[category]
    mod_index = index%len(category_list)\
    # get image name
    base_name = category_list[mod_index]
    sub_dir = label_lists['dir']
    full_path =os.path.join(image_dir, sub_dir, base_name)
    return full_path

In [7]:
get_image_path(image_lists, INPUT_DATA, 'daisy', 1, 'training')

'./flower_photos/daisy/14147016029_8d3cf2414e.jpg'

In [8]:
#获取 经Inception-v3 模型处理后的特征向量文件地址
def get_bottleneck_path(image_lists, label_name, index, category):
    return get_image_path(image_lists, CACHE_DIR, label_name, index, category) + '.txt'

# 用Inception-v3模型处理一张图片，获得其特征向量
def run_bottleneck_on_image(sess, image_data, image_data_tensor, bottleneck_tensor):
    bottleneck_values = sess.run(bottleneck_tensor, {image_data_tensor:image_data})
    bottleneck_values = np.squeeze(bottleneck_values)
    return bottleneck_values


In [9]:
# 获得图片经过Inception-V3模型处理之后的特征向量
def get_or_create_bottleneck(sess, image_lists, label_name, index, category, jpeg_data_tensor, bottleneck_tensor):
    label_lists = image_lists[label_name]
    sub_dir = label_lists['dir']
    sub_dir_path = os.path.join(CACHE_DIR, sub_dir)
    if not os.path.exists(sub_dir_path):
        os.makedirs(sub_dir_path)
    bottleneck_path = get_bottleneck_path(image_lists, label_name, index, category)
    
    if not os.path.exists(bottleneck_path):
        image_path = get_image_path(image_lists, INPUT_DATA, label_name, index, category)
        # get image data
        image_data = gfile.FastGFile(image_path, 'rb').read()
        bottleneck_values = run_bottleneck_on_image(sess, image_data, jpeg_data_tensor, bottleneck_tensor)
        bottleneck_string = ','.join(str(x) for x in bottleneck_values)
        with open(bottleneck_path, 'w') as bottleneck_file:
            bottleneck_file.write(bottleneck_string)
    else:
        with open(bottleneck_path, 'r') as bottleneck_file:
            bottleneck_string = bottleneck_file.read()
        bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
        
    return bottleneck_values

In [10]:
# 随机获取一个batch的图片作为训练数据
def  get_random_cached_bottlenecks(sess, n_classes, image_lists, how_many, category, jpeg_data_tensor, bottleneck_tensor):
    bottlenecks = []
    ground_truths = []
    for _ in range(how_many):
        label_index = random.randrange(n_classes)
        label_name = list(image_lists.keys())[label_index]
        image_index = random.randrange(65536)
        bottleneck = get_or_create_bottleneck(sess, image_lists, label_name,
                                              image_index, category, jpeg_data_tensor, bottleneck_tensor)
        ground_truth = np.zeros(n_classes, dtype=np.float32)
        ground_truth[label_index] = 1.0
        ground_truths.append(ground_truth)
        bottlenecks.append(bottleneck)
    return bottlenecks, ground_truths

In [11]:
# 获取全部测试数据
def get_test_bottlenecks(sess, image_lists, n_classes, jpeg_data_tensor, bottleneck_tensor):
    bottlenecks = []
    ground_truths = []
    label_name_list = list(image_lists.keys())
    for label_index, label_name in enumerate(label_name_list):
        category = 'testing'
        for index, unused_base_name in enumerate(image_lists[label_name][category]):
            bottleneck = get_or_create_bottleneck(sess, image_lists, label_name, index, 
                                                  category, jpeg_data_tensor, bottleneck_tensor)
            ground_truth = np.zeros(n_classes, dtype=np.float32)
            ground_truth[label_index] = 1.0
            ground_truths.append(ground_truth)
            bottlenecks.append(bottleneck)
    return bottlenecks, ground_truths

In [12]:
def main():
    image_lists = create_image_lists(TEST_PERCENTAGE, VALIDATION_PERCENTAGE)
    n_classes = len(image_lists.keys())
    # 读取已经训练好的Incepption-v3模型
    with gfile.FastGFile(os.path.join(MODEL_DIR, MODEL_FILE), 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    bottleneck_tensor, jpeg_data_tensor = tf.import_graph_def(graph_def, 
                                                              return_elements=[BOTTLENECK_TENSOR_NAME,JPEG_DATA_TENSOR_NAME])
    
    # 定义新的神经网络输入
    bottleneck_input = tf.placeholder(tf.float32, [None, BOTTLENECK_TENSOR_SIZE], name = 'BottleneckInputPlaceholder')
    ground_truth_input = tf.placeholder(tf.float32, [None, n_classes], name='GroundTruthIInput')
    
    # 新定义一个全连接层来分类
    with tf.name_scope('final_training_ops'):
        weights = tf.Variable(tf.truncated_normal([BOTTLENECK_TENSOR_SIZE, n_classes], stddev=0.001))
        biases = tf.Variable(tf.zeros([n_classes]))
        logits = tf.matmul(bottleneck_input, weights)+ biases
        final_tensor = tf.nn.softmax(logits)
        
    # loss
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_input,logits=logits)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    train_step = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy_mean)
    
    # 计算准确率
    with tf.name_scope('evaluation'):
        correct_prediction = tf.equal(tf.argmax(final_tensor, 1), tf.argmax(ground_truth_input, 1))
        evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        
        # training step
        for i in range(STEPS):
            train_bottlenecks, train_ground_truth = get_random_cached_bottlenecks(sess, n_classes, image_lists, BATCH, 
                                                                                  'training', jpeg_data_tensor, bottleneck_tensor)
            sess.run(train_step, feed_dict={bottleneck_input:train_bottlenecks, ground_truth_input: train_ground_truth})
            
            # 验证机上的正确率
            if i%100 == 0 or i+1 == STEPS:
                validation_bottlenecks, validation_ground_truth = get_random_cached_bottlenecks(sess, n_classes, image_lists,
                                                                                               BATCH, 'validation', jpeg_data_tensor, bottleneck_tensor)
                validation_accuracy = sess.run(evaluation_step, 
                                               feed_dict={bottleneck_input: validation_bottlenecks, ground_truth_input:validation_ground_truth})
                print('Step %d: Validation accuracy on random sampled %d examples= %.lf%%'%(i, BATCH, validation_accuracy*100))
                
        # 测试集上验证数据
        test_bottlenecks, test_ground_truth = get_test_bottlenecks(sess, image_lists, n_classes, jpeg_data_tensor, bottleneck_tensor)
        test_accuracy =sess.run(evaluation_step, 
                                               feed_dict={bottleneck_input: test_bottlenecks, ground_truth_input:test_ground_truth})
        print('Final test accuracy = %.lf%%'%(test_accuracy*100))
            

In [13]:
main()

Step 0: Validation accuracy on random sampled 100 examples= 43%
Step 100: Validation accuracy on random sampled 100 examples= 80%
Step 200: Validation accuracy on random sampled 100 examples= 86%
Step 300: Validation accuracy on random sampled 100 examples= 88%
Step 400: Validation accuracy on random sampled 100 examples= 81%
Step 500: Validation accuracy on random sampled 100 examples= 90%
Step 600: Validation accuracy on random sampled 100 examples= 86%
Step 700: Validation accuracy on random sampled 100 examples= 88%
Step 800: Validation accuracy on random sampled 100 examples= 89%
Step 900: Validation accuracy on random sampled 100 examples= 85%
Step 1000: Validation accuracy on random sampled 100 examples= 85%
Step 1100: Validation accuracy on random sampled 100 examples= 82%
Step 1200: Validation accuracy on random sampled 100 examples= 93%
Step 1300: Validation accuracy on random sampled 100 examples= 93%
Step 1400: Validation accuracy on random sampled 100 examples= 92%
Step 15