# 图片预处理

In [1]:
import os, shutil, platform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import shuffle 

%matplotlib inline 

## 工具函数:判断操作系统种类

In [2]:
print("当前工作目录是:")
print(os.getcwd())

当前工作目录是:
/home/ubuntu/cat_vs_dog_cnn


In [3]:
from PIL import Image
print(Image.__file__)

/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/PIL/Image.py


In [4]:
def is_Windows_OS():
    if platform.system() == 'Windows':
        return True
    else:
        return False

## 定义的目录结构变量

In [5]:
# 训练集图片从zip包解压之后存放的目录
train_images_folder = 'train'

# 训练集图片目录
train_set_folder = 'train1'
train_set_folder_cat = 'train1/cats/'
train_set_folder_dog = 'train1/dogs/'

# 验证集图片目录
validation_set_folder = 'valid'
validation_set_folder_cat = 'valid/cats/'
validation_set_folder_dog = 'valid/dogs/'

## 新建有关目录结构

In [6]:
def rmrf_mkdir(dirname):
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)

rmrf_mkdir(train_set_folder)
os.mkdir(train_set_folder_cat)
os.mkdir(train_set_folder_dog)

rmrf_mkdir(validation_set_folder)
os.mkdir(validation_set_folder_cat)
os.mkdir(validation_set_folder_dog)

## 加载训练集目录

In [7]:
train_filenames = os.listdir(train_images_folder)
train_cat = filter(lambda x:x[:3] == 'cat', train_filenames)
train_dog = filter(lambda x:x[:3] == 'dog', train_filenames)

In [8]:
print (type(train_cat))
print (len(train_filenames))

<class 'filter'>
25000


In [9]:
print (train_filenames[24999])

cat.2814.jpg


定义一组移动有用的函数：

In [10]:
num_of_images_valid_set = 2500
        
def move_cat_images():
    timer = 0
    for filename in train_cat:
        if timer < num_of_images_valid_set:
            shutil.copy2('train/'+filename, validation_set_folder_cat)
        else:
            shutil.copy2('train/'+filename, train_set_folder_cat)
        timer = timer + 1

def move_dog_images():
    timer = 0
    for filename in train_dog:
        if timer < num_of_images_valid_set:
            shutil.copy2('train/'+filename, validation_set_folder_dog)
        else:
            shutil.copy2('train/'+filename, train_set_folder_dog)
        timer = timer + 1  

In [11]:
def fill_train_set_valid_set_folder_with_images():
    move_cat_images()
    move_dog_images()

In [12]:
fill_train_set_valid_set_folder_with_images()
print ('fill_train_set_valid_set_folder_with_images执行完毕')

fill_train_set_valid_set_folder_with_images执行完毕


# 搭建并且编译模型

In [13]:
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.layers import Dropout
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [91]:
batch_size = 64
target_size = (299, 299)

# 不包含原有模型的全连接层
base_model = InceptionResNetV2(include_top=False, 
                             weights='imagenet',
                             input_shape = (299, 299, 3),
                             pooling='avg')

for layer in base_model.layers:
    layer.trainable = True
# 设置预训练模型最后84层可以训练
for layer in base_model.layers[:-119]:
    layer.trainable = False

x = base_model.output

x = Dropout(0.5, name='dropout')(x)

# Classifier
predictions = Dense(units = 1, activation='sigmoid',  name='predictions')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# Compiling the CNN
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [109]:
for i in range(len(base_model.layers)):
    print(i,base_model.layers[i].name)

0 input_5
1 conv2d_813
2 batch_normalization_813
3 activation_813
4 conv2d_814
5 batch_normalization_814
6 activation_814
7 conv2d_815
8 batch_normalization_815
9 activation_815
10 max_pooling2d_17
11 conv2d_816
12 batch_normalization_816
13 activation_816
14 conv2d_817
15 batch_normalization_817
16 activation_817
17 max_pooling2d_18
18 conv2d_821
19 batch_normalization_821
20 activation_821
21 conv2d_819
22 conv2d_822
23 batch_normalization_819
24 batch_normalization_822
25 activation_819
26 activation_822
27 average_pooling2d_5
28 conv2d_818
29 conv2d_820
30 conv2d_823
31 conv2d_824
32 batch_normalization_818
33 batch_normalization_820
34 batch_normalization_823
35 batch_normalization_824
36 activation_818
37 activation_820
38 activation_823
39 activation_824
40 mixed_5b
41 conv2d_828
42 batch_normalization_828
43 activation_828
44 conv2d_826
45 conv2d_829
46 batch_normalization_826
47 batch_normalization_829
48 activation_826
49 activation_829
50 conv2d_825
51 conv2d_827
52 conv2d_8

In [96]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
conv2d_813 (Conv2D)             (None, 149, 149, 32) 864         input_5[0][0]                    
__________________________________________________________________________________________________
batch_normalization_813 (BatchN (None, 149, 149, 32) 96          conv2d_813[0][0]                 
__________________________________________________________________________________________________
activation_813 (Activation)     (None, 149, 149, 32) 0           batch_normalization_813[0][0]    
__________________________________________________________________________________________________
conv2d_814

## 训练模型

In [97]:
train_image_gen = ImageDataGenerator(rotation_range=40,
                                     width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     rescale = 1./255,
                                     shear_range = 0.2,
                                     zoom_range = 0.2,
                                     horizontal_flip = True)
print("训练集图片:")
training_set = train_image_gen.flow_from_directory('train1',
                                                 target_size = target_size,
                                                 batch_size = batch_size,
                                                 color_mode="rgb",
                                                 class_mode = 'binary')

valid_image_gen = ImageDataGenerator(rotation_range=40,
                                     width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     rescale = 1./255,
                                     shear_range = 0.2,
                                     zoom_range = 0.2,
                                     horizontal_flip = True)
print("验证集图片:")
validation_set = valid_image_gen.flow_from_directory('valid',
                                                 target_size = target_size,
                                                 batch_size = batch_size,
                                                 color_mode="rgb",
                                                 class_mode = 'binary')


训练集图片:
Found 20000 images belonging to 2 classes.
验证集图片:
Found 5000 images belonging to 2 classes.


## 开始训练模型

In [98]:
# 训练集图片总量
train_size = 22000

# 每一轮训练需要执行的步长
steps_per_epoch = train_size/batch_size

# 验证集图片总量
valid_size = 3000
# 每一轮验证需要执行的步长
validation_steps = valid_size/batch_size

# 增加检查点，设置模型早停
callbacks = [EarlyStopping(monitor='val_loss', patience=10),
             ModelCheckpoint(filepath='final_model_weights.h5', monitor='val_loss', save_best_only=True)]

# 使用模型拟合图片数据
model.fit_generator(training_set, epochs = 10, steps_per_epoch = steps_per_epoch, callbacks=callbacks,
                    validation_data=validation_set, validation_steps = validation_steps)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2234734cf8>

## 开始预测

In [99]:
# 测试集图片从zip包解压缩之后的存放目录
test_set_folder = 'test'
test_folder ="test1"
test_inner_folder = "test1/test"
rmrf_mkdir(test_folder)
os.mkdir(test_inner_folder)

In [100]:
def move_test_images():
    test_filenames = os.listdir(test_set_folder)
    for filename in test_filenames:
        shutil.copy2( test_set_folder + '/' + filename, test_inner_folder)


In [101]:
move_test_images()
print("开始导入测试集图片:")
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(test_folder,
                                                  target_size=target_size, 
                                                  batch_size=batch_size,
                                                  shuffle = False, class_mode=None)

print("测试目录准备完成！")

开始导入测试集图片:
Found 12500 images belonging to 1 classes.
测试目录准备完成！


## 备份模型到`json`文件

In [102]:
from keras.models import model_from_json

# 保存模型结构
model_json = model.to_json()
with open("best_model.json", "w") as json_file:
    json_file.write(model_json)
    
print("模型已经保存到json文件！")

模型已经保存到json文件！


# 加载模型结构和权重

In [103]:
# 使用新的模型结构加载模型权重
json_file = open("best_model.json", "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
print("loaded_model重新加载模型权重！")
loaded_model.load_weights('final_model_weights.h5')
print("loaded_model重新加载模型权重完成！")

loaded_model重新加载模型权重！
loaded_model重新加载模型权重完成！


In [104]:
print("预测执行开始！")
pred_result = loaded_model.predict_generator(test_generator, verbose=1)
print("预测执行完毕！")

预测执行开始！
预测执行完毕！


In [105]:
test_generator.filenames[:10]

['test/1.jpg',
 'test/10.jpg',
 'test/100.jpg',
 'test/1000.jpg',
 'test/10000.jpg',
 'test/10001.jpg',
 'test/10002.jpg',
 'test/10003.jpg',
 'test/10004.jpg',
 'test/10005.jpg']

In [106]:
pred_result[:10]

array([[1.0000000e+00],
       [4.9252883e-08],
       [4.0796699e-10],
       [1.0000000e+00],
       [9.9999988e-01],
       [6.9182290e-12],
       [2.9570932e-09],
       [9.9999905e-01],
       [9.9589610e-01],
       [3.9996267e-13]], dtype=float32)

## 导出预测结果

In [107]:
import pandas as pd
from keras.preprocessing.image import *

solution = pd.read_csv("sample_submission.csv")

for i, fname in enumerate(test_generator.filenames):
        index = int(fname[fname.rfind('/')+1:fname.rfind('.')])
        solution.set_value(index-1, 'label', pred_result[i])

print("导出结果完成！")

导出结果完成！




In [108]:
solution.to_csv("pred-14.csv", index = False)
solution.head(10)

Unnamed: 0,id,label
0,1,1.0
1,2,1.0
2,3,0.9999994
3,4,0.9999448
4,5,2.559689e-08
5,6,6.249009e-10
6,7,1.601288e-11
7,8,1.626876e-08
8,9,8.064261e-12
9,10,4.925288e-08
