# 机器学习纳米学位
## 猫狗大战 Dog vs Cat
### 一、 分析数据，准备数据
首先从[kaggle](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data)上下载好数据包:test.zip和train.zip，对包进行解压缩与数据预处理。
#### 1、解压缩文件
注意：请将 zip 文件放到与本 IPython notebook 相同的目录下：

In [None]:
import zipfile
import os
def un_zip(file_name):
    """Extract ``<file_name>.zip`` into the current working directory.

    Args:
        file_name: Archive path without the ``.zip`` suffix. It doubles as the
            name of the directory whose presence means "already extracted".

    Returns:
        None. When a directory called ``file_name`` already exists the archive
        is not opened at all.

    Raises:
        FileNotFoundError: If ``<file_name>.zip`` does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    if os.path.isdir(file_name):
        return
    # The context manager closes the archive even if extraction fails midway.
    with zipfile.ZipFile(file_name + ".zip") as zip_file:
        zip_file.extractall()
# Base names of the two Kaggle archives; each is looked up as "<name>.zip"
# relative to the current working directory.
train_dir, test_dir = 'train', 'test'
for archive_name in (test_dir, train_dir):
    un_zip(archive_name)

观察解压出来的数据，可以看到文件的命名格式：例如训练集中 cat.0.jpg 表示猫分类的第一张图，同理 dog.0.jpg 表示狗分类的第一张图，编号从 0 一直持续到 12499，猫和狗各 12500 张；测试集与之类似，只是没有猫狗的标记，图像仅以数字编号命名，一共有 12500 张测试图片。
#### 2、 建立分类目录

我们为数据文件建立符号链接（symbolic link），并划分为训练集、验证集和测试集，为模型训练和特征提取打好基础。

In [None]:
import shutil
import os
from tqdm import tqdm

# Source folders produced by unzipping, expressed as absolute paths with a
# trailing slash so file names can be appended directly.
work_dir = os.getcwd()
train_dir = work_dir + '/train/'
test_dir = work_dir + '/test/'

# Root of the symlink tree; a tree left over from an earlier run is discarded.
imglink_dir = work_dir + "/img_link/"
if os.path.exists(imglink_dir):
    shutil.rmtree(imglink_dir)

# Full training set used for feature extraction.
img_train2 = imglink_dir + "train2/"
# Test set, shared by model training and feature extraction.
img_test = imglink_dir + "test/"
# Train / validation split used for model training.
img_train = imglink_dir + "train/"
img_valid = imglink_dir + "valid/"
img_train_cat = img_train + "cat/"
img_train_dog = img_train + "dog/"
img_valid_cat = img_valid + "cat/"
img_valid_dog = img_valid + "dog/"

# Creating each leaf also creates its missing parents (root, train2/, test/,
# train/, valid/).
for leaf_dir in (
    img_train2 + "cat/",
    img_train2 + "dog/",
    imglink_dir + "test/mixed/",
    img_train_cat,
    img_train_dog,
    img_valid_cat,
    img_valid_dog,
):
    os.makedirs(leaf_dir)

In [None]:
# Symlink every entry of the raw test folder into img_link/test/mixed/.
test_filenames = os.listdir(test_dir)
num_test = len(test_filenames)

mixed_dir = img_test + "mixed/"
for test_name in tqdm(test_filenames):
    os.symlink(test_dir + test_name, mixed_dir + test_name)

#### 3、 数据研究
先来展示下咱们的训练数据

In [None]:
import matplotlib.pyplot as plt
# Draw the first eight entries returned by os.listdir(train_dir) on a 2x4
# grid; each title is the part of the file name before the first dot.
plt.figure(figsize=(10, 8))
preview_names = os.listdir(train_dir)[:8]
for slot, name in enumerate(preview_names, start=1):
    plt.subplot(2, 4, slot)
    plt.title(name.split('.')[0])
    plt.axis('on')
    plt.imshow(plt.imread(os.path.join(train_dir, name)))
plt.show()

从上面图示的结果来看，每张图片的分辨率并不像我们预想的那样都一样大。那么，让我们获取每张图片的分辨率以及通道数，来对分辨率进行统计分析。

In [None]:
import os
import cv2
from tqdm import tqdm

# Per-image size statistics for the training folder:
#   train_info[0] collects shape[0] (rows, i.e. image height),
#   train_info[1] collects shape[1] (columns, i.e. image width).
# `outside` collects the paths of images whose shape[0] exceeds 600.
train_info = [[], []]
outside = []
train_img_list = [name for name in os.listdir(train_dir) if name.endswith('jpg')]
for img in tqdm(train_img_list):
    rows, cols = cv2.imread(os.path.join(train_dir, img)).shape[:2]
    train_info[0].append(rows)
    train_info[1].append(cols)
    if rows > 600:
        outside.append(train_dir + img)

In [None]:
# Show the paths collected in `outside` (images whose shape[0] exceeded 600).
print(outside)

图示化分析图片的分辨率

In [None]:
# train_info[0] was filled from shape[0] (rows = height) and train_info[1]
# from shape[1] (columns = width), so width goes on the x axis and height on
# the y axis to agree with the axis labels.
plt.scatter(train_info[1], train_info[0])
plt.xlabel('Width')
plt.ylabel('Height')
plt.xlim(0, max(train_info[1]) + 10)
plt.ylim(0, max(train_info[0]) + 10)
plt.show()

从图表中可以看出图片的宽度和高度基本都遍布在100-500之间。

删除那两张超出平均分辨率很多的图片

In [None]:
# Permanently delete the outlier images whose paths were collected in `outside`.
# Not re-runnable as is: os.remove raises FileNotFoundError for a path that is
# already gone.
for img in outside:
    os.remove(img)

#### 4、数据预处理
为了尽量利用我们有限的训练数据，我们将通过一系列随机变换对数据进行增强，这样我们的模型将看不到任何两张完全相同的图片，这有利于抑制过拟合，使模型的泛化能力更好。

这个步骤可以通过keras.preprocessing.image.ImageDataGenerator来实现，下面我们来展示这个预处理函数处理后的结果。

In [None]:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import matplotlib.pyplot as plt
import os

# Augmentation settings used only for this visual preview.
temp = ImageDataGenerator(
    rotation_range=10,
    zoom_range=0.2,
    width_shift_range=0.05,
    height_shift_range=0.1,
    channel_shift_range=10,
    horizontal_flip=True,
)

# Load one training image and prepend a batch axis of size 1 before handing
# it to flow().
img = load_img(train_dir + '/cat.0.jpg')
x = img_to_array(img)
x = x.reshape((1, *x.shape))

# Start from an empty "preview" folder on every run.
if os.path.exists("preview"):
    shutil.rmtree("preview")
os.mkdir("preview")

# Each batch drawn from the iterator is written to "preview"; stop after the
# eighth one.
augmented = temp.flow(
    x,
    batch_size=1,
    save_to_dir='preview',
    save_prefix='cat',
    save_format='jpeg',
)
for saved, _ in enumerate(augmented, start=1):
    if saved >= 8:
        break

# Show whatever was written to "preview" on a 2x4 grid.
plt.figure(figsize=(10, 8))
for slot, preview_name in enumerate(os.listdir("preview"), start=1):
    plt.subplot(2, 4, slot)
    plt.axis('on')
    plt.imshow(plt.imread(os.path.join("preview", preview_name)))
plt.show()

接着要进行我们的数据清洗工作，因为我们的图片保不齐有一些图并不是我们的猫狗类别的图像，这样对我们的算法上可能会造成一些影响，所以要对不是猫狗的图进行一下清洗。清洗的方法参考[这个链接](https://zhuanlan.zhihu.com/p/34068451)，使用imagenet上top几来进行分类。我还找到了一个[马桶的例子](https://blog.csdn.net/lauyeed/article/details/78886830)，感觉也很实用。

首先把前辈整理好的猫狗种类偷下来整理成列表。

In [None]:
# ImageNet class IDs (WordNet synset IDs such as 'n02085620') that are treated
# as "dog" or "cat". find_dirty_img intersects this list with the IDs returned
# by decode_predictions to decide whether an image looks like a cat or a dog.
# NOTE(review): per the surrounding text the list was copied from an external
# write-up; its completeness has not been verified here.
dogs_cats = [
 'n02085620','n02085782','n02085936','n02086079'
,'n02086240','n02086646','n02086910','n02087046'
,'n02087394','n02088094','n02088238','n02088364'
,'n02088466','n02088632','n02089078','n02089867'
,'n02089973','n02090379','n02090622','n02090721'
,'n02091032','n02091134','n02091244','n02091467'
,'n02091635','n02091831','n02092002','n02092339'
,'n02093256','n02093428','n02093647','n02093754'
,'n02093859','n02093991','n02094114','n02094258'
,'n02094433','n02095314','n02095570','n02095889'
,'n02096051','n02096177','n02096294','n02096437'
,'n02096585','n02097047','n02097130','n02097209'
,'n02097298','n02097474','n02097658','n02098105'
,'n02098286','n02098413','n02099267','n02099429'
,'n02099601','n02099712','n02099849','n02100236'
,'n02100583','n02100735','n02100877','n02101006'
,'n02101388','n02101556','n02102040','n02102177'
,'n02102318','n02102480','n02102973','n02104029'
,'n02104365','n02105056','n02105162','n02105251'
,'n02105412','n02105505','n02105641','n02105855'
,'n02106030','n02106166','n02106382','n02106550'
,'n02106662','n02107142','n02107312','n02107574'
,'n02107683','n02107908','n02108000','n02108089'
,'n02108422','n02108551','n02108915','n02109047'
,'n02109525','n02109961','n02110063','n02110185'
,'n02110341','n02110627','n02110806','n02110958'
,'n02111129','n02111277','n02111500','n02111889'
,'n02112018','n02112137','n02112350','n02112706'
,'n02113023','n02113186','n02113624','n02113712'
,'n02113799','n02113978','n02123045','n02123159'
,'n02123394','n02123597','n02124075','n02125311'
,'n02127052']

查找，显示，并删除异常数据。

In [None]:
from keras.applications.resnet50 import preprocess_input, decode_predictions 
import cv2
import matplotlib.pyplot as plt
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50
import numpy as np
def find_dirty_img(train_dir, dogs_cats, top=10):
    """Find images that an ImageNet ResNet50 does not recognise as cat or dog.

    Every entry of ``train_dir`` is resized to 224x224 and classified. An image
    is "dirty" when none of its ``top`` highest-ranked class IDs appears in
    ``dogs_cats``. Each dirty image is displayed with matplotlib as it is found.

    Args:
        train_dir: Directory holding the images to inspect.
        dogs_cats: Iterable of ImageNet class IDs accepted as cat or dog.
        top: Number of highest-ranked predictions checked per image.

    Returns:
        List of paths (``train_dir`` joined with the file name) of the dirty
        images, in ``os.listdir`` order.
    """
    dirty_imgs = []
    # Built once; the original rebuilt this set for every image.
    accepted_ids = set(dogs_cats)
    model = ResNet50(weights='imagenet')
    for fname in os.listdir(train_dir):
        # os.path.join also works when train_dir has no trailing separator.
        path = os.path.join(train_dir, fname)
        img = image.load_img(path, target_size=(224, 224))
        x = np.expand_dims(image.img_to_array(img), axis=0)
        x = preprocess_input(x)
        preds = decode_predictions(model.predict(x), top=top)[0]
        # The first field of each decoded prediction is the class ID.
        predicted_ids = {pred[0] for pred in preds}
        if predicted_ids & accepted_ids:
            continue
        dirty_imgs.append(path)
        # cv2.imread yields BGR channel order while plt.imshow expects RGB;
        # convert so the preview is not colour-swapped.
        plt.imshow(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB))
        plt.axis('on')
        plt.title(fname)
        plt.show()
    return dirty_imgs

In [None]:
# Scan the training folder, checking the 35 highest-ranked predictions per
# image; the result is the list of paths flagged as neither cat nor dog.
dirty_imgs = find_dirty_img(train_dir, dogs_cats, top = 35)

In [None]:
# Print the flagged paths followed by how many there are.
print(dirty_imgs, len(dirty_imgs))

可以看到共筛选出63个异常值，我们现在将他们从文件中删除。

In [None]:
# Permanently delete every flagged image. Not re-runnable as is: os.remove
# raises FileNotFoundError for a path that is already gone.
for img in dirty_imgs:
    os.remove(img)

删除后的文件总数有24935个。

In [None]:
# Count the entries left in the training folder; as the last expression of the
# cell, the length is what the notebook displays.
train_list = os.listdir(train_dir)
len(train_list)

在剔除脏文件后，我们对train的文件进行链接，用来做后面的算法实施。

In [None]:
# Symlink the cleaned training images into img_link/train2/<class>/, grouping
# them by the "cat"/"dog" prefix of the file name.
train_filenames = os.listdir(train_dir)
train_cat = [name for name in train_filenames if name.startswith('cat')]
train_dog = [name for name in train_filenames if name.startswith('dog')]

for cat_name in tqdm(train_cat):
    os.symlink(train_dir + cat_name, img_train2 + "cat/" + cat_name)
for dog_name in tqdm(train_dog):
    os.symlink(train_dir + dog_name, img_train2 + "dog/" + dog_name)

In [None]:
# Split each class into validation (first 20% of the listing) and training
# (the rest) by symlinking the files into img_link/valid/ and img_link/train/.
# The threshold is computed from the length of the class being linked; the
# original dog loop used len(train_cat), which skews the dog split whenever
# the two classes have different sizes (e.g. after the cleaning step).
valid_ratio = 0.2
for class_files, valid_dst, train_dst in (
    (train_cat, img_valid_cat, img_train_cat),
    (train_dog, img_valid_dog, img_train_dog),
):
    valid_limit = len(class_files) * valid_ratio
    for i, fname in enumerate(tqdm(class_files)):
        dst_dir = valid_dst if i < valid_limit else train_dst
        os.symlink(train_dir + fname, dst_dir + fname)

### 二、算法实施
我首先实现一个类，用于对一些基础模型进行迁移学习和 fine-tune 操作，为后面的模型训练和特征提取做准备。

In [None]:
import numpy as np
from keras.layers import *
from keras.preprocessing.image import *
from keras.models import *
from keras.callbacks import *
import matplotlib.pyplot as plt
import pandas as pd

class CNN(object):
    """Transfer-learning helper around a Keras application model (binary output).

    The constructor builds directory generators for the train, validation and
    test folders, instantiates ``model`` with ImageNet weights and no top,
    freezes every base layer and attaches
    GlobalAveragePooling2D -> Dropout(0.5) -> Dense(1, sigmoid).

    Methods: ``fit`` trains with the base frozen, ``fit_finetune`` retrains the
    last N layers, ``predict`` writes a submission CSV and ``write_feature``
    dumps pooled features to an HDF5 file.
    """

    # Seeds NumPy's global RNG once, when the class body is executed.
    np.random.seed(23)

    def __init__(self, model, train_dir, val_dir, test_dir, train_full_dir,
                 epochs, patience, batch_size=32,
                 img_sz=(224, 224), preprocess_func=None):
        """Build the data generators and the frozen base model with a new head.

        Args:
            model: Callable building the base network; it is called with
                ``input_tensor``, ``weights='imagenet'`` and
                ``include_top=False``.
            train_dir: Directory (one sub-folder per class) used for training.
            val_dir: Directory (one sub-folder per class) used for validation.
            test_dir: Directory of unlabeled test images.
            train_full_dir: Directory with the full training set, read by
                ``write_feature``.
            epochs: Default number of epochs for ``fit`` / ``fit_finetune``.
            patience: Stored on the instance.
                NOTE(review): no method of this class reads it.
            batch_size: Batch size of the train / validation / test generators.
            img_sz: ``(height, width)`` images are resized to.
            preprocess_func: Optional per-image preprocessing function applied
                by every generator.
        """
        self.train_dir = train_dir
        self.train_full_dir = train_full_dir
        self.val_dir = val_dir
        self.test_dir = test_dir
        self.epochs = epochs
        self.patience = patience
        self.img_sz = img_sz
        # The attribute must always exist: previously it was only set for a
        # truthy preprocess_func and the generator construction below raised
        # AttributeError otherwise. An attribute already provided elsewhere
        # (e.g. by a subclass) is left untouched.
        if preprocess_func is not None:
            self.preprocess_input = preprocess_func
        elif not hasattr(self, 'preprocess_input'):
            self.preprocess_input = None

        train_generator = ImageDataGenerator(
            preprocessing_function=self.preprocess_input,
            rotation_range=10,
            width_shift_range=0.05,
            height_shift_range=0.05,
            shear_range=0.1,
            zoom_range=0.1,
            horizontal_flip=True)
        self.train_ge = train_generator.flow_from_directory(
            self.train_dir,
            target_size=self.img_sz,
            batch_size=batch_size,
            shuffle=True,
            class_mode='binary')

        # Validation and test images get the same preprocessing as training
        # images (but no augmentation); without it the network is evaluated on
        # inputs scaled differently from the ones it was trained on.
        val_generator = ImageDataGenerator(
            preprocessing_function=self.preprocess_input)
        self.val_ge = val_generator.flow_from_directory(
            self.val_dir,
            target_size=self.img_sz,
            batch_size=batch_size,
            shuffle=False,
            class_mode='binary')

        self.test_ge = val_generator.flow_from_directory(
            self.test_dir,
            target_size=self.img_sz,
            batch_size=batch_size,
            shuffle=False,
            class_mode=None)

        input_tensor = Input((img_sz[0], img_sz[1], 3))
        self.base_model = model(input_tensor=input_tensor, weights='imagenet', include_top=False)
        self.model_name = self.base_model.name
        # Freeze the whole base network; only the new head is trained by fit().
        for layer in self.base_model.layers:
            layer.trainable = False

        x = self.base_model.output
        x = GlobalAveragePooling2D()(x)
        x = Dropout(0.5)(x)

        output = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001))(x)

        self.model = Model(inputs=self.base_model.input, outputs=output)
        self.model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

    def fit(self):
        """Train for ``self.epochs`` epochs with the base model frozen.

        The weights with the lowest ``val_loss`` are saved to
        ``<model_name>-freeze.hdf5``. Step counts are fixed (650 training and
        150 validation steps per epoch), not derived from the generators.
        """
        check_point = ModelCheckpoint(
            self.model_name + '-freeze.hdf5',
            monitor='val_loss',
            mode='min',
            verbose=1,
            save_best_only=True,
            save_weights_only=True)

        self.model.fit_generator(
            self.train_ge,
            steps_per_epoch=650,
            epochs=self.epochs,
            validation_data=self.val_ge,
            validation_steps=150,
            callbacks=[check_point])

    def fit_finetune(self, fine_tune_layer, epochs=None):
        """Unfreeze the last ``fine_tune_layer`` layers and continue training.

        Args:
            fine_tune_layer: Number of trailing layers of ``self.model`` made
                trainable; all other layers are frozen.
                NOTE(review): 0 selects ``layers[-0:]``, i.e. every layer.
            epochs: Number of epochs; ``None`` (the default) means
                ``self.epochs``. The original default ``self.epochs`` cannot be
                evaluated in a signature and raised NameError when the class
                was defined.

        The weights with the lowest ``val_loss`` are saved to
        ``<model_name>finetune-freeze.hdf5``.
        """
        if epochs is None:
            epochs = self.epochs
        for layer in self.model.layers:
            layer.trainable = False
        for layer in self.model.layers[-fine_tune_layer:]:
            layer.trainable = True
        # Recompile so the changed trainable flags take effect.
        self.model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])
        check_point = ModelCheckpoint(
            self.model_name + 'finetune-freeze.hdf5',
            monitor='val_loss',
            mode='min',
            verbose=1,
            save_best_only=True,
            save_weights_only=True)

        self.model.fit_generator(
            self.train_ge,
            steps_per_epoch=1250,
            epochs=epochs,
            validation_data=self.val_ge,
            validation_steps=150,
            callbacks=[check_point])

    def summary(self, bestmode = False):
        """Print the Keras summary of ``self.model`` (``bestmode`` is not used)."""
        self.model.summary()

    def predict(self, name):
        """Predict the test set and write ``pred_<name>.csv``.

        Predictions are clipped to [0.005, 0.995] and written into the
        ``label`` column of ``sample_submission.csv`` (read from the current
        directory) at row ``<numeric file name> - 1``.

        Args:
            name: Suffix used in the output file name.

        Returns:
            The filled-in ``pandas.DataFrame``.
        """
        # Local import: the import list of this cell does not include os.
        import os

        # `best_model` is never assigned inside this class; use it when the
        # caller attached one, otherwise fall back to the trained model.
        model = getattr(self, 'best_model', None)
        if model is None:
            model = self.model
        self.predict_ge = model.predict_generator(self.test_ge, verbose=1)
        self.predict_ge = self.predict_ge.clip(min=0.005, max=0.995)
        # One scalar per image regardless of a trailing singleton axis.
        probabilities = np.ravel(self.predict_ge)

        df = pd.read_csv("sample_submission.csv")
        for i, fname in enumerate(self.test_ge.filenames):
            # basename copes with either path separator in the generator's
            # relative file names.
            index = int(os.path.basename(fname).split('.')[0])
            # DataFrame.set_value has been removed from pandas; .at is the
            # scalar setter that replaces it.
            df.at[index - 1, 'label'] = probabilities[i]

        outfile = 'pred_' + name + '.csv'
        print('Saving test result on: ' + outfile)
        df.to_csv(outfile, index=False)
        print("save_predict in: " + outfile)
        return df

    def write_feature(self, name=None):
        """Dump pooled features of the full train set and the test set to HDF5.

        Features are the output of ``self.model.layers[-3]`` (the global
        average pooling layer added by the constructor). Training images pass
        through a randomly augmenting generator with ``shuffle=False``, so the
        ``label`` dataset (``train_gen.classes``) lines up with ``train``.

        Args:
            name: Suffix of the output file ``feature_<name>.h5``; ``None``
                (the default) means ``self.model_name``. The original default
                ``self.model_name`` cannot be evaluated in a signature and
                raised NameError when the class was defined.
        """
        if name is None:
            name = self.model_name
        model = Model(self.model.input, self.model.layers[-3].output)
        print('The output of model: ', model.output)

        print('Data augmentation')
        gen = ImageDataGenerator(
            preprocessing_function=self.preprocess_input,
            rotation_range = 10,
            zoom_range = 0.1,
            width_shift_range = 0.05,
            height_shift_range = 0.05,
            channel_shift_range=10,
            shear_range=5,
            horizontal_flip=True,
        )

        train_gen = gen.flow_from_directory(
            self.train_full_dir,
            target_size=self.img_sz,
            shuffle=False,
            batch_size=128,
            class_mode=None,
        )

        self.train_gen_full = train_gen
        test_gen = self.test_ge

        print('feature from train data ...' )
        train = model.predict_generator(train_gen, verbose=1)
        print('feature from test data ...' )
        test = model.predict_generator(test_gen, verbose=1)

        fn = "feature_%s.h5"%name
        print('Write feature to file: ' + fn)
        # NOTE(review): h5py is not imported in the visible cells; make sure it
        # is imported before this method is called.
        # Explicit 'w' mode: the file is (re)created on every call instead of
        # relying on the library's default open mode.
        with h5py.File(fn, 'w') as h:
            h.create_dataset("train", data=train)
            h.create_dataset("label", data=train_gen.classes)
            h.create_dataset("test", data=test)

### 单模型fine-tune 并提取特征向量
### Xception
全冻结模型进行训练

基于xception进行finetune，finetune最后28层。