## keras_merge_3_app
整体说明：
- 1、使用keras的Xception、ResNet50、InceptionResNetV2预训练模型分别提取特征向量
- 2、整合3个模型的特征向量
- 3、构建一个简单模型，进行训练、预测



In [None]:
'''
数据目录结构：
data/
    train/    #原始数据，train.zip解压后生成
        dog.0.jpg
        cat.0.jpg
        ...
    train2/   #按标签分目录后的数据（连接文件）
        dog/
            dog.0.jpg
            dog.1.jpg
            ...
        cat/
            cat.0.jpg
            cat.1.jpg
            ...
    train3/   #去除异常图片后的训练数据（连接文件）
        dog/    #9983张图片
            dog.0.jpg
            dog.1.jpg
            ...
        cat/    #9961张图片
            cat.0.jpg
            cat.1.jpg
            ...
    validation/  #去除异常图片后的验证数据（连接文件）
        dog/   #2496张图片
            dog001.jpg
            dog002.jpg
            ...
        cat/   #2490张图片
            cat001.jpg
            cat002.jpg
            ...
    test/     
        test/  #测试集数据，12500张图片
            1.jpg
            2.jpg
            ...
'''
import cv2
import time
import pandas as pd
from tqdm import tqdm   #进度条
from PIL import Image
from helper import *

from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
from keras.callbacks import *
from keras.optimizers import *
from keras.utils import *
from sklearn.utils import shuffle


# Input locations: per-label training directories and the test images.
train_data_dir, test_data_dir = 'data/train2', 'data/test'


# Hyper-parameters shared by the cells below.
batch_size = 72   # 19944/72 = 277, 4986/72 = 69.25
epochs = 20

# Version tag embedded in the names of the output files.
VER = 1

# Model weights file
model_h5file_base = "Merge-tuning-v%s.h5" % (VER,)

# Prediction results file
pred_file_base = "pred-Merge-tuning-v%s.csv" % (VER,)


In [None]:
#Save feature vectors
def write_feature_data(MODEL, image_shape, weights_file, batch_size, preprocess_input = None):
    """Extract globally-average-pooled features with MODEL and save them to HDF5.

    The images under ``train_data_dir`` and ``test_data_dir`` are run through
    the headless network (``include_top=False``) followed by
    ``GlobalAveragePooling2D``.  The result is written to
    ``feature_<base_model.name>.h5`` with three datasets: ``train``, ``test``
    and ``label`` (the class indices reported by the training generator).

    Args:
        MODEL: Keras application constructor (e.g. ``ResNet50``).
        image_shape: ``(height, width)`` the images are resized to.
        weights_file: Path of the weights file loaded into the base model.
        batch_size: Number of images per batch fed to the network.
        preprocess_input: Optional preprocessing function; when given it is
            applied to the input tensor through a ``Lambda`` layer.
    """
    input_tensor = Input((image_shape[0], image_shape[1], 3))
    x = input_tensor
    if preprocess_input:
        x = Lambda(preprocess_input)(x)

    # weights must be the None object (random init); the string 'None' is
    # rejected by Keras.  The real weights are loaded right below.
    base_model = MODEL(input_tensor=x, weights=None, include_top=False)
    base_model.load_weights(weights_file)  # use our own fine-tuned weights

    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))

    # shuffle=False keeps the feature rows in the same order as
    # train_generator.classes / the test file listing.
    gen = ImageDataGenerator()
    train_generator = gen.flow_from_directory(train_data_dir, image_shape, shuffle=False,
                                              batch_size=batch_size)
    test_generator = gen.flow_from_directory(test_data_dir, image_shape, shuffle=False,
                                             batch_size=batch_size, class_mode=None)
    print(train_generator.samples)
    print(test_generator.samples)

    # predict_generator's second argument counts batches, not samples.
    # Passing the sample count would loop over the data many times and yield
    # more feature rows than labels, so use ceil(samples / batch_size).
    train_steps = (train_generator.samples + batch_size - 1) // batch_size
    test_steps = (test_generator.samples + batch_size - 1) // batch_size
    train_feature = model.predict_generator(train_generator, train_steps, verbose=1)
    test_feature = model.predict_generator(test_generator, test_steps, verbose=1)

    # Open explicitly for writing: the default mode is read-only in h5py 3,
    # and "w" also lets the cell be re-run without "dataset exists" errors.
    with h5py.File("feature_%s.h5"%base_model.name, "w") as h:
        h.create_dataset("train", data=train_feature)
        h.create_dataset("test", data=test_feature)
        h.create_dataset("label", data=train_generator.classes)
        
#Extract the feature vectors using, for each of the three models, the fine-tuned weights file that gave the best predictions.
#preprocess_input is passed by keyword: a positional argument may not follow the keyword argument batch_size=... (SyntaxError).
write_feature_data(ResNet50, (224, 224), "ResNet50-fine-tuning-3-v1.h5", batch_size=batch_size, preprocess_input=resnet50.preprocess_input)
write_feature_data(Xception, (299, 299), "xception-fine-tuning-1.h5", batch_size=batch_size, preprocess_input=xception.preprocess_input)
write_feature_data(InceptionResNetV2, (299, 299), "InceptonResNetV2-fine-tuning-2-v2.h5", batch_size=batch_size, preprocess_input=inception_resnet_v2.preprocess_input)


In [None]:

#Read the feature vectors and labels back from the feature files

# Seed NumPy's global RNG; sklearn's shuffle falls back to it when no
# random_state is given, which makes the shuffle below reproducible.
np.random.seed(2018)

X_train = []
X_test = []

# write_feature_data names each file "feature_<base_model.name>.h5", and Keras
# names these application models 'resnet50', 'xception' and
# 'inception_resnet_v2', so those are the files that exist on disk.
for filename in ["feature_resnet50.h5", "feature_xception.h5", "feature_inception_resnet_v2.h5"]:
    with h5py.File(filename, 'r') as h:
        X_train.append(np.array(h['train']))
        X_test.append(np.array(h['test']))
        # Labels come from the same unshuffled directory listing in every
        # file, so keeping the last copy read is sufficient.
        y_train = np.array(h['label'])

# Put the three models' feature vectors side by side for each image.
X_train = np.concatenate(X_train, axis=1)
X_test = np.concatenate(X_test, axis=1)

# Shuffle features and labels together so validation_split does not cut
# along the class-ordered directory listing.
X_train, y_train = shuffle(X_train, y_train)


In [None]:
#Build the model: dropout followed by a single L2-regularised sigmoid unit on the merged features
# time.clock() was removed in Python 3.8; perf_counter() measures elapsed time.
start = time.perf_counter()

input_tensor = Input(X_train.shape[1:])

x = Dropout(0.5)(input_tensor)
x = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001))(x)
model = Model(input_tensor, x)

# Alternative optimizer configuration, kept for reference:
#adam = optimizers.Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
#model.compile(optimizer=adam,
model.compile(optimizer='adadelta',
             loss='binary_crossentropy',
             metrics=['accuracy'])

print("Load base model used time:", (time.perf_counter() - start))


In [None]:
# Print a summary of the model's layers and parameter counts.
model.summary()

In [None]:
#Train the model, saving only the weights that achieve the lowest validation loss
checkpoint = ModelCheckpoint(model_h5file_base, monitor='val_loss', verbose=1, save_best_only=True, mode='min',save_weights_only=True)
#Stop training once val_loss has failed to improve for 3 consecutive epochs
stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')
callbacks_list = [stopping,checkpoint]

#The label array loaded with the features is named y_train (lower-case); Y_train is undefined and raised NameError.
#validation_split=0.2 holds out the last 20% of the (already shuffled) samples for validation.
history=model.fit(X_train, y_train, batch_size=128, epochs=epochs, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

In [None]:
#Visualize the learning curves from the training history
#NOTE(review): show_learning_curve is not defined in this file; presumably it comes from `helper` - confirm.
show_learning_curve(history)

In [None]:
#Predict on the test-set features
#NOTE(review): predict_on_model is not defined in this file (presumably from `helper`); it looks like it
#loads the weights in model_h5file_base and writes predictions to pred_file_base - confirm against the helper.
predict_on_model(X_test, model, model_h5file_base, pred_file_base)