In [1]:
# Import the required packages
%matplotlib inline
#import utilities
import os
import shutil
import numpy as np
import random
from tqdm import tqdm  
from time import time
from PIL import Image
import h5py
import pandas as pd

from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
from keras.callbacks import *
from keras.optimizers import *
from keras.utils import *
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Build the symlinked directory layout consumed by flow_from_directory
def data_symbol_link():
    """Rebuild ``./data`` as a tree of symlinks, one sub-directory per class.

    Layout created under the current working directory::

        data/train/dog/   links to files in train/ named like ``dog.<...>``
        data/train/cat/   links to files in train/ named like ``cat.<...>``
        data/test/test/   links to every file in test/

    An existing ``data`` directory is removed first, so the function can be
    re-run safely. A training file is assigned to a class when ``dog`` or
    ``cat`` appears as one of the dot-separated parts of its name (``dog``
    wins if both appear). Files matching neither class are skipped instead
    of being silently labelled as cats.

    Returns:
        tuple: ``(split_train_dir, split_test_dir)``, the absolute paths of
        ``data/train`` and ``data/test``.

    Raises:
        OSError: if ``train/`` or ``test/`` is missing, or a directory or
            symlink cannot be created.
    """
    work_dir = os.getcwd()
    train_dir = os.path.join(work_dir, "train")
    test_dir = os.path.join(work_dir, "test")
    data_dir = os.path.join(work_dir, "data")

    # Start from a clean slate so stale links never survive a re-run.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    split_train_dir = os.path.join(data_dir, "train")
    split_test_dir = os.path.join(data_dir, "test")
    class_names = ("dog", "cat")  # order matters: "dog" is checked first
    class_dirs = {name: os.path.join(split_train_dir, name) for name in class_names}
    # Single dummy class folder holding every test image.
    unlabeled_dir = os.path.join(split_test_dir, "test")

    for directory in (*class_dirs.values(), unlabeled_dir):
        os.makedirs(directory)

    for file in tqdm(os.listdir(train_dir)):
        name_parts = file.split('.')
        label = next((name for name in class_names if name in name_parts), None)
        if label is None:
            # Not a labelled image (e.g. .DS_Store); linking it would add label noise.
            continue
        os.symlink(os.path.join(train_dir, file), os.path.join(class_dirs[label], file))

    for file in tqdm(os.listdir(test_dir)):
        os.symlink(os.path.join(test_dir, file), os.path.join(unlabeled_dir, file))

    return split_train_dir, split_test_dir

In [3]:
# Build the symlink tree once and keep the two root directories; the feature
# extraction and submission cells pass them to flow_from_directory.
train_data, test_data = data_symbol_link()

100%|██████████| 25000/25000 [00:00<00:00, 84883.43it/s]
100%|██████████| 12500/12500 [00:00<00:00, 92137.63it/s]


In [4]:
# Feature extraction for the fused (ensemble) model
def write_feature_data(MODEL, image_shape, train_data, test_data, batch_size, preprocess_input = None):
    """Run a pretrained backbone over the train/test images and cache the pooled features.

    The backbone is built without its classification head on ImageNet
    weights, followed by global average pooling. Its outputs for every
    image are written to ``feature_<backbone name>.h5`` with three
    datasets: ``train``, ``test`` and ``label`` (class indices of the
    training images, in generator order). The backbone weights are also
    saved to ``<backbone name>-imagenet.h5``.

    Args:
        MODEL: model constructor accepting ``input_tensor``, ``weights``
            and ``include_top`` keyword arguments.
        image_shape: ``(height, width)`` the images are resized to.
        train_data: directory with one sub-directory per class.
        test_data: directory with the test images in a sub-directory.
        batch_size: number of images per prediction batch.
        preprocess_input: optional function applied to the input tensor
            inside the model (wrapped in a ``Lambda`` layer).
    """
    input_tensor = Input((image_shape[0], image_shape[1], 3))
    x = input_tensor
    if preprocess_input:
        x = Lambda(preprocess_input)(x)

    base_model = MODEL(input_tensor=x, weights='imagenet', include_top=False)
    base_model.save_weights(f'{base_model.name}-imagenet.h5')

    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))

    # shuffle=False keeps feature rows aligned with generator.filenames / .classes.
    gen = ImageDataGenerator()
    train_generator = gen.flow_from_directory(train_data, image_shape, shuffle=False,
                                              batch_size=batch_size)
    test_generator = gen.flow_from_directory(test_data, image_shape, shuffle=False,
                                             batch_size=batch_size, class_mode=None)

    # `steps` counts batches, not samples: passing `samples` is only right for
    # batch_size == 1 and otherwise yields batch_size-times duplicated rows.
    train_steps = -(-train_generator.samples // batch_size)  # ceil division
    test_steps = -(-test_generator.samples // batch_size)
    train_feature = model.predict_generator(train_generator, train_steps, verbose=1)
    test_feature = model.predict_generator(test_generator, test_steps, verbose=1)

    # Explicit "w": the default mode differs between h5py versions (read-only in
    # h5py 3), and truncating lets the cell be re-run without dataset-name clashes.
    with h5py.File(f"feature_{base_model.name}.h5", "w") as h:
        h.create_dataset("train", data=train_feature)
        h.create_dataset("test", data=test_feature)
        h.create_dataset("label", data=train_generator.classes)

In [5]:
# Xception: cache pooled features for 299x299 inputs, with xception.preprocess_input
# applied inside the model. batch_size=1 means one image per prediction step.
write_feature_data(Xception, (299, 299), train_data, test_data, batch_size=1, preprocess_input=xception.preprocess_input)

Found 25000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


In [6]:
# DenseNet201: cache pooled features for 224x224 inputs, with densenet.preprocess_input
# applied inside the model. batch_size=1 means one image per prediction step.
write_feature_data(DenseNet201, (224, 224), train_data, test_data, batch_size=1, preprocess_input=densenet.preprocess_input)

Found 25000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


In [7]:
# InceptionV3: cache pooled features for 299x299 inputs, with inception_v3.preprocess_input
# applied inside the model. batch_size=1 means one image per prediction step.
write_feature_data(InceptionV3, (299, 299), train_data, test_data, batch_size=1, preprocess_input=inception_v3.preprocess_input)

Found 25000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


In [16]:
# Sanity check: number of rows in the train features, test features and labels.
# NOTE(review): within the visible notebook X_train, X_test and Y_train are first
# assigned in the cell below, so this cell cannot run before it on a fresh kernel.
len(X_train), len(X_test), len(Y_train)

(125000, 62500, 25000)

In [4]:
# Assemble the fused feature matrices from the cached per-backbone features
from sklearn.utils import shuffle
np.random.seed(42)

feature_files = ("feature_xception.h5", "feature_densenet201.h5", "feature_inception_v3.h5")
train_blocks, test_blocks = [], []

for feature_path in feature_files:
    with h5py.File(feature_path, 'r') as store:
        train_blocks.append(np.array(store['train']))
        test_blocks.append(np.array(store['test']))
        # Labels are re-read from every file; the last file's copy is kept.
        Y_train = np.array(store['label'])

# Put the backbones' feature vectors side by side: one row per image.
X_train = np.concatenate(train_blocks, axis=1)
X_test = np.concatenate(test_blocks, axis=1)

# Shuffle features and labels together (driven by the NumPy seed set above).
X_train, Y_train = shuffle(X_train, Y_train)

In [5]:
# Define the top-level classifier: dropout followed by a single sigmoid unit
input_tensor = Input(shape=X_train.shape[1:])
dropped_features = Dropout(0.5)(input_tensor)
x = Dense(1, activation='sigmoid')(dropped_features)
model = Model(inputs=input_tensor, outputs=x)

model.compile(
    loss='binary_crossentropy',
    optimizer='adadelta',
    metrics=['accuracy'],
)

In [6]:
# Draw the layer graph of the top-level classifier, annotated with tensor shapes.
plot_model(model,show_shapes=True)

In [7]:
# Train the top-level classifier and save its weights
filepath="merged_weight.h5"
# Keep only the weights of the epoch with the lowest validation loss.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train, Y_train, batch_size=128, epochs=20, validation_split=0.2, shuffle=True,
         callbacks=callbacks_list)
# After fit() the model holds the LAST epoch's weights. Restore the best
# checkpoint first, so "merged_weights.h5" (the file the prediction cell loads)
# contains the best weights rather than whatever the final epoch produced.
model.load_weights(filepath)
model.save_weights("merged_weights.h5")

Train on 20000 samples, validate on 5000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.01692, saving model to merged_weight.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.01692 to 0.01376, saving model to merged_weight.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.01376
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.01376
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.01376
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.01376
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.01376
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.01376
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.01376
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.01376
Epoch 11/20

Epoch 00011: val_loss did not improve from 0.01376
Epoch 12/20

Epoch 00012: val_loss did not improve from 0.01376
Epoch 13/20

Epoch 00013: val_loss did not improve from 0.01376
Epoch 14/20

Epoch 00014: val_loss did not improve from 

In [8]:
# Predict on the test set
model.load_weights('merged_weights.h5')
# Clip the probabilities into [0.005, 0.995].
y_test = np.clip(model.predict(X_test, verbose=1), 0.005, 0.995)



In [9]:
# Write the submission file from the clipped test predictions
df = pd.read_csv("sample_submission.csv")

# Only the generator's filename order is used here. The test features were
# extracted with shuffle=False and X_test is never shuffled, so row i of y_test
# is assumed to belong to test_generator.filenames[i].
gen = ImageDataGenerator()
test_generator = gen.flow_from_directory(test_data, (224, 224), shuffle=False, 
                                         batch_size=16, class_mode=None)

# "<subdir>/<id>.<ext>" -> integer id, using os.path so any path separator works.
image_ids = [int(os.path.splitext(os.path.basename(fname))[0])
             for fname in test_generator.filenames]
probabilities = np.ravel(y_test)  # (n, 1) predictions -> flat vector of scalars
if len(probabilities) != len(image_ids):
    raise ValueError(
        f"got {len(probabilities)} predictions for {len(image_ids)} test images"
    )

# Match predictions to rows by the `id` column instead of assuming row == id - 1;
# ids without a prediction keep the value from the sample file.
predicted = pd.Series(probabilities, index=image_ids)
df['label'] = df['id'].map(predicted).fillna(df['label'])

df.to_csv('submission_merged.csv', index=False)
df.head(10)

Found 12500 images belonging to 1 classes.


  if __name__ == '__main__':


Unnamed: 0,id,label
0,1,0.995
1,2,0.995
2,3,0.995
3,4,0.995
4,5,0.005
5,6,0.005
6,7,0.005
7,8,0.005
8,9,0.005
9,10,0.005
