# Read me

1.运行前请修改路径

[1] './data/whale/train.csv' 是train.csv的路径

[2] original_image_path = "./data/whale/train_full/" 是原始的train image路径

[3] image_augment_path = './data/whale/siamese_augment/train_aug' 是增强后的train image存放路径

[4] train_csv_path = './data/whale/siamese_augment/train_aug.csv' 是增强后train image中用来作为模型训练的部分数据存放路径

[5] test_csv_path = './data/whale/siamese_augment/test_aug.csv' 是增强后train image中用来作为模型validation的部分数据存放路径

[6] file_path = os.path.join('./data/whale/siamese_augment', file_path) 用来存放模型的权重

[7] test_files = glob.glob("./data/whale/test/*.jpg") 是原始的test image的存放路径

[8] sub_csv_path = os.path.join('./data/whale/siamese_augment', sub_csv_path) 是最后的预测结果存放路径


2.可调参数

[1] batch_size = 8：可根据GPU显存大小适当增大

[2] num_epochs = 3：建议增加到200以上；代码中已使用 EarlyStopping 和 ModelCheckpoint（只保存 val_loss 最优的模型），val_loss 连续5个epoch不再下降时会提前终止训练


# data augment

In [1]:
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from keras import backend as K
from keras.models import Model
from keras.layers import Embedding, Flatten, Input, merge
from keras.optimizers import Adam
from keras.layers import Conv2D, MaxPooling2D, Input, Dense, Flatten, GlobalMaxPooling2D
import glob
import os
from PIL import Image
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization, \
    GlobalMaxPool2D, Concatenate, GlobalMaxPooling2D, GlobalAveragePooling2D, Lambda
from keras.applications.resnet50 import ResNet50
from sklearn.neighbors import NearestNeighbors  
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# split original data

In [2]:
# Fixed seed so the train/validation split is reproducible.
seed = 1337

# Load the image -> whale-Id table and hold out 30% of the rows for validation.
data = pd.read_csv('./data/whale/train.csv')
train, test = train_test_split(data, test_size=0.3, random_state=seed, shuffle=True)

# Map every image file name to its whale Id, separately for each split.
file_id_mapping_train = dict(zip(train.Image.values, train.Id.values))
file_id_mapping_test = dict(zip(test.Image.values, test.Id.values))

# data augmentation

In [3]:
# Location of the original training images, of the augmented copies, and of
# the CSV indexes describing the augmented train / validation subsets.
original_image_path = "./data/whale/train_full/"
image_augment_path = './data/whale/siamese_augment/train_aug'
train_csv_path = './data/whale/siamese_augment/train_aug.csv'
test_csv_path = './data/whale/siamese_augment/test_aug.csv'


# Target image size and channel count used throughout the notebook.
RESIZE_WIDTH = 256
RESIZE_HEIGHT = 256
CHANNEL = 1
input_shape = (RESIZE_WIDTH, RESIZE_HEIGHT)
# Number of images written per source image by data_augment(): the resized
# original plus (argument_factor - 1) random transforms.
argument_factor = 6

In [18]:
# Mild geometric jitter (rotation, shifts, shear, zoom); flipping is disabled.
datagen_args = {
    'rotation_range': 10,
    'width_shift_range': 0.15,
    'height_shift_range': 0.15,
    'shear_range': 0.15,
    'zoom_range': 0.15,
    'horizontal_flip': False,
}

datagen = ImageDataGenerator(**datagen_args)
# Seed NumPy's global random state with the same seed used for the split.
np.random.seed(seed)

def read_and_resize(filepath):
    """Load an image as grayscale and resize it to ``input_shape``.

    The image is converted to PIL mode ``'L'`` (single channel), resized, and
    returned as a 2-D ``uint8`` array. No normalisation is applied here.
    """
    grayscale = Image.open(filepath).convert('L')
    resized = grayscale.resize(input_shape)
    return np.array(resized, dtype="uint8")

def data_augment(file_id_mapping, save_image_path=None, save_csv_path=None):
    """Write resized and randomly transformed copies of each image plus a CSV index.

    For every image in ``file_id_mapping`` the resized grayscale original is
    saved as ``<prefix>_0.jpg``, followed by ``argument_factor - 1`` random
    transforms drawn from ``datagen`` saved as ``<prefix>_1.jpg``,
    ``<prefix>_2.jpg`` and so on. All files are stored as RGB JPEGs. Each
    written file inherits the Id of its source image.

    Args:
        file_id_mapping: dict mapping image file name -> whale Id. File names
            are resolved relative to ``original_image_path``.
        save_image_path: directory that receives the JPEG files.
        save_csv_path: destination of the CSV with columns ``Image`` and ``Id``
            listing every written file.
    """
    image_names = []
    id_names = []

    def _save_variant(pixels, out_name, id_name):
        # Record the file in the index and write it to disk as an RGB JPEG.
        image_names.append(out_name)
        id_names.append(id_name)
        Image.fromarray(pixels).convert('RGB').save(os.path.join(save_image_path, out_name))

    # .items() replaces the Python-2-only .iteritems(); it behaves the same
    # here on both Python 2 and Python 3.
    for image_name, id_name in file_id_mapping.items():
        image_prefix = image_name.split('.')[0]
        image_resized = read_and_resize(os.path.join(original_image_path, image_name))

        # Variant 0 is always the un-augmented resized image.
        _save_variant(image_resized, image_prefix + '_0.jpg', id_name)

        for j in range(1, argument_factor):
            augmented = datagen.random_transform(image_resized)
            _save_variant(augmented, image_prefix + '_' + str(j) + '.jpg', id_name)

    file_id_df = pd.DataFrame(data={'Image': image_names, 'Id': id_names})
    file_id_df.to_csv(save_csv_path, index=False)

In [19]:
# Augment the train split first, then the held-out split; both write their
# images into the same directory but produce separate CSV indexes.
for mapping, csv_path in ((file_id_mapping_train, train_csv_path),
                          (file_id_mapping_test, test_csv_path)):
    data_augment(mapping, save_image_path=image_augment_path, save_csv_path=csv_path)

# define batch generator & model

In [28]:
class sample_gen(object):
    """Draws (positive, negative, positive) file-name triplets for triplet training.

    Files whose class equals ``other_class`` are kept apart: they are never
    chosen as positives (their class gets weight 0) but can still be drawn as
    the negative example.

    Args:
        file_class_mapping: dict mapping file name -> class Id.
        other_class: label that is excluded from positive sampling.
    """

    def __init__(self, file_class_mapping, other_class = "new_whale"):
        self.file_class_mapping = file_class_mapping
        self.class_to_list_files = defaultdict(list)
        self.list_other_class = []
        self.list_all_files = list(file_class_mapping.keys())
        self.range_all_files = list(range(len(self.list_all_files)))

        for file_name, class_ in file_class_mapping.items():
            if class_ == other_class:
                self.list_other_class.append(file_name)
            else:
                self.class_to_list_files[class_].append(file_name)

        # The mapping's values repeat one Id per file; set() keeps each Id once.
        self.list_classes = list(set(self.file_class_mapping.values()))
        self.range_list_classes = range(len(self.list_classes))
        # Sampling probability of each class, proportional to its file count.
        self.class_weight = np.array(
            [len(self.class_to_list_files[class_]) for class_ in self.list_classes]) * 1.0
        self.class_weight /= self.class_weight.sum()
        # Formatted so the output is identical on Python 2 and Python 3
        # (the original used the Python-2-only print statement).
        print("sum= %s" % self.class_weight.sum())

    def get_sample(self):
        """Return one triplet ``(positive_1, negative, positive_2)`` of file names.

        The two positives are drawn with replacement from one class, so they
        can be the same file (always so for a class with a single file). The
        negative is any file whose class differs from that of ``positive_1``.
        """
        # Pick a class according to the class weights.
        class_idx = np.random.choice(self.range_list_classes, 1, p=self.class_weight)[0]
        class_files = self.class_to_list_files[self.list_classes[class_idx]]

        # Two files of that class, sampled with replacement.
        examples_class_idx = np.random.choice(range(len(class_files)), 2)
        positive_example_1 = class_files[examples_class_idx[0]]
        positive_example_2 = class_files[examples_class_idx[1]]

        # Rejection-sample a file belonging to a different class.
        negative_example = None
        while negative_example is None or self.file_class_mapping[negative_example] == \
                self.file_class_mapping[positive_example_1]:
            negative_example_idx = np.random.choice(self.range_all_files, 1)[0]
            negative_example = self.list_all_files[negative_example_idx]
        return positive_example_1, negative_example, positive_example_2





# Directory that gen() joins with each sampled file name to load an image.
base_path = image_augment_path

# The model's output already is the per-sample loss, so this "loss" only
# averages it.
def identity_loss(y_true, y_pred):
    """Return the mean of ``y_pred``; ``y_true`` is multiplied by 0 and so has no effect on the value."""
    ignored_target = 0 * y_true
    return K.mean(y_pred - ignored_target)

# Bayesian Personalized Ranking loss
def bpr_triplet_loss(X):
    """Compute ``1 - sigmoid(<u, p> - <u, n>)`` for a triplet of embeddings.

    ``X`` unpacks to ``(positive, negative, user)`` embeddings; the result
    keeps a trailing dimension of size 1.
    """
    positive_item_latent, negative_item_latent, user_latent = X

    # Dot products of the third embedding with the positive / negative one.
    positive_score = K.sum(user_latent * positive_item_latent, axis=-1, keepdims=True)
    negative_score = K.sum(user_latent * negative_item_latent, axis=-1, keepdims=True)

    return 1.0 - K.sigmoid(positive_score - negative_score)

def get_base_model():
    """Build the embedding network shared by the three triplet branches.

    A ResNet50 without its top classifier and without pretrained weights takes
    an ``input_shape + (CHANNEL,)`` image. Because ``include_top`` is False the
    input shape is passed explicitly. Its output goes through global max
    pooling, dropout (0.5) and a dense projection to 50 dimensions, and is
    finally L2-normalised along axis 1.
    """
    embedding_size = 50
    backbone = ResNet50(weights=None, include_top=False, input_shape=input_shape + (CHANNEL,))

    pooled = GlobalMaxPooling2D()(backbone.output)
    regularized = Dropout(0.5)(pooled)
    projected = Dense(embedding_size)(regularized)
    # Unit-length embeddings, so distances depend only on direction.
    unit_length = Lambda(lambda t: K.l2_normalize(t, axis=1))(projected)

    return Model(backbone.input, unit_length, name="base_model")

def build_model():
    """Build and compile the three-input triplet training model.

    The same base model embeds ``positive_example_1``, ``negative_example`` and
    ``positive_example_2``; a ``Lambda`` layer named ``'loss'`` applies
    ``bpr_triplet_loss`` to the three embeddings. The model's output is that
    loss, so it is compiled with ``identity_loss``.

    Returns:
        The compiled Keras model.
    """
    base_model = get_base_model()

    # Three single-image inputs of shape input_shape + (CHANNEL,).
    positive_example_1 = Input(input_shape+(CHANNEL,), name='positive_example_1')
    negative_example = Input(input_shape+(CHANNEL,), name='negative_example')
    positive_example_2 = Input(input_shape+(CHANNEL,), name='positive_example_2')

    positive_example_1_out = base_model(positive_example_1)
    negative_example_out = base_model(negative_example)
    positive_example_2_out = base_model(positive_example_2)

    # Lambda replaces the deprecated legacy merge(mode=<callable>) call; it
    # passes the list of three embeddings to bpr_triplet_loss unchanged.
    loss = Lambda(bpr_triplet_loss, output_shape=(1, ), name='loss')(
        [positive_example_1_out, negative_example_out, positive_example_2_out])

    # Keras 2 keyword names (inputs/outputs) instead of the deprecated
    # input/output.
    model = Model(
        inputs=[positive_example_1, negative_example, positive_example_2],
        outputs=loss)
    model.compile(loss=identity_loss, optimizer=Adam(0.000001))

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    return model


# Name used as the prefix of the checkpoint file.
model_name = "triplet_model"

# Checkpoint location: <siamese_augment dir>/<model_name>weights.best.hdf5
file_path = os.path.join('./data/whale/siamese_augment', model_name + "weights.best.hdf5")



def build_inference_model(weight_path=file_path):
    """Rebuild the triplet model, load trained weights, and return the embedding model.

    The full three-branch model is reconstructed so that the weights stored at
    ``weight_path`` can be loaded into it. Only the shared base model, which
    maps one image to its embedding, is returned.

    Args:
        weight_path: path of the weights file to load.

    Returns:
        A compiled Keras model mapping an image to its embedding.
    """
    base_model = get_base_model()

    positive_example_1 = Input(input_shape+(CHANNEL,), name='positive_example_1')
    negative_example = Input(input_shape+(CHANNEL,), name='negative_example')
    positive_example_2 = Input(input_shape+(CHANNEL,), name='positive_example_2')

    positive_example_1_out = base_model(positive_example_1)
    negative_example_out = base_model(negative_example)
    positive_example_2_out = base_model(positive_example_2)

    # Lambda replaces the deprecated legacy merge(mode=<callable>) call.
    loss = Lambda(bpr_triplet_loss, output_shape=(1, ), name='loss')(
        [positive_example_1_out, negative_example_out, positive_example_2_out])

    model = Model(
        inputs=[positive_example_1, negative_example, positive_example_2],
        outputs=loss)
    model.compile(loss=identity_loss, optimizer=Adam(0.000001))

    # Load the weights produced by training.
    model.load_weights(weight_path)

    # The base model covers only image -> embedding, without the triplet loss.
    inference_model = Model(inputs=base_model.get_input_at(0), outputs=base_model.get_output_at(0))
    inference_model.compile(loss="mse", optimizer=Adam(0.000001))
    # summary() prints by itself and returns None, so it is not wrapped in print().
    inference_model.summary()

    return inference_model

def read_and_normalize(filepath):
    """Load an already-resized image as a float32 array of shape ``input_shape + (CHANNEL,)``.

    Pixel values are divided by ``max + 0.001``, so the result lies in
    ``[0, 1)`` and an all-black image does not cause a division by zero.
    No resizing is done here.
    """
    # data_augment() stores its JPEGs via convert('RGB'); a 3-channel array
    # cannot be reshaped to a single-channel shape, so convert to the mode
    # matching CHANNEL first (as the other reader functions in this file do).
    mode = 'L' if CHANNEL == 1 else 'RGB'
    im = Image.open(filepath).convert(mode)
    im_array = np.array(im, dtype="uint8")
    im_array = im_array.reshape(input_shape+(CHANNEL,))
    # Scale to floats.
    return np.array(im_array / (np.max(im_array)+ 0.001), dtype="float32")

# # 进行小概率的augment
# def augment(im_array):
#     if np.random.uniform(0, 1) > 0.9:
# #       fliplr只对第1维度column进行flip
#         im_array = np.fliplr(im_array)
#     return im_array

# Number of triplets per batch yielded by gen() below.
batch_size = 8

def gen(triplet_gen):
    """Endlessly yield ``(inputs, targets)`` batches of image triplets.

    Each batch holds ``batch_size`` triplets drawn from
    ``triplet_gen.get_sample()`` (the same triplet may be drawn more than
    once). ``inputs`` is a list of three arrays: first positives, negatives,
    second positives. ``targets`` is an array of ones that only satisfies the
    ``(inputs, targets)`` format expected by ``fit_generator``; the loss used
    by this notebook ignores it.
    """
    def _load(file_name):
        # Resolve the name against base_path and load the normalised image.
        return read_and_normalize(os.path.join(base_path, file_name))

    while True:
        first_positives = []
        negatives = []
        second_positives = []

        for _ in range(batch_size):
            name_pos_1, name_neg, name_pos_2 = triplet_gen.get_sample()
            first_positives.append(_load(name_pos_1))
            negatives.append(_load(name_neg))
            second_positives.append(_load(name_pos_2))

        inputs = [np.array(first_positives), np.array(negatives), np.array(second_positives)]

        # One batch per yield, built on the fly so the whole dataset never
        # has to sit in memory.
        yield inputs, np.ones(batch_size)

In [29]:
num_epochs = 3

# Reload the CSV indexes of the augmented train / validation images.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# image file name -> whale Id, for each split
file_id_mapping_train = dict(zip(train.Image.values, train.Id.values))
file_id_mapping_test = dict(zip(test.Image.values, test.Id.values))

# One triplet sampler per split.
train_gen = sample_gen(file_id_mapping_train)
test_gen = sample_gen(file_id_mapping_test)

sum= 1.0000000000000002
sum= 1.0


In [30]:
# Prepare the test triplets

model = build_model()

# To resume from an earlier checkpoint, uncomment:
#model.load_weights(file_path)

# Save to file_path whenever val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Stop once val_loss has not improved for 5 consecutive epochs.
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)

callbacks_list = [checkpoint, early]

# Train on batches produced on the fly by Python generators, which keeps
# memory usage low.
history = model.fit_generator(
    gen(train_gen),
    steps_per_epoch=1000,
    epochs=num_epochs,
    validation_data=gen(test_gen),
    validation_steps=100,
    callbacks=callbacks_list,
    verbose=2,
    workers=1,
    use_multiprocessing=False)
                              



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
positive_example_1 (InputLayer) (None, 256, 256, 1)  0                                            
__________________________________________________________________________________________________
negative_example (InputLayer)   (None, 256, 256, 1)  0                                            
__________________________________________________________________________________________________
positive_example_2 (InputLayer) (None, 256, 256, 1)  0                                            
__________________________________________________________________________________________________
base_model (Model)              (None, 50)           23683890    positive_example_1[0][0]         
                                                                 negative_example[0][0]           
          

In [22]:
# imm = read_and_normalize('./data/whale/siamese_augment/train_aug/11da3702_2.jpg')
# print imm.shape

(256, 256, 1)


In [34]:
# def read_resize_normalize(filepath):
# #   这里不是用的grayscale，而是转成RGB了
#     im = Image.open((filepath)).convert('L')
# #     im = im.resize(input_shape)
# #   im的shape变成（256， 256， 3）
#     im_array = np.array(im, dtype="uint8")
#     print im_array.shape
# #     im_array = im_array.reshape(input_shape+(CHANNEL,))
# #   转换成float类型
#     return np.array(im_array / (np.max(im_array)+ 0.001), dtype="float32")

# imm = read_resize_normalize('./data/whale/train_full/11da3702.jpg')
# print imm.shape

(435, 1010)


In [35]:
# Reassigned here: this value names the submission file ("sub_<model_name>.csv").
# file_path (the weights path) was already built from the earlier model_name.
model_name = "triplet_loss"

def read_resize_normalize(filepath):
    """Load an image as grayscale, resize it to ``input_shape`` and scale it to float32.

    The result has shape ``input_shape + (CHANNEL,)``. Pixels are divided by
    ``max + 0.001``, so values lie in ``[0, 1)``.
    """
    grayscale = Image.open(filepath).convert('L').resize(input_shape)
    pixels = np.array(grayscale, dtype="uint8").reshape(input_shape + (CHANNEL,))
    # The small offset keeps the division defined for an all-black image.
    scale = np.max(pixels) + 0.001
    return np.array(pixels / scale, dtype="float32")


def data_generator(fpaths, batch=16, loader=None):
    """Yield ``(file_names, images)`` batches for the given image paths.

    Every path is loaded with ``loader`` and grouped into batches of ``batch``
    items; the final batch may be smaller. Each path is yielded exactly once
    and an empty ``fpaths`` yields nothing.

    Args:
        fpaths: iterable of image file paths.
        batch: maximum number of images per yielded batch.
        loader: callable mapping a path to an image array. Defaults to
            ``read_resize_normalize``.

    Yields:
        A tuple ``(fnames, imgs)``: the list of base file names and the
        stacked ``np.array`` of the corresponding loaded images.
    """
    if loader is None:
        loader = read_resize_normalize

    imgs = []
    fnames = []
    for path in fpaths:
        imgs.append(loader(path))
        # Keep only the file name, not the directory part.
        fnames.append(os.path.basename(path))
        if len(imgs) == batch:
            yield fnames, np.array(imgs)
            imgs = []
            fnames = []

    # Flush the trailing partial batch only if it holds something. The
    # previous `i < batch` check re-yielded the last full batch when the
    # number of paths was a multiple of `batch`, and failed on empty input.
    if imgs:
        yield fnames, np.array(imgs)
    # Falling off the end finishes the generator; an explicit
    # `raise StopIteration()` turns into RuntimeError under PEP 479.

def _embed_files(model, fpaths, batch=32):
    """Return ``(embeddings, file_names)`` for all images in ``fpaths``.

    Images are fed to ``model.predict`` in batches produced by
    ``data_generator``. Row k of the returned array corresponds to
    ``file_names[k]``.
    """
    embeddings = []
    file_names = []
    # Each (fnames, imgs) pair is one batch of samples.
    for fnames, imgs in data_generator(fpaths, batch=batch):
        # Turn the batch of images into embeddings, kept as plain lists.
        embeddings.extend(model.predict(imgs).tolist())
        file_names.extend(fnames)
    return np.array(embeddings), file_names


data = pd.read_csv('./data/whale/train.csv')

# image file name -> whale Id for the full original training set
file_id_mapping = {k: v for k, v in zip(data.Image.values, data.Id.values)}

inference_model = build_inference_model()

# glob returns a list of every path matching the pattern.
train_files = glob.glob("./data/whale/train_full/*.jpg")
test_files = glob.glob("./data/whale/test/*.jpg")

# Embeddings of all original train images and of all test images.
train_preds, train_file_names = _embed_files(inference_model, train_files, batch=32)
test_preds, test_file_names = _embed_files(inference_model, test_files, batch=32)




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 256, 256, 1)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 262, 262, 1)  0           input_8[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 128, 128, 64) 3200        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 128, 128, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [36]:
# Classify each test image from its 6 nearest training embeddings
# (NearestNeighbors with its default distance metric).
neigh = NearestNeighbors(n_neighbors=6)
neigh.fit(train_preds)

# For every test sample, get the 6 closest training embeddings. Note that
# neighbors_test holds row indices into train_preds, not the embeddings.
distances_test, neighbors_test = neigh.kneighbors(test_preds)

distances_test, neighbors_test = distances_test.tolist(), neighbors_test.tolist()

preds_str = []

for distance, neighbour_ in zip(distances_test, neighbors_test):
    sample_result = []
    sample_classes = set()
    # Visit neighbours from nearest to farthest so that the first time an Id
    # is seen it carries its smallest distance.
    for d, n in sorted(zip(distance, neighbour_)):
        class_train = file_id_mapping[os.path.basename(train_files[n])]
        # Keep each Id once: repeating an Id only wastes one of the 5
        # prediction slots.
        if class_train in sample_classes:
            continue
        sample_classes.add(class_train)
        sample_result.append((class_train, d))

    if "new_whale" not in sample_classes:
        # new_whale is very common, so it is inserted with a fixed distance of 0.1.
        sample_result.append(("new_whale", 0.1))
    sample_result.sort(key=lambda x: x[1])
    sample_result = sample_result[:5]  # keep the 5 closest predictions
    preds_str.append(" ".join([x[0] for x in sample_result]))

df = pd.DataFrame(preds_str, columns=["Id"])
df['Image'] = [os.path.basename(x) for x in test_file_names]
sub_csv_path = "sub_%s.csv"%model_name
sub_csv_path = os.path.join('./data/whale/siamese_augment', sub_csv_path)
df.to_csv(sub_csv_path, index=False)