In [None]:
from collections import Counter

import numpy as np
np.random.seed(42)
import pandas as pd

import cv2
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score

from keras.models import Model
#from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg19 import VGG19
from keras.layers import Dense, Input, Dropout, GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [None]:
# Run configuration for the cross-validated training / prediction loop.
#input_size = 128
#input_channels = 3
# Upper bound on epochs per fold; it is passed as `epochs` to fit_generator.
epochs = 50
# Number of samples per batch produced by the train/valid/test generators.
batch_size = 128
# Number of KFold splits, also used to average fold predictions and thresholds.
n_folds = 5
# When False, fitting is skipped and each fold only loads its checkpoint weights.
training = True
ensemble_voting = False  # If True, use voting for model ensemble, otherwise use averaging

In [None]:
# Define CNN Model Architecture
# Input geometry of the network as (height, width, channels).
img_height, img_width, img_channels = 224, 224, 3
img_dim = (img_height, img_width, img_channels)

def Vgg19(img_dim=img_dim):
    """Build a single-output sigmoid classifier on top of an ImageNet VGG19 base.

    Parameters
    ----------
    img_dim : tuple
        Input shape as (height, width, channels). It is used for both the
        model input and the VGG19 base, so the two always agree.

    Returns
    -------
    keras.models.Model
        Uncompiled model: BatchNormalization -> VGG19 (no top) ->
        GlobalAveragePooling2D -> Dropout(0.5) -> Dense(1, sigmoid).
    """
    input_tensor = Input(shape=img_dim)
    # Build the convolutional base from the same shape as the input tensor;
    # reading the module-level height/width here would ignore the argument.
    base_model = VGG19(include_top=False,
                       weights='imagenet',
                       input_shape=img_dim)
    bn = BatchNormalization()(input_tensor)
    x = base_model(bn)
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)
    model = Model(input_tensor, output)

    return model

# Build the network with the default input shape and print its layer summary.
model = Vgg19()
model.summary()

In [None]:
# Shuffled K-fold splitter with a fixed random_state.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)

# Incremented at the top of every fold; used in log output and checkpoint file names.
fold_count = 0

# Receives one TTA-averaged test prediction array per fold.
y_full_test = []
# Running sum of the per-fold thresholds; divided by n_folds after the loop.
# NOTE(review): the length (2) must match the threshold list produced in each
# fold, otherwise the in-place add in the loop will not broadcast — confirm.
thres_sum = np.zeros(2, np.float32)

In [None]:
# Cross-validation loop: each iteration handles one train/validation split.
for train_index, test_index in kf.split(X_train):

    # 1-based fold number, used below for logging and checkpoint file names.
    fold_count += 1
    print('Fold ', fold_count)


    def transformations(src, choice):
        """Apply one of six fixed rotate / flip augmentations to an image.

        `choice` 0-5 selects a rotation (90 clockwise, 180, or 90
        counter-clockwise), optionally followed by a horizontal flip.
        Any other value returns `src` untouched.
        """
        # choice -> (cv2 rotation code, mirror horizontally afterwards?)
        recipes = {
            0: (cv2.ROTATE_90_CLOCKWISE, False),
            1: (cv2.ROTATE_90_CLOCKWISE, True),
            2: (cv2.ROTATE_180, False),
            3: (cv2.ROTATE_180, True),
            4: (cv2.ROTATE_90_COUNTERCLOCKWISE, False),
            5: (cv2.ROTATE_90_COUNTERCLOCKWISE, True),
        }
        if choice not in recipes:
            return src
        rotate_code, mirror = recipes[choice]
        out = cv2.rotate(src, rotateCode=rotate_code)
        if mirror:
            out = cv2.flip(out, flipCode=1)
        return out
    
    # Select this fold's training inputs and targets.
    df_train = X_train[train_index]
    y_train = target_train[train_index]
    # Log the array shape instead of dumping the raw contents of the first
    # three samples, which floods the cell output.
    print(np.shape(df_train))
    if training:
        print('Training on {} samples'.format(len(df_train)))
        print('Training target on {} samples'.format(len(y_train)))



    def train_generator():
        """Yield (images, targets) training batches forever.

        Every image is resized to `img_size`, given one randomly chosen
        augmentation from `transformations`, and the stacked float32 batch is
        divided by 255. Targets are returned as a uint8 array.
        """
        while True:
            for lo in range(0, len(df_train), batch_size):
                hi = min(lo + batch_size, len(df_train))
                targets = y_train[lo:hi]
                #img = cv2.imread('input/train-jpg/{}.jpg'.format(f))
                images = [transformations(cv2.resize(sample, img_size),
                                          np.random.randint(6))
                          for sample in df_train[lo:hi]]
                yield (np.array(images, np.float32) / 255.,
                       np.array(targets, np.uint8))


    # Select this fold's held-out inputs and targets.
    df_valid = X_train[test_index]
    y_valid = target_train[test_index]
    # Log the array shape instead of dumping the raw contents of the first
    # three samples, which floods the cell output.
    print(np.shape(df_valid))
    print('Validating on {} samples'.format(len(df_valid)))


    def valid_generator():
        """Yield (images, targets) validation batches forever, in order.

        Images are resized to `img_size` and not augmented. The float32 batch
        is divided by 255 so validation inputs have the same scale as the
        batches produced by `train_generator`; without this the monitored
        validation loss is computed on differently scaled data.
        """
        while True:
            for start in range(0, len(df_valid), batch_size):
                end = min(start + batch_size, len(df_valid))
                x_batch = [cv2.resize(img, img_size) for img in df_valid[start:end]]
                x_batch = np.array(x_batch, np.float32) / 255.
                y_batch = np.array(y_valid[start:end], np.uint8)
                yield x_batch, y_batch

    def test_generator(transformation):
        """Yield test-image batches forever with one fixed augmentation applied.

        Parameters
        ----------
        transformation : int
            Augmentation index forwarded to `transformations` for every image.

        Each image is resized to `img_size` first and the resized image is
        what gets transformed. The float32 batch is divided by 255 to match
        the scaling used by `train_generator`.
        """
        while True:
            for start in range(0, len(test), batch_size):
                x_batch = []
                end = min(start + batch_size, len(test))
                for img in test[start:end]:
                    new_img = cv2.resize(img, img_size)
                    # Transform the resized image, not the raw one, so the
                    # resize is not silently thrown away.
                    new_img = transformations(new_img, transformation)
                    x_batch.append(new_img)
                x_batch = np.array(x_batch, np.float32) / 255.
                yield x_batch

    # Stop once val_loss has not improved by at least 1e-4 for 4 epochs.
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=4,
                                   verbose=1,
                                   min_delta=1e-4)
    # Cut the learning rate to a tenth after 2 epochs without val_loss improvement.
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.1,
                                   patience=2,
                                   cooldown=2,
                                   verbose=1)
    # Keep only the best weights seen so far, one file per fold.
    checkpointer = ModelCheckpoint(filepath='best_weights.fold_' + str(fold_count) + '.hdf5',
                                   save_best_only=True,
                                   save_weights_only=True)
    callbacks = [early_stopping, lr_reducer, checkpointer]

    # Build a fresh network for every fold. Reusing one model instance would
    # carry weights over from earlier folds, whose training data includes the
    # samples this fold holds out for validation.
    model = Vgg19()

    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=1e-4),
                  metrics=['accuracy'])

    # Whole number of batches needed to see every sample once; the last batch
    # may be smaller than batch_size, hence the ceiling.
    train_steps = int(np.ceil(len(df_train) / float(batch_size)))
    valid_steps = int(np.ceil(len(df_valid) / float(batch_size)))
    # The test generator also yields batch_size images per step, so the step
    # count is derived from batch_size, not from the number of folds.
    test_steps = int(np.ceil(len(test) / float(batch_size)))

    if training:
        model.fit_generator(generator=train_generator(),
                            steps_per_epoch=train_steps,
                            epochs=epochs,
                            verbose=2,
                            callbacks=callbacks,
                            validation_data=valid_generator(),
                            validation_steps=valid_steps)


    def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
        """Greedy per-column search for thresholds that maximise the F2 score.

        Columns are optimised one at a time, left to right, while the other
        thresholds are held at their current values (initially 0.2).

        Parameters
        ----------
        y : array-like, shape (n_samples, n_classes)
            Binary ground-truth indicator matrix.
        p : array-like, shape (n_samples, n_classes)
            Predicted scores, thresholded with a strict `>`.
        verbose : bool
            Print the chosen threshold and score for every column.
        resolution : int
            Number of candidate thresholds, evenly spaced in [0, 1).

        Returns
        -------
        list of float
            One threshold per column of `p`.
        """
        p = np.asarray(p)
        # Take the class count from the predictions so the scorer and the
        # search loop cannot disagree about how many columns there are.
        n_classes = p.shape[1]

        def mf(x):
            # Builtin int replaces the np.int alias, which newer NumPy removed.
            p2 = (p > np.asarray(x)).astype(int)
            return fbeta_score(y, p2, beta=2, average='samples')

        x = [0.2] * n_classes
        for i in range(n_classes):
            best_i2 = 0
            best_score = 0
            for step in range(resolution):
                candidate = step / float(resolution)
                x[i] = candidate
                score = mf(x)
                if score > best_score:
                    best_i2 = candidate
                    best_score = score
            x[i] = best_i2
            if verbose:
                print(i, best_i2, best_score)
        return x


    # Load best weights
    model.load_weights(filepath='best_weights.fold_' + str(fold_count) + '.hdf5')

    p_valid = model.predict_generator(generator=valid_generator(),
                                      steps= valid_steps)
    # valid_generator cycles forever, so if the step count overshoots, the
    # extra rows are repeats from the start; keep one prediction per sample.
    p_valid = np.asarray(p_valid)[:len(df_valid)]

    # Use this fold's labels (target_train[test_index], already held in
    # y_valid) as ground truth. The previous code tried to rebuild them from
    # (file, tags) rows and `tags.split('')`, which cannot run: df_valid holds
    # images, and str.split rejects an empty separator.
    # NOTE(review): reshaped to 2-D because the scorer is called with
    # average='samples'; confirm the label layout matches the model output.
    y_true = np.asarray(y_valid, np.uint8).reshape(len(df_valid), -1)

    # Find optimal f2 thresholds for local validation set
    thres = optimise_f2_thresholds(y_true, p_valid, verbose=False)

    print('F2 = {}'.format(fbeta_score(y_true, np.array(p_valid) > thres, beta=2, average='samples')))

    thres_sum += np.array(thres, np.float32)


    
    # 6-fold TTA
    # One full pass over the test set per augmentation index 0..5.
    p_full_test = [model.predict_generator(generator=test_generator(transformation=tta_choice),
                                           steps= test_steps)
                   for tta_choice in range(6)]

    # Average the six passes; start from a copy so the stored pass is not modified.
    p_test = np.array(p_full_test[0])
    for other_pass in p_full_test[1:]:
        p_test += np.array(other_pass)
    p_test /= 6

    y_full_test.append(p_test)

# Combine the per-fold test predictions into one array (a copy of fold 0 is
# used as the accumulator so y_full_test itself is left untouched).
result = np.array(y_full_test[0])
if ensemble_voting:
    # Take the tag count from the predictions rather than hard-coding it.
    n_files, n_tags = result.shape[0], result.shape[1]
    # NOTE(review): the values voted on are the TTA-averaged model outputs
    # stored per fold, not thresholded labels — confirm voting is meaningful.
    for f in range(n_files):  # For each file
        for tag in range(n_tags):  # For each tag
            votes = [y_full_test[fold][f][tag] for fold in range(n_folds)]
            # Most common prediction among folds
            result[f][tag] = Counter(votes).most_common(1)[0][0]
else:
    for fold in range(1, n_folds):
        result += np.array(y_full_test[fold])
    result /= n_folds
result = pd.DataFrame(result, columns=labels)

# Average of the per-fold thresholds accumulated during cross-validation.
thres = (thres_sum / n_folds).tolist()

# Boolean (n_files, n_tags) mask of scores strictly above their threshold.
# This replaces a per-row loop that iterated over the int `result.shape[0]`
# (a TypeError) and used the `.ix` indexer, which pandas no longer provides.
above_thres = result.values > np.asarray(thres)

# One space-separated string of the selected column labels per test file.
preds = [' '.join(result.columns[row_mask]) for row_mask in above_thres]