In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import cv2
import h5py
import pathlib
import keras
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf

from keras.preprocessing import image
from keras.models import Model
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (cross_val_score, KFold, train_test_split)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
# define functions
def extract_image_features(image_directory, cnn_model):
    """Compute a CNN feature array for every ``.jpg`` file below a directory.

    The directory tree is walked recursively. Each image is loaded at
    299x299, turned into a batch of one, run through ``preprocess_input``
    and finally through ``cnn_model.predict``.

    Args:
        image_directory: root directory that is searched for ``.jpg`` files.
        cnn_model: object exposing ``predict(batch)``.

    Returns:
        dict mapping each file name (without the ``.jpg`` suffix) to the value
        returned by ``cnn_model.predict`` for that image.
    """
    features_by_id = {}
    n_seen = 1
    for folder, _subfolders, filenames in os.walk(image_directory):
        for filename in filenames:
            # only files whose name ends in '.jpg' are processed
            if not filename.endswith('.jpg'):
                continue
            # coarse progress marker, emitted once per 1000 images
            if n_seen % 1000 == 0:
                print('1k')

            pixels = image.img_to_array(
                image.load_img(os.path.join(folder, filename), target_size=(299, 299))
            )
            batch = preprocess_input(np.expand_dims(pixels, axis=0))

            # key is the file name with the 4-character extension removed
            features_by_id[filename[:-4]] = cnn_model.predict(batch)
            n_seen += 1
    return features_by_id


def train_split(n_splits, test_size, training_triplets, n_image_ids=5000):
    """Build image-disjoint train/test row splits over a table of triplets.

    For every split a handful of random image ids is drawn and the test set is
    grown, one neighbouring image id at a time, until it holds more than
    ``test_size`` triplets. The training set consists of every triplet that
    shares no image id with any test triplet, so triplets that mix test and
    non-test images end up in neither set.

    Every selected row position ``r`` is returned together with
    ``r + len(training_triplets)``; the returned lists therefore address a
    sample array that is twice as long as ``training_triplets``.

    Args:
        n_splits: number of splits to generate (at most 100, because the
            per-split seeds are sampled without replacement from range(100)).
        test_size: the test set is grown until it contains more than this
            many triplets.
        training_triplets: DataFrame with one triplet of image ids per row.
        n_image_ids: the 5 starting ids of each split are sampled from
            ``range(n_image_ids)``. Defaults to 5000.

    Returns:
        ``(collect_train_splits, collect_test_splits)``: two lists with one
        sorted list of row positions per split.

    Raises:
        ValueError: if the test set cannot be grown any further because no
            triplet in it contains an image id that is not already selected.
    """
    collect_train_splits = []
    collect_test_splits = []

    # Work on a plain array of positions so the result does not depend on the
    # DataFrame's index labels.
    triplet_ids = training_triplets.to_numpy()
    n_triplets = len(triplet_ids)

    def rows_containing(ids):
        # positions of all triplets that contain at least one of `ids`
        return np.flatnonzero(np.isin(triplet_ids, ids).any(axis=1))

    # Private generators yield the same draws as seeding the global `random`
    # module with the same values, without clobbering its state for callers.
    seeds = random.Random(42).sample(range(100), n_splits)

    for i in range(n_splits):
        rng = random.Random(seeds[i])

        # generate random image ids
        test_ids = rng.sample(range(n_image_ids), 5)

        # select all triplets containing at least one of the image ids
        test_rows = rows_containing(test_ids)

        # add one new id and collect all triplets containing it
        # continue until test set size is reached
        while len(test_rows) <= test_size:
            new_ids = np.unique(triplet_ids[test_rows]).tolist()
            if set(new_ids) <= set(test_ids):
                # without this guard the rejection sampling below never ends
                raise ValueError(
                    'cannot grow the test split: no unseen image id is '
                    'reachable from the current test triplets')

            new_id = rng.choice(new_ids)
            while new_id in test_ids:
                new_id = rng.choice(new_ids)
            test_ids.append(new_id)

            test_rows = rows_containing(test_ids)

        # get training triplets as all triplets that don't contain any of the test ids
        all_test_ids = np.unique(triplet_ids[test_rows])
        train_rows = np.flatnonzero(~np.isin(triplet_ids, all_test_ids).any(axis=1))

        # sort and convert to list to read from hdf5 file
        train_index = np.concatenate((train_rows, train_rows + n_triplets))
        test_index = np.concatenate((test_rows, test_rows + n_triplets))
        train_index.sort()
        test_index.sort()
        train_index = list(train_index)
        test_index = list(test_index)

        # gather splits
        collect_train_splits.append(train_index)
        collect_test_splits.append(test_index)

        print('k-fold ', i)
        print('Number of training triplets:', len(train_index))
        print('Number of testing triplets:', len(test_index))

    return collect_train_splits, collect_test_splits

In [4]:
# Load the triplet lists. Both files are space-separated without a header row;
# the three columns are labelled A, B and C.
image_dir = './food'

train_triplets = pd.read_csv('train_triplets.txt', sep=' ', header=None)
train_triplets.columns = ['A', 'B', 'C']

test_triplets = pd.read_csv('test_triplets.txt', sep=' ', header=None)
test_triplets.columns = train_triplets.columns

for frame in (train_triplets, test_triplets):
    print(frame.shape)

(59515, 3)
(59544, 3)


In [5]:
# Pretrained InceptionV3 (ImageNet weights) used purely as a feature extractor:
# the classification head is left out and the output is average-pooled.
vision_model = InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)

In [6]:
# Smoke test: push a single image through the same load -> array -> batch ->
# preprocess -> predict steps used by extract_image_features and inspect the
# shape of the result.
sample_img = './food/00000.jpg'
img = image.load_img(sample_img, target_size=(299, 299))
img_data = preprocess_input(
    np.expand_dims(image.img_to_array(img), axis=0)
)

sample_feature = vision_model.predict(img_data)
print(sample_feature.shape)

(1, 2048)


In [None]:
# Run the CNN over every .jpg below image_dir; the result maps image id -> feature array.
inceptionv3_features_dict = extract_image_features(
    image_directory=image_dir,
    cnn_model=vision_model,
)

# Persist the dictionary so the extraction does not have to be repeated.
np.save('inceptionv3_features_dict.npy', inceptionv3_features_dict)

In [7]:
# load dictionary
# The dict was stored with np.save, i.e. as a pickled 0-d object array, so it
# has to be loaded with allow_pickle=True and unwrapped with .item().
# NOTE: unpickling can execute arbitrary code - only load a file you created yourself.
image_features = np.load('inceptionv3_features_dict.npy', allow_pickle=True).item()

In [8]:
n_images = len(image_features)

# Collect the per-image features into one matrix. Row k holds the feature of
# the image whose id is k written as a zero-padded 5-digit string.
raw_features = np.zeros(shape=(n_images, 2048), dtype='float32')
for img_number in range(n_images):
    img_key = '{:0>5d}'.format(img_number)
    raw_features[img_number] = image_features[img_key]

# Every training triplet is used twice (once per class), and every sample is
# the concatenation of three image feature vectors.
n_train_samples = train_triplets.shape[0] * 2
n_test_samples = test_triplets.shape[0]
n_feature_maps = raw_features.shape[-1] * 3
print('Number of all training samples: ', n_train_samples)
print('Number of submission samples: ', n_test_samples)
print('Number of triplet features: ', n_feature_maps)

Number of all training samples:  119030
Number of submission samples:  59544
Number of triplet features:  6144


In [None]:
# create training and submission data and save on disc
def _write_triplet_rows(dataset, features, triplets, column_order, row_offset=0, chunk_size=4096):
    """Write concatenated per-image features for every triplet into `dataset`.

    Row ``row_offset + k`` receives the features of the images named by
    ``column_order`` in triplet ``k``, concatenated along the last axis.
    Rows are assembled and written in chunks of ``chunk_size`` triplets so
    that neither a per-row HDF5 write nor one huge temporary array is needed.
    """
    id_columns = [triplets[name].to_numpy() for name in column_order]
    n_rows = len(triplets)
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        block = np.concatenate([features[col[start:stop]] for col in id_columns], axis=-1)
        dataset[row_offset + start:row_offset + stop] = block


with h5py.File('task4_data.hdf5', 'w') as f:
    X = f.create_dataset('X', (n_train_samples, n_feature_maps), dtype='float32')
    X_submission = f.create_dataset('X_submission', (n_test_samples, n_feature_maps), dtype='float32')

    n_triplets = len(X) // 2

    # create class 1 training samples: first half of X, images in order (A, B, C)
    _write_triplet_rows(X, raw_features, train_triplets, ('A', 'B', 'C'))

    # create class 0 training samples: second half of X, same triplets with B and C swapped
    _write_triplet_rows(X, raw_features, train_triplets, ('A', 'C', 'B'), row_offset=n_triplets)

    # create submission samples
    _write_triplet_rows(X_submission, raw_features, test_triplets, ('A', 'B', 'C'))

    print(X.shape, X.dtype)
    print(X_submission.shape, X_submission.dtype)

In [9]:
# Labels for the training samples: the first half of the samples is class 1,
# the second half stays class 0.
y = np.zeros(shape=(n_train_samples))
y[:n_train_samples // 2] = 1
print(y.shape)

(119030,)


In [10]:
# clear tensorflow session
# Resets the global state kept by tf.keras before the classifier is defined.
tf.keras.backend.clear_session()

In [11]:
# define classifier model
# Fully connected binary classifier on the concatenated triplet features.
# `shape` must be a tuple; `(n_feature_maps)` without the trailing comma is a plain int.
inputs = tf.keras.Input(shape=(n_feature_maps,))
x = tf.keras.layers.Activation('relu')(inputs)
x = tf.keras.layers.Dropout(0.4)(x)
# hidden stack: Dense -> ReLU with shrinking widths
for units in (1024, 256, 64, 8):
    x = tf.keras.layers.Dense(units)(x)
    x = tf.keras.layers.Activation('relu')(x)
# single sigmoid output unit
x = tf.keras.layers.Dense(1)(x)
outputs = tf.keras.layers.Activation('sigmoid')(x)
model = tf.keras.Model(inputs, outputs)

In [12]:
# create model
# Binary cross-entropy on the sigmoid output. `learning_rate` is the current
# name of the optimizer argument; `lr` is a deprecated alias.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Snapshot of the untrained weights; later cells reload it so that every
# training run starts from the same initialisation.
model.save_weights('initial_weights.h5')

In [13]:
# define training variables
epochs = 11
batch_size = 32
kfolds = 2
validation_size = 2000
# The model is a tf.keras model, so the callback is taken from tf.keras as well
# instead of the standalone keras package.
# Training stops once val_loss has not improved for 4 consecutive epochs.
early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='min')

In [14]:
# Build `kfolds` image-disjoint train/validation row index lists.
X_train_inds, X_test_inds = train_split(
    n_splits=kfolds,
    test_size=validation_size,
    training_triplets=train_triplets,
)

k-fold  0
Number of training triplets: 26494
Number of testing triplets: 4016
k-fold  1
Number of training triplets: 26728
Number of testing triplets: 4014


In [15]:
model.load_weights('initial_weights.h5')

# Sanity check: evaluate the untrained model on the training rows of the first fold.
with h5py.File('task4_data.hdf5', 'r') as f:
    first_fold_rows = X_train_inds[0]
    loss0, accuracy0 = model.evaluate(
        x=f['X'][first_fold_rows],
        y=y[first_fold_rows],
        batch_size=batch_size,
        verbose=0,
    )
    print('Initial loss: {:.2f}'.format(loss0))
    print('Initial accuracy: {:.2f}'.format(accuracy0))

Initial loss: 0.70
Initial accuracy: 0.50


In [17]:
# normal training with early stopping / cross validation if kfolds > 1
# One History object per fold is kept in `fold_histories`; `history` still
# refers to the last fold once the loop has finished.
fold_histories = []
for i in range(kfolds):
    # every fold starts from the same untrained weights
    model.load_weights('initial_weights.h5')

    with h5py.File('task4_data.hdf5','r') as f:
        history = model.fit(x=f['X'][X_train_inds[i]], y=y[X_train_inds[i]],
                            batch_size=batch_size,  # configured value instead of a second hard-coded 32
                            epochs=epochs,
                            callbacks=[early],
                            validation_data=(f['X'][X_test_inds[i]], y[X_test_inds[i]]))
    fold_histories.append(history)

Train on 26494 samples, validate on 4016 samples
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 00009: early stopping
Train on 26728 samples, validate on 4014 samples
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 00006: early stopping


In [None]:
# load initial weights before training full model
model.load_weights('initial_weights.h5')

# NOTE(review): fixed by hand; presumably chosen from the cross-validation runs - confirm.
best_epochs = 7

# train full model
with h5py.File('task4_data.hdf5','r') as f:
    print('Training..')
    # reads the complete training matrix into memory; uses the configured
    # batch_size rather than a separate hard-coded value
    model.fit(x=f['X'][:], y=y, epochs=best_epochs, batch_size=batch_size)

    # predict on submission file
    prediction = model.predict(f['X_submission'][:])
    print('..done')

In [None]:
# create submission file
# Threshold the model outputs: values below 0.5 become 0, everything else 1.
prediction = np.where(prediction < 0.5, 0, 1)
# np.savetxt writes one array row per line, formatted as integers
np.savetxt('submission.txt', prediction, fmt='%d')