# Goal of this Kaggle competition

- Link of Kaggle competition: https://www.kaggle.com/c/imaterialist-challenge-fashion-2018
- As shoppers move online, it would be a dream come true to have products in photos classified automatically
- Although different fine-grained categories may look very similar (royal blue vs turquoise in color), it would be ideal to build a model able to perceive such subtle differences between photos, as these differences could be important for shopping decisions
- This Kaggle competition challenges us to accurately assign attribute labels for fashion images
- There are 228 distinct labels and our training dataset contains 1 million images

# Modeling approach

1. Download raw images from given URLs and resize them into a 112*112 RGB pixel format to build datasets
2. Build two base CNN (convolutional neural networks) models: one Xception architecture and one VGG16 model outputting labels probabilities
3. Build a meta learner neural network model taking as inputs the two base learners outputs and outputting the labels probabilities 
4. Measure F1 score performance of meta-learner on validation dataset

# Result

- Ranked 10th out of 212 teams (top 5%) at competition closure time (username “yasserez05” in the final private leaderboard)
- F1 score of 0.65 on test dataset (highest score of competition is 0.71)
- 3 Keras models (total size of 170MB) fully trained and usable for predicting labels of new images


# Import needed packages

In [56]:
import numpy as np
import pandas as pd

import time
import os
import gc
import json
import urllib
from urllib.error import HTTPError
import concurrent.futures
from matplotlib import pyplot as plt

from io import BytesIO
from PIL import Image

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.utils import shuffle

from skimage import io
from skimage.transform import rescale, resize, rotate

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dropout, Activation, Dense, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.applications import vgg16, vgg19, inception_v3, resnet50, mobilenet, xception, nasnet, densenet
from keras.utils.training_utils import multi_gpu_model
from keras.models import load_model

In [1]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Define key parameters

In [20]:
## How many GPUs will you use in your server (need to be an even number)
number_of_gpus = 2

## How many images we want to download for the train dataset
train_number_images = 200000

## Define batch size that will be used for our CNN models
batch_size = 16

## Define folders where to save resized images downloaded from Kaggle provided urls
train_images_folder = 'train_images_resized'
validation_images_folder = 'val_images_resized'
test_images_folder = 'test_images_resized'

# Download images, resize them and save them in output folders

In [31]:
## Import json file and transform it into metadata file (image_id, labels_id, image_urls)

def load_metadata(json_name, is_test_data = False):
    temp_json = json.load(open(json_name))
    
    if not is_test_data:
        temp_metadata_urls = pd.DataFrame(temp_json['images'])
        temp_metadata_labels = pd.DataFrame(temp_json['annotations'])
        
        temp_metadata = pd.merge(temp_metadata_labels
                                 , temp_metadata_urls
                                 , how = 'inner'
                                 , on = 'imageId')

        temp_metadata.columns = ['image_id', 'labels_id', 'image_url']
        
        del (temp_metadata_urls, temp_metadata_labels)
    
    else:
        temp_metadata = pd.DataFrame(temp_json['images'])
        temp_metadata.columns = ['image_id', 'image_url']
        
    del (temp_json)
    
    return temp_metadata

In [36]:
## Load train, validation and test metadata

train_metadata = load_metadata('train.json', is_test_data = False)
val_metadata = load_metadata('validation.json', is_test_data = False)
test_metadata = load_metadata('test.json', is_test_data = True)

In [25]:
## Show example of how to load data from URL, resize it and display it

url = val_metadata['image_url'][10]
response = urllib.request.urlopen(url)
buf = BytesIO(response.read())
img = Image.open(buf)
img = np.array(img)
img_resized = resize(img, (112,112))
io.imshow(img_resized)

In [26]:
## Define function that will download images, resize them and save them in output folder
## This function takes one input that is a tuple of 3 elements so that it can be used by the multi-threading operation in next cell

def download_resize_save_img(image_id_url_folder):
    image_id, url, image_folder = image_id_url_folder
    response = urllib.request.urlopen(url)
    buf = BytesIO(response.read())
    img = Image.open(buf)
    img = np.array(img)
    img_resized = resize(img, (112,112))
    image_filename = str('image_id' + str(image_id) + '.jpg')
    image_path = os.path.join(image_folder, image_filename)
    io.imsave(image_path, img_resized)

In [27]:
## Define function that will download images in multi-threading

def download_images_from_metadata(metadata, output_folder, number_images):
        ## Create input list of tuples for the multi-threading operation below
    image_id_url_folder = list(zip(metadata['image_id'][:number_images]
                                   , metadata['image_url'][:number_images]
                                   , [output_folder]*number_images))

        ## Download images in multi-threading
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        executor.map(download_resize_save_img, image_id_url_folder)

In [39]:
## Download train images in multi-threading

download_images_from_metadata(train_metadata, train_images_folder, train_number_images)

In [30]:
## Download validation images in multi-threading

download_images_from_metadata(val_metadata, val_images_folder, val_metadata.shape[0])

In [52]:
## Download test images in multi-threading

download_images_from_metadata(test_metadata, test_images_folder, test_metadata.shape[0])

# Create training, validation and test datasets

In [55]:
def create_dataset_for_X_and_y(images_folder, number_images, metadata, is_test_data = False):
    data = []
    
    for image_filename in os.listdir(images_folder)[:number_images]:
        img = io.imread(os.path.join(images_folder, image_filename), as_grey=False)
        image_id = image_filename.split('.')[0] # image_filename is like "image_id1000.jpg"
        image_id = image_id.split("image_id")[1]
        data.append({'image_id': image_id
                       , 'image_array': img})
    
    data = pd.DataFrame(data)
    
    if not is_test_data: # for train and validation, we need to merge the labels to each observation
        data = pd.merge(data
                          , metadata
                          , how = 'inner'
                          , on = 'image_id')
    
    return data

In [41]:
## Create the train dataset that will be used for X_train and Y_train
    
    # Train data size needs to be a multiple of the batch size as each batch is divided equally to be sent to each GPU for training
    # If one of the GPU receives a too small batch or none training data, it will return NaN weights and the model will be unusable
train_data = create_dataset_for_X_and_y(train_images_folder, batch_size*11600, train_metadata, is_test_data = False)

## Create the X_train array
X_train = np.array(list(train_data['image_array']))
X_train = X_train.astype('float32')/255 ## So that the array has values between 0 and 1

## Create the y_train array
y_train = list(train_data['labels_id'])
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)

num_labels = y_train.shape[1]

In [42]:
## Create the validation dataset that will be used for X_val and Y_val

# number of observations needs to be a multiple of batch size so we choose number of images = batch_size*618
val_data = create_dataset_for_X_and_y(validation_images_folder, batch_size*618, val_metadata, is_test_data = False)

## Create the X_val array
X_val = np.array(list(val_data['image_array']))
X_val = X_val.astype('float32')/255

## Create the y_val array
y_val = list(val_data['labels_id'])
y_val = [list(set.intersection(set(labels), set(mlb.classes_))) for labels in y_val] # mlb breaks for unseen labels
y_val = mlb.transform(y_val)

In [53]:
## Create the test dataset that will be used for X_test

test_data = create_dataset_for_X_and_y(test_images_folder, 10, test_metadata, is_test_data = True)

## Create the X_val array
X_test = np.array(list(test_data['image_array']))
X_test = X_test.astype('float32')/255

# Build CNN models following below architecture:
## 1) Create first base learner using Xception architecture
## 2) Create second base learner using VGG16 architecture
## 3) Create a blending NN model taking as input the two base learners

### Xception base learner model

In [13]:
## Construct xception model

    ## Load base xception model

base_xception = xception.Xception(weights = 'imagenet', include_top = False, input_shape = X_train.shape[1:])

    ## If we want to freeze layers of base model so that they are not trained
# for layer in base_xception.layers:
#     layer.trainable = False

with tf.device("/cpu:0"):
    # initialize the model
    x = Flatten()(base_xception.output)
    # For a multi label classification, the activation function needs to be "sigmoid"
    predictions = Dense(num_labels, activation = 'sigmoid')(x)
    model_xception = Model(input = base_xception.input, output = predictions)

    # Make sure to keep a non-gpu version of the model so that it can be saved, the gpu and non-gpu model share the same weights
model_xception_gpu = multi_gpu_model(model_xception, gpus = number_of_gpus)

    # For a multi label classification, the loss needs to be "binary_crossentropy"
model_xception_gpu.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [12]:
model_xception_gpu.summary()

In [11]:
## Fit model

## use history = model.fit so you can plot later the loss plot
model_history = model_xception_gpu.fit(X_train, y_train,
                                       batch_size = 16,
                                       epochs= 10,
                                       validation_data = (X_val, y_val),
                                      )

In [15]:
# Plot training accuracy

plt.plot(model_history.history['acc'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

In [338]:
## Save xception model trained

model_xception.save('model_xception.h5')

In [16]:
## Load xception model trained

# model_xception = load_model('model_xception.h5')

# model_xception_gpu = multi_gpu_model(model_xception, gpus = number_of_gpus)

# model_xception_gpu.compile(loss='binary_crossentropy', # try hinge loss function
#                            optimizer=keras.optimizers.Adadelta(),
#                            metrics=['accuracy'])

In [437]:
## Calculate predictions in validation dataset

y_val_pred_xception = model_xception_gpu.predict(X_val)

In [17]:
## Predict labels

threshold = 0.22 # Choose different thresholds
y_val_pred_label_xception = y_val_pred_xception.copy()
y_val_pred_label_xception[y_val_pred_xception>=threshold] = 1
y_val_pred_label_xception[y_val_pred_xception<threshold] = 0

## Calculate F1 score micro average
f1_score_micro_xception = f1_score(y_true=y_val, y_pred=y_val_pred_label_xception, average='micro')
f1_score_micro_xception

In [110]:
## Find the best combination of thresholds that will give you the best F1-score via simulation

thresholds_to_try = np.random.rand(100000, num_labels)
best_thresholds_xception = []
best_f1_score_xception = 0
y_val_pred_label_xception = y_val_pred_xception.copy()

for list_thresholds in thresholds_to_try:
    y_val_pred_label_xception = y_val_pred_xception.copy()
    y_val_pred_label_xception[y_val_pred_xception >= list_thresholds] = 1
    temp_y_val_pred_label_xception[y_val_pred_xception < list_thresholds] = 0
    temp_f1_score = f1_score(y_true=y_val, y_pred=temp_y_val_pred_label_xception, average='micro')
    if temp_f1_score > best_f1_score:
        best_f1_score_xception = temp_f1_score
        best_thresholds_xception = list_thresholds
        y_val_pred_label_xception = temp_y_val_pred_label_xception

In [None]:
## Output best F1 score of Xception base learner

print(best_f1_score_xception)

### VGG16 base learner model

In [19]:
## Build VGG16 model

base_vgg16 = vgg16.VGG16(weights = 'imagenet', include_top = False, input_shape = X_train.shape[1:])

with tf.device("/cpu:0"):
    #initialize the model
    x = Flatten()(base_vgg16.output)
    predictions = Dense(num_labels, activation = 'sigmoid')(x)
    model_vgg16 = Model(input = base_vgg16.input, output = predictions)

model_vgg16_gpu = multi_gpu_model(model_vgg16, gpus = number_of_gpus)

model_vgg16_gpu.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [None]:
model_vgg16_gpu.summary()

In [None]:
## Fit model

model_vgg16_gpu.fit(X_train, y_train,
                     batch_size = 16,
                     epochs= 10,
                     validation_data = (X_val, y_val),
                    )

In [339]:
## Save model trained

model_vgg16.save('model_vgg16.h5')

In [20]:
## Load model

# model_vgg16 = load_model('model_vgg16.h5')

# model_vgg16_gpu = multi_gpu_model(model_vgg16, gpus = number_of_gpus)

# model_vgg16_gpu.compile(loss='binary_crossentropy', # try hinge loss function
#                            optimizer=keras.optimizers.Adadelta(),
#                            metrics=['accuracy'])

In [None]:
## Calculate predictions in validation dataset

y_val_pred_vgg16 = model_vgg16_gpu.predict(X_val)

In [3]:
## Predict labels

threshold = 0.22
y_val_pred_label_vgg16 = y_val_pred_vgg16.copy()
y_val_pred_label_vgg16[y_val_pred_vgg16>=threshold] = 1
y_val_pred_label_vgg16[y_val_pred_vgg16<threshold] = 0

## Calculate F1 score micro average
f1_score_micro_vgg16 = f1_score(y_true=y_val, y_pred=y_val_pred_label_vgg16, average='micro')
f1_score_micro_vgg16

In [None]:
## Output F1 score of VGG16 base learner model

print(f1_score_micro_vgg16)

### Blending model NN using two previous base learners as inputs
#### - The idea is to split the validation dataset into two datasets (50-50)
#### - First split will be used for training the blended model
#### - Second split will be used for validating the blended model

In [None]:
## Construct train and validation for blended model

blended_X_train, blended_X_val, blended_y_train, blended_y_val = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

blended_X_train = np.concatenate((model_xception_gpu.predict(blended_X_train)
                                   , model_vgg16_gpu.predict(blended_X_train))
                                  , axis = 1)
                                  
blended_X_val = np.concatenate((model_xception_gpu.predict(blended_X_val)
                                   , model_vgg16_gpu.predict(blended_X_val))
                                  ,axis = 1)

In [370]:
## Construct CNN model

with tf.device("/cpu:0"):
    #initialize the model
    blended_network = Sequential()
    blended_network.add(Dense(int(num_labels*1.5), input_dim=num_labels*2)) ## Number of neuros is mean of input and output dimension
    blended_network.add(Activation('relu'))
    blended_network.add(Dense(num_labels))
    blended_network.add(Activation('sigmoid'))

blended_network_gpu = multi_gpu_model(blended_network, gpus = number_of_gpus)

blended_network_gpu.compile(loss='binary_crossentropy', # try hinge loss function
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [1]:
blended_network.summary()

In [2]:
## Fit model

blended_network_gpu.fit(blended_X_train, blended_y_train,
                     batch_size = 16,
                     epochs= 40,
                     blended_data = (blended_X_val, blended_y_val),
                    )

In [None]:
## Save the blended model

blended_network.save('blended_network_model.h5')

In [493]:
## Calculate predictions for blended validation dataset

blended_y_val_pred = blended_network_gpu.predict(blended_X_val)

In [3]:
## Predict labels

blended_threshold = 0.325 # Choose different thresholds
blended_y_val_pred_label = blended_y_val_pred.copy()
blended_y_val_pred_label[blended_y_val_pred>=blended_threshold] = 1
blended_y_val_pred_label[blended_y_val_pred<blended_threshold] = 0

## Calculate F1 score micro average
blended_f1_score_micro = f1_score(y_true=blended_y_val, y_pred=blended_y_val_pred_label, average='micro')
blended_f1_score_micro

In [None]:
## Output F1 score of blended model

print(blended_f1_score_micro)

# Calculate predictions on test dataset and output Kaggle submission file

In [503]:
## Construct test dataset for blended model
blended_X_test = np.concatenate((model_xception_gpu.predict(X_test)
                                   , model_vgg16_gpu.predict(X_test))
                                  , axis = 1)

## Calculate predictions with blended model
blended_y_test_pred = blended_network_gpu.predict(blended_X_test)

## Predict labels for test data
blended_y_test_pred_label = blended_y_test_pred.copy()
blended_y_test_pred_label[blended_y_test_pred >= blended_threshold] = 1
blended_y_test_pred_label[blended_y_test_pred < blended_threshold] = 0

blended_y_test_labels = []
for result in blended_y_test_pred_label:
    blended_y_test_labels.append(' '.join(list(mlb.classes_[result == 1])))

blended_test_submission = pd.DataFrame({'image_id': list(test_data['image_id'])
                                , 'label_id': blended_y_test_labels
                               })

blended_test_submission.to_csv('test_submission_05182018_v4.csv')

In [504]:
## Output an example of label predictions from the blended model

print(mlb.classes_[blended_y_test_pred_label[0]==1])
print(mlb.classes_[y_val[0]==1])

['133' '170' '184' '222' '66' '78']
['133' '170' '222' '44' '66']
