# iMaterialist Challenge at FGVC 2017

Assign accurate description labels to images of apparel products.

Links:
    [iMaterialist Challenge at FGVC 2017](https://www.kaggle.com/c/imaterialist-challenge-FGVC2017)

In [106]:
import os
import json
from pprint import pprint
import numpy as np
import cv2
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

Read task and labels from json files.

In [82]:
data_dir = '/home/khan/workspace/ml_ws/datasets/imat_dataset/'

def read_data():
    task_json = os.path.join(data_dir,'fgvc4_iMat.task_map.json')
    label_json = os.path.join(data_dir,'fgvc4_iMat.label_map.json')
    train_json = os.path.join(data_dir,'fgvc4_iMat.train.data.json')
    val_json = os.path.join(data_dir,'fgvc4_iMat.validation.data.json')
    test_json = os.path.join(data_dir,'fgvc4_iMat.test.image.json')

    with open(task_json) as task_json_file:    
        task_json = json.load(task_json_file)

    with open(label_json) as label_json_file:    
        label_json = json.load(label_json_file)
    
    with open(train_json) as train_json_file:    
        train_json = json.load(train_json_file)
        
    with open(val_json) as val_json_file:    
        val_json = json.load(val_json_file)
        
    with open(test_json) as test_json_file:    
        test_json = json.load(test_json_file)
        
    return task_json,label_json,train_json,val_json,test_json

task_json,label_json,train_json,val_json,test_json = read_data()

## All task-label pair for an image per example

Get task and label list for each image in train and validation dataset.

In [80]:
# get task and label pairs for each image idx
def get_task_labels(lst):
    img_tasks_labels = {}
    for row in lst:
        if (row['imageId'] not in img_tasks_labels):       
            task_label_set = {'dummy'}
            for row2 in lst:
                if (row2['imageId'] == row['imageId']):
                    task_label_set.add(str(row2['taskId']+'_'+row2['labelId']))   
            task_label_set.remove('dummy')
        img_tasks_labels[row['imageId']] = task_label_set
    return img_tasks_labels

    
# train_task_label = get_task_labels(train_json['annotations'])
val_task_label = get_task_labels(val_json['annotations'])
print("validation has task and label pairs:\t", len(val_task_label))

validation has task and label pairs:	 8432


Remove bad ids from task and labels

In [81]:
def remove_bad_ids(bad_file, dict_task_label, data_type):
    bad_file = open(bad_file, "r")
    bad_ids = bad_file.read().split('\n')
    print('bad ', data_type,' files:\t',len(bad_ids))
    print(data_type,' task-pair before bad file removal:\t',len(dict_task_label))

    for x in bad_ids:
        try:
            del dict_task_label[x]
        except:
            print('value not found.')
    print(data_type,' task-pair after bad file removal:\t',len(dict_task_label))
    
val_task_label = remove_bad_ids(os.path.join(data_dir,'bad_val.txt'), val_task_label, 'validation')
# train_task_label = remove_bad_ids(os.path.join(data_dir,'bad_train.txt'), train_task_label, 'train')
# test_task_label = remove_bad_ids(os.path.join(data_dir,'bad_test.txt'), test_task_label, 'test')


bad  validation  files:	 299
validation  task-pair before bad file removal:	 8432
value not found.
validation  task-pair after bad file removal:	 8134


Generate output variable train_Y and val_Y containing one hot encoded labels and save them as numpy array.

In [51]:
# train_tl = train_task_label.values()
# mlb_tr = MultiLabelBinarizer()
# train_Y = mlb_tr.fit_transform(train_tl)

val_tl = val_task_label.values()
mlb_val = MultiLabelBinarizer()
val_Y = mlb_val.fit_transform(val_tl)

# print('train_Y dimensions:\t', train_Y.shape)
print('val_Y dimensions:\t', val_Y.shape)

# np.save('data/train_Y.npy', train_Y)
np.save('data/val_Y.npy', val_Y)

val_Y dimensions:	 (8134, 576)


Remove bad ids i.e. images with no data

In [None]:
# train_dir = os.path.join(data_dir,'train_images/')
# val_dir = os.path.join(data_dir,'val_images/')
# test_dir = os.path.join(data_dir,'test_images/')

# def remove_bad_ids():
#     # Read images ids and remove bad data\
#     # train data
#     train_ids = [f for f in os.listdir(train_dir) if f.endswith('.jpg')]
#     train_ids = [os.path.splitext(f)[0] for f in train_ids]

#     train_bad_file = os.path.join(data_dir,'bad_train.txt')
#     train_bad_file = open(train_bad_file, "r")
#     train_bad_ids = train_bad_file.read().split('\n')

#     train_ids = [x for x in train_ids if x not in train_bad_ids]
    
#     # validation data    
#     val_ids = [f for f in os.listdir(val_dir) if f.endswith('.jpg')]
#     val_ids = [os.path.splitext(f)[0] for f in val_ids]

#     val_bad_file = os.path.join(data_dir,'bad_val.txt')
#     val_bad_file = open(val_bad_file, "r")
#     val_bad_ids = val_bad_file.read().split('\n')

#     val_ids = [x for x in val_ids if x not in val_bad_ids]
    
#     # test data
#     test_ids = [f for f in os.listdir(test_dir) if f.endswith('.jpg')]
#     test_ids = [os.path.splitext(f)[0] for f in test_ids]

#     test_bad_file = os.path.join(data_dir,'bad_test.txt')
#     test_bad_file = open(test_bad_file, "r")
#     test_bad_ids = test_bad_file.read().split('\n')

#     test_ids = [x for x in test_ids if x not in test_bad_ids]
    
#     print('training data:\t', len(train_ids))
#     print('validation data:\t', len(val_ids))
#     print('test data:\t', len(test_ids))
#     return train_ids, val_ids, test_ids

# train_ids, val_ids, test_ids = remove_bad_ids()

In [None]:
# test_count = len(test_ids)
# task_count = len(task_json['taskInfo'])
# label_count = len(label_json['labelInfo'])

# train_Y = np.zeros((len(train_json['annotations']),task_count*label_count))
# val_Y = np.zeros((len(val_json['annotations']),task_count*label_count))

# train_image_ids = []
# idx = 0
# for label in train_json['annotations']:
#     task_id = int(label['taskId'])-1
#     label_id = int(label['labelId'])-1
#     train_Y[idx][task_id*label_id] = 1
#     train_image_ids.append(label['imageId'])
#     idx += 1
    
# val_image_ids = []
# idx = 0
# for label in val_json['annotations']:
#     task_id = int(label['taskId'])-1
#     label_id = int(label['labelId'])-1
#     val_Y[idx][task_id*label_id] = 1
#     val_image_ids.append(label['imageId'])
#     idx += 1
    
# print('train_Y dimensions:\t', train_Y.shape)
# print('val_Y dimensions:\t', val_Y.shape)

# np.save('data/train_Y.npy', train_Y)
# np.save('data/val_Y-.npy', val_Y)

Read images in train_X, val_X and test_X

In [69]:
IMG_SIZE_PX = 100

def load_images(img_dir, img_ids):
    X = []
    for i, image_name in enumerate(img_ids):
        if i % 1000 == 0:
            print(i)
        img = cv2.imread(os.path.join(img_dir,image_name +'.jpg'))
        img = cv2.resize(img, (IMG_SIZE_PX, IMG_SIZE_PX), interpolation = cv2.INTER_AREA)
        X.append(img)
    return X

In [70]:
train_dir = os.path.join(data_dir,'train_images/')
val_dir = os.path.join(data_dir,'val_images/')
test_dir = os.path.join(data_dir,'test_images/')
IMG_SIZE_PX = 100
 
# train_X = load_images(train_dir, train_ids)
# np.save('data/train_X-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,3), train_X)
# print('training data saved.')

val_X = load_images(val_dir, val_task_label.keys())
np.save('data/val_X-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,3), val_X)
print('validation data saved.')

# test_X = load_images(test_dir, test_ids)
# np.save('data/test_X-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,3), test_X)
# print('testing data saved.')

0
1000
2000
3000
4000
5000
6000
7000
8000
validation data saved.


Load saved images

In [None]:
# train_X = np.load('train_X-100-100-3.npy')
val_X = np.load('data/val_X-100-100-3.npy')
# test_X = np.load('test_X-100-100-3.npy')

In [None]:
img = cv2.imread('data/scene.png')
resized_image = cv2.resize(img, (100, 100), interpolation = cv2.INTER_AREA)

## One task-label pair per example

Get task and label list for each image in train and validation dataset.

In [114]:
# get task and label pairs for each image idx and remove bad_ids
def get_task_labels(lst, bad_file, data_type):
    bad_file = open(bad_file, "r")
    bad_ids = bad_file.read().split('\n')
    print('bad', data_type,'files:\t',len(bad_ids))
    print(data_type,'task-pair before bad file removal:\t',len(lst))
    
    img_ids = []
    img_tasks_labels = []
    for row in lst:
        if (row['imageId'] not in bad_ids):
            img_ids.append(row['imageId'])
            img_tasks_labels.append(row['taskId']+'_'+row['labelId'])
    print(data_type, "after bad file removal:\t", len(img_tasks_labels))
    return img_ids, img_tasks_labels

    
train_ids, train_task_label = get_task_labels(train_json['annotations'], os.path.join(data_dir,'bad_train.txt'), 'train')
# val_ids, val_task_label = get_task_labels(val_json['annotations'], os.path.join(data_dir,'bad_val.txt'), 'validation')
# test_ids, test_task_label = get_task_labels(test_json['annotations'], os.path.join(data_dir,'bad_test.txt'), 'test')

bad train files:	 1636
train task-pair before bad file removal:	 62088
train after bad file removal:	 59885


Generate output variable train_Y and val_Y containing one hot encoded labels and save them as numpy array.

In [115]:
lb_tr = LabelBinarizer()
train_Y = lb_tr.fit_transform(train_task_label)
print('train_Y dimensions:\t', train_Y.shape)
np.save('data/train_Y.npy', train_Y)

# lb_val = LabelBinarizer()
# val_Y = lb_val.fit_transform(val_task_label)
# print('val_Y dimensions:\t', val_Y.shape)
# np.save('data/val_Y.npy', val_Y)

train_Y dimensions:	 (59885, 576)


Read images in train_X, val_X and test_X

In [108]:
IMG_SIZE_PX = 100

def load_images(img_dir, img_ids):
    X = []
    for i, image_name in enumerate(img_ids):
        if i % 1000 == 0:
            print(i)
        img = cv2.imread(os.path.join(img_dir,image_name +'.jpg'))
        img = cv2.resize(img, (IMG_SIZE_PX, IMG_SIZE_PX), interpolation = cv2.INTER_AREA)
        X.append(img)
    return X

In [116]:
train_dir = os.path.join(data_dir,'train_images/')
val_dir = os.path.join(data_dir,'val_images/')
test_dir = os.path.join(data_dir,'test_images/')
IMG_SIZE_PX = 100
 
# train_X = load_images(train_dir, train_ids)
# np.save('data/train_X-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,3), train_X)
# print('training data saved.')

# val_X = load_images(val_dir, val_ids)
# np.save('data/val_X-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,3), val_X)
# print('validation data saved.')

# test_X = load_images(test_dir, test_ids)
# np.save('data/test_X-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,3), test_X)
# print('testing data saved.')

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
training data saved.
