### Preprocessing dataset metadata

In [172]:
import pandas as pd
import re

# Column names for every metadata table. Each table is read from
# data/<name>.txt, where the fields of a record are separated by single spaces.
ds_meta_files = {
    'bounding_boxes': ['image_guid', 'x', 'y', 'xh', 'yh'],
    'classes': ['id', 'name'],
    'hierarchy': ['id', 'parent_id'],
    'image_class_labels': ['image_guid', 'class_id'],
    'images': ['image_guid', 'relative_path'],
    'photographers': ['image_guid', 'name'],
    'sizes': ['image_guid', 'width', 'height']
}
# Loaded tables, keyed by the same names as ds_meta_files.
ds_meta = {}

for ds_meta_file, ds_meta_columns in ds_meta_files.items():
    txt_path = 'data/%s.txt' % ds_meta_file
    csv_path = 'data/%s.csv' % ds_meta_file

    with open(txt_path, 'r') as f:
        content = f.read()

    # Only the first (n_columns - 1) spaces of a line separate fields; any
    # further spaces belong to the last column, so they must be left alone.
    # Splitting with a maxsplit does this in a single pass and avoids the
    # regex literal with invalid escape sequences ('\ ' and '\Z').
    max_splits = len(ds_meta_columns) - 1
    content = '\n'.join(
        '|'.join(line.split(' ', max_splits)) for line in content.split('\n')
    )

    # The pipe-separated copy is kept on disk next to the source file.
    with open(csv_path, 'w') as f:
        f.write(content)

    ds_meta[ds_meta_file] = pd.read_csv(csv_path, header=None, names=ds_meta_columns, sep='|')

### Dataset analysis

In [180]:
# Print, for every metadata table, the number of non-null entries in its first column.
for ds_meta_item, ds_meta_frame in ds_meta.items():
    # count() returns a Series labelled by column name, so the first entry has
    # to be taken by position with .iloc; plain [0] on a string-labelled Series
    # relies on a deprecated positional fallback.
    print('Quantity of %s: %s' % (ds_meta_item, ds_meta_frame.count().iloc[0]))

Quantity of bounding_boxes: 48562
Quantity of classes: 1011
Quantity of hierarchy: 1010
Quantity of image_class_labels: 48562
Quantity of images: 48562
Quantity of photographers: 48562
Quantity of sizes: 48562


![](assets/dataset.png)

### Preprocessing dataset

In [175]:
import os
import random
import shutil
import cv2
import numpy as np

def guid_from_filename(filename):
    """Build a dash-separated GUID from the first 32 characters of a filename.

    The leading 32 characters are cut into groups of 8, 4, 4, 4 and 12
    characters and joined with '-'. Anything after the 32nd character
    (for example a file extension) is ignored.

    Raises:
        Exception: if `filename` has fewer than 32 characters.
    """
    if len(filename) >= 32:
        # Consecutive pairs of these offsets delimit the five GUID groups.
        bounds = (0, 8, 12, 16, 20, 32)
        groups = tuple(filename[start:stop] for start, stop in zip(bounds, bounds[1:]))
        return '%s-%s-%s-%s-%s' % groups

    raise Exception('Provided filename %s is too short. Expected at least 32 characters.' % filename)
            
def apply_bounding_box(in_path, out_path, class_subdirs=True):
    """Crop every image under `in_path` to its bounding box and save it under `out_path`.

    The bounding box of an image is looked up in ds_meta['bounding_boxes'] by
    the GUID derived from the image's filename (see guid_from_filename). The
    crop starts at (x, y) and spans `xh` columns and `yh` rows.

    Args:
        in_path: directory holding the input images.
        out_path: directory to write the cropped images to; created if missing.
        class_subdirs: when True, every entry of `in_path` is treated as a
            sub-directory of images and mirrored under `out_path`; when False,
            the images are read directly from `in_path`.

    Raises:
        ValueError: if an image has no bounding box record (or more than one),
            or if an image file cannot be read.
    """
    bbs = ds_meta['bounding_boxes']

    subdirs = os.listdir(in_path) if class_subdirs else ['']

    for subdir in subdirs:
        in_subdir_p = os.path.join(in_path, subdir)
        out_subdir_p = os.path.join(out_path, subdir)
        os.makedirs(out_subdir_p, exist_ok=True)

        for item in os.listdir(in_subdir_p):
            in_item_p = os.path.join(in_subdir_p, item)
            guid = guid_from_filename(item)

            matches = bbs[bbs['image_guid'] == guid]
            if len(matches) != 1:
                raise ValueError('Expected exactly one bounding box for %s (guid %s), found %d.'
                                 % (in_item_p, guid, len(matches)))
            # Work on the single matching row so the fields are scalars;
            # calling int() on a one-element Series is deprecated in pandas.
            bb = matches.iloc[0]

            # cv2.imread signals an unreadable file by returning None instead of raising.
            img = cv2.imread(in_item_p)
            if img is None:
                raise ValueError('Could not read image %s.' % in_item_p)

            x, y = int(bb['x']), int(bb['y'])
            xh, yh = int(bb['xh']), int(bb['yh'])

            # Image arrays are indexed [row, column], i.e. [y, x].
            img_c = img[y:y + yh, x:x + xh]
            cv2.imwrite(os.path.join(out_subdir_p, item), img_c)

def apply_gabor_filter(in_path, out_path, class_subdirs=True):
    """Convolve every image under `in_path` with a Gabor kernel and save it under `out_path`.

    Args:
        in_path: directory holding the input images.
        out_path: directory to write the filtered images to; created if missing.
        class_subdirs: when True, every entry of `in_path` is treated as a
            sub-directory of images and mirrored under `out_path`; when False,
            the images are read directly from `in_path`.

    Raises:
        ValueError: if an image file cannot be read.
    """
    # Positional arguments: ksize=(21, 21), sigma=8.0, theta=pi/4, lambd=10.0,
    # gamma=0.5, psi=0. The kernel is the same for every image, so build it once.
    g_kernel = cv2.getGaborKernel((21, 21), 8.0, np.pi/4, 10.0, 0.5, 0, ktype=cv2.CV_32F)

    subdirs = os.listdir(in_path) if class_subdirs else ['']

    for subdir in subdirs:
        in_subdir_p = os.path.join(in_path, subdir)
        out_subdir_p = os.path.join(out_path, subdir)
        os.makedirs(out_subdir_p, exist_ok=True)

        for item in os.listdir(in_subdir_p):
            in_item_p = os.path.join(in_subdir_p, item)

            # cv2.imread signals an unreadable file by returning None instead of raising.
            img = cv2.imread(in_item_p)
            if img is None:
                raise ValueError('Could not read image %s.' % in_item_p)

            # NOTE(review): the second argument is filter2D's output depth;
            # CV_8UC3 is kept exactly as before - confirm it is the intended value.
            img_f = cv2.filter2D(img, cv2.CV_8UC3, g_kernel)
            cv2.imwrite(os.path.join(out_subdir_p, item), img_f)

def _copy_files(filenames, src_dir, dest_dir):
    """Create `dest_dir` if needed and copy each named file from `src_dir` into it."""
    os.makedirs(dest_dir, exist_ok=True)
    for filename in filenames:
        shutil.copyfile(os.path.join(src_dir, filename), os.path.join(dest_dir, filename))


def apply_tvt_split(path, train=0.7, test=0.3, validation=0.0, class_subdirs=True, seed=None):
    """Split the files under `path` into train / validation / test copies.

    The files are copied (the source directory is left untouched) into the
    sibling directories '<path>_train', '<path>_validation' and '<path>_test'.
    Any of those directories that already exists is deleted first. The split
    is done per sub-directory, so every class keeps the requested proportions.

    Args:
        path: directory holding the files to split.
        train: fraction of each sub-directory's files that goes to the train set.
        test: accepted for backward compatibility but not used; the test set
            always receives whatever is left after train and validation.
        validation: fraction of each sub-directory's files that goes to the
            validation set. The validation directory is only created when at
            least one file lands in it.
        class_subdirs: when True, every entry of `path` is treated as a
            sub-directory of files and mirrored in the outputs; when False,
            the files are read directly from `path`.
        seed: optional seed for a private random generator, making the split
            reproducible. With None the global `random` state is used.

    Raises:
        ValueError: if `train` or `validation` is negative, or if together
            they exceed 1.
    """
    # Validate before touching the filesystem so a bad call cannot wipe an
    # existing split. The small tolerance absorbs float rounding of the sum.
    if train < 0 or validation < 0 or train + validation > 1 + 1e-9:
        raise ValueError('train and validation must be non-negative and sum to at most 1, '
                         'got train=%s, validation=%s.' % (train, validation))

    dir_train_p = '%s_train' % path
    dir_validation_p = '%s_validation' % path
    dir_test_p = '%s_test' % path

    # Start from a clean slate so files of an earlier split cannot leak into this one.
    for stale_dir_p in (dir_train_p, dir_validation_p, dir_test_p):
        if os.path.isdir(stale_dir_p):
            shutil.rmtree(stale_dir_p)

    shuffler = random if seed is None else random.Random(seed)

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # depend only on the random state, not on the filesystem.
    subdirs = sorted(os.listdir(path)) if class_subdirs else ['']

    for subdir in subdirs:
        subdir_p = os.path.join(path, subdir)
        subdir_list = sorted(os.listdir(subdir_p))

        shuffler.shuffle(subdir_list)

        subdir_list_len = len(subdir_list)
        subdir_list_train_thld = int(subdir_list_len * train)
        subdir_list_validation_thld = int(subdir_list_len * validation) + subdir_list_train_thld

        subdir_list_train = subdir_list[:subdir_list_train_thld]
        subdir_list_validation = subdir_list[subdir_list_train_thld:subdir_list_validation_thld]
        subdir_list_test = subdir_list[subdir_list_validation_thld:]

        _copy_files(subdir_list_train, subdir_p, os.path.join(dir_train_p, subdir))

        if len(subdir_list_validation) > 0:
            _copy_files(subdir_list_validation, subdir_p, os.path.join(dir_validation_p, subdir))

        _copy_files(subdir_list_test, subdir_p, os.path.join(dir_test_p, subdir))

In [179]:
# Preprocessing pipeline: every stage reads the directory written by the previous one.
#
#   data/SET_A                    base image set
#   data/SET_A_BB                 after bounding-box cropping
#   data/SET_A_BB_GF              after Gabor filtering
#   data/SET_A_BB_GF_train        \
#   data/SET_A_BB_GF_validation    > after the train/validation/test split
#   data/SET_A_BB_GF_test         /
#
# SET_A_BB and SET_A_BB_GF were marked "(del)" in the original notes, presumably
# meaning they are intermediate results that may be deleted afterwards; nothing
# in this cell removes them.
set_a_dir = 'data/SET_A'
set_a_bb_dir = 'data/SET_A_BB'
set_a_bb_gf_dir = 'data/SET_A_BB_GF'

apply_bounding_box(set_a_dir, set_a_bb_dir)
apply_gabor_filter(set_a_bb_dir, set_a_bb_gf_dir)
apply_tvt_split(set_a_bb_gf_dir)

### Building a simple perceptron with Keras

In [181]:
import os

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense

In [182]:
batch_size = 16

# One class per entry of the training directory. Sorted so that the
# class-index assignment does not depend on the order os.listdir returns.
train_labels = sorted(os.listdir('data/SET_A_BB_GF_train'))

# Rescale pixel values to [0, 1] and augment with random shear, zoom and flips.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

train_generator = train_datagen.flow_from_directory(
    'data/SET_A_BB_GF_train',
    target_size=(150, 150),
    batch_size=batch_size,
    classes=train_labels)

model = Sequential()
model.add(Flatten(input_shape=(150, 150, 3)))
# Without a non-linear activation the stacked Dense layers collapse into a
# single linear map, so the hidden layers would add no capacity.
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(train_labels)))
# Each image has exactly one class and the loss is categorical cross-entropy,
# so the outputs must form a probability distribution: softmax, not sigmoid.
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# One epoch is one pass over the images the generator actually found,
# rather than a hard-coded sample count.
steps_per_epoch = max(1, train_generator.samples // batch_size)

model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=50)

Found 2103 images belonging to 50 classes.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50

KeyboardInterrupt: 

### References

* https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html
* BSIF (Binarized Statistical Image Features): http://www.ee.oulu.fi/~jkannala/bsif/bsif.pdf