## Library Import

In [1]:
import pandas as pd
import numpy as np

# for NN composition (TODO)
#from keras import backend as K
#from keras.models import Sequential
#from keras.layers import Conv2D, Dropout, BatchNormalization, Dense, MaxPool2D, Flatten
#from keras.callbacks import EarlyStopping, ModelCheckpoint

# for image preprocessing/plotting
import matplotlib.pyplot as plt
import skimage 
from scipy import ndimage, misc

# for os manipulations
import os
import errno
import shutil

%matplotlib inline

# Data Import

In [2]:
# data folder paths
dir_train = os.path.join('data', 'train')
dir_test = os.path.join('data', 'test')

# collect train metadata
train_metadata = []

for root, subFolders, files in os.walk(dir_train):
    for sf in subFolders:
        print('reading files from {}...'.format(sf))
        for f_name in os.listdir(os.path.join(dir_train, sf)):
            if not f_name.startswith('.'):
                temp = misc.imread(os.path.join(dir_train, sf, f_name)) # read image
                # collect image metadata
                image_metadata = []
                image_metadata.extend([sf, f_name])
                image_metadata.extend( list(temp.shape) if len(temp.shape) == 3\
                                      else [temp.shape[0], temp.shape[1], 1])
                image_metadata.extend([temp.nbytes, temp.dtype])
                # append image metadata to list
                train_metadata.append(image_metadata)

reading files from 001.ak47...
reading files from 002.american-flag...
reading files from 003.backpack...
reading files from 004.baseball-bat...
reading files from 005.baseball-glove...
reading files from 006.basketball-hoop...
reading files from 007.bat...
reading files from 008.bathtub...
reading files from 009.bear...
reading files from 010.beer-mug...
reading files from 011.billiards...
reading files from 012.binoculars...
reading files from 013.birdbath...
reading files from 014.blimp...
reading files from 015.bonsai-101...
reading files from 016.boom-box...
reading files from 017.bowling-ball...
reading files from 018.bowling-pin...
reading files from 019.boxing-glove...
reading files from 020.brain-101...
reading files from 021.breadmaker...
reading files from 022.buddha-101...
reading files from 023.bulldozer...
reading files from 024.butterfly...
reading files from 025.cactus...
reading files from 026.cake...
reading files from 027.calculator...
reading files from 028.camel...

In [3]:
# make it more tidy-looking
train_metadata = pd.DataFrame(train_metadata)
train_metadata.columns = ['category','img_name','height','width','channels','byte_size','bit_depth']

# add some features =============================================================================
# select integer target (1-257)
train_metadata['target'] = train_metadata['category'].str.split('.').apply(lambda x: int(x[0]))
# image extension
train_metadata['img_extension'] = train_metadata['img_name'].str.split('.').apply(lambda x: x[1])
# just name without category indexes
train_metadata['category_name'] = \
    train_metadata['category'].str.split('.').apply(lambda x: x[1]).str.lower()
# img resolution
train_metadata['img_resolution'] = (
    train_metadata['width'].astype(str) + 'x' + train_metadata['height'].astype(str)
).astype('category')
# img shape/orientation
train_metadata['img_shape'] = train_metadata.apply(lambda r:\
    'horizontal' if r['height'] < r['width']\
    else 'vertical' if r['height'] > r['width']\
    else 'squared', axis=1)
# img size class (small/medium/big)
train_metadata['size'] = train_metadata.apply(lambda r:\
    'small' if r['height']*r['width'] <= 256**2\
    else 'medium' if r['height']*r['width'] <= 512**2\
    else 'big', axis=1)

# let's have a look at it now
train_metadata.head(5)

Unnamed: 0,category,img_name,height,width,channels,byte_size,bit_depth,target,img_extension,category_name,img_resolution,img_shape,size
0,001.ak47,001_0001.jpg,278,499,3,416166,uint8,1,jpg,ak47,499x278,horizontal,medium
1,001.ak47,001_0002.jpg,218,268,3,175272,uint8,1,jpg,ak47,268x218,horizontal,small
2,001.ak47,001_0003.jpg,186,300,3,167400,uint8,1,jpg,ak47,300x186,horizontal,small
3,001.ak47,001_0004.jpg,185,250,3,138750,uint8,1,jpg,ak47,250x185,horizontal,small
4,001.ak47,001_0005.jpg,200,380,3,228000,uint8,1,jpg,ak47,380x200,horizontal,medium


## Exploratory Data Analysis

In [4]:
# check for NaNs / missing values - all looks good
train_metadata.isnull().sum()

category          0
img_name          0
height            0
width             0
channels          0
byte_size         0
bit_depth         0
target            0
img_extension     0
category_name     0
img_resolution    0
img_shape         0
size              0
dtype: int64

In [5]:
# check for 1-channel (gray) images: there are about 300 grayscale images (1.3%)
train_metadata.channels.value_counts()

3    22599
1      298
Name: channels, dtype: int64

In [6]:
# check for file extensions/bit depth - only .JPG, 8 bit per channel (0-255)
print(train_metadata.img_extension.value_counts())
print(train_metadata.bit_depth.value_counts())

jpg    22897
Name: img_extension, dtype: int64
uint8    22897
Name: bit_depth, dtype: int64


In [7]:
# check for class balance (for ideal balance -> 100%/257 = 0.389%)
print('most popular class - {}: {:.2f}%'.format(
    train_metadata.target.value_counts(normalize=True).argmax(),
    train_metadata.target.value_counts(normalize=True).max()*100
))
print('least popular class - {}: {:.2f}%'.format(
    train_metadata.target.value_counts(normalize=True).argmin(),
    train_metadata.target.value_counts(normalize=True).min()*100
))

most popular class - 257: 3.48%
least popular class - 99: 0.22%


In [8]:
# top-5 common resolutions (in %)
train_metadata.img_resolution.value_counts(normalize=True)[:5]*100

640x480    2.432633
200x200    1.528585
300x225    0.952090
300x300    0.873477
400x300    0.860375
Name: img_resolution, dtype: float64

In [9]:
# image orientation (in %)
train_metadata.img_shape.value_counts(normalize=True)*100

horizontal    62.484168
vertical      29.650173
squared        7.865659
Name: img_shape, dtype: float64

In [10]:
# image size category (in %)
train_metadata['size'].value_counts(normalize=True)*100

medium    46.599991
small     39.118662
big       14.281347
Name: size, dtype: float64

### Naive Image Handler

In [11]:
def make_sure_path_exists(path):
    if os.path.exists(path):
        shutil.rmtree(path, ignore_errors=True)
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

def data_handler(path_to_folder,
                 subfolders=False,
                 transformation_list=None, # TODO
                 output_dimension=(256, 256),
                 channels=3,
                 save_as_path=None):
    
    # create folder for transformed data if not exists
    if save_as_path is not None:
        make_sure_path_exists(save_as_path)
    
    # for all images - process them and save
    if subfolders: # subfolders for each img class
        for root, subFolders, files in os.walk(path_to_folder):
            for sf in subFolders:
                print('Transforming files from {}...'.format(sf))
                for f_name in os.listdir(os.path.join(path_to_folder, sf)):
                    if not f_name.startswith('.'):
                        temp = misc.imread(os.path.join(path_to_folder, sf, f_name)) # read image
                        # if it's grayscale (1 channel) - then naively "extend" it to 3 (24bit depth)
                        if len(temp.shape) == 2:
                            temp = np.stack((temp,)*3)
                        # save image...
                        misc.imsave(
                            os.path.join(save_as_path, f_name),
                            arr=misc.imresize(temp, size=output_dimension, interp='bicubic')
                        )
    else: # flattened structure
        print('Transforming files from {}...'.format(path_to_folder))
        for f_name in os.listdir(path_to_folder):
            if not f_name.startswith('.'):
                temp = misc.imread(os.path.join(path_to_folder, f_name)) # read image
                # if it's grayscale (1 channel) - then naively "extend" it to 3 (24bit depth)
                if len(temp.shape) == 2:
                    temp = np.stack((temp,)*3)
                # save image...
                misc.imsave(
                    os.path.join(save_as_path, f_name),
                    arr=misc.imresize(temp, size=output_dimension, interp='bicubic')
                )

In [12]:
%%time
# directory to hold flattened data (without subfolders)
dir_train_transformed = os.path.join('data', 'train_transformed')

# transform data and re-save it in transformed folder
data_handler(
    path_to_folder=dir_train,
    subfolders=True,
    output_dimension=(64, 64),
    save_as_path=dir_train_transformed
)

Transforming files from 001.ak47...
Transforming files from 002.american-flag...
Transforming files from 003.backpack...
Transforming files from 004.baseball-bat...
Transforming files from 005.baseball-glove...
Transforming files from 006.basketball-hoop...
Transforming files from 007.bat...
Transforming files from 008.bathtub...
Transforming files from 009.bear...
Transforming files from 010.beer-mug...
Transforming files from 011.billiards...
Transforming files from 012.binoculars...
Transforming files from 013.birdbath...
Transforming files from 014.blimp...
Transforming files from 015.bonsai-101...
Transforming files from 016.boom-box...
Transforming files from 017.bowling-ball...
Transforming files from 018.bowling-pin...
Transforming files from 019.boxing-glove...
Transforming files from 020.brain-101...
Transforming files from 021.breadmaker...
Transforming files from 022.buddha-101...
Transforming files from 023.bulldozer...
Transforming files from 024.butterfly...
Transforming

# Naive Classification

### Load data

In [13]:
%%time
# load data in flattened form

# img parameters
img_shape = (64, 64)
channels = 3
img_shape_flattened = img_shape[0] * img_shape[1] * channels
img_qty = len(train_metadata)
img_dtype = np.int8

# initialize X,y
X = np.empty(shape=(img_qty, img_shape_flattened), dtype=img_dtype)
y = np.empty(shape=(img_qty,), dtype=np.uint16)

# read images
for i,f_name in enumerate(os.listdir(dir_train_transformed)):
    if i % 5000 == 0:
        print('{:6d}/{:6d} images loaded'.format(i, img_qty))
    
    img_path = os.path.join(dir_train_transformed, f_name)
    X[i, :] = misc.imread(img_path).flatten('C') # since img is np.ndarray, flatten in row-style
    y[i] = train_metadata[train_metadata.img_name == f_name].target

     0/ 22897 images loaded
  5000/ 22897 images loaded
 10000/ 22897 images loaded
 15000/ 22897 images loaded
 20000/ 22897 images loaded
Wall time: 47 s


In [14]:
# check memory consumption
print('X size in Mb: {:.2f}'.format(X.nbytes/2**20))
print('y size in Mb: {:.2f}'.format(y.nbytes/2**20))
# check shapes
print(X.shape, y.shape)

X size in Mb: 268.32
y size in Mb: 0.04
(22897, 12288) (22897,)


### Train/validation split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size=0.4,
                                                  stratify=y, # to preserve class balance
                                                  random_state=42 # for reproducibility
                                                 )
# drop redundant matrices
del X, y

### Model Preparation

In [26]:
%%time
from sklearn.linear_model import LogisticRegression  # baseline classifier
from sklearn.preprocessing import StandardScaler # for data standartization ~ N(0,1)
from sklearn.decomposition import PCA # to reduce training columns
from sklearn.pipeline import Pipeline # to "glue" model steps/components together
from sklearn.metrics import accuracy_score # to test quality of the classifier


# define components
sc = StandardScaler()
pca = PCA(n_components=50, random_state=42)
svc = LogisticRegression(random_state=42, 
                         C=0.1, 
                         max_iter=50, 
                         verbose=2, 
                         n_jobs=-1,
                         class_weight='balanced'
                        )

# create model
model = Pipeline(
    (
        ('scaler', sc), # data standartization ~ N(0,1)
        ('dim_reduction', pca), # firstly - reduce dimension 
        ('classifier', svc) # then - classify (multiclass)
    )
)

# train model
model.fit(X_train, y_train)
print('Accuracy on train: {}'.format(accuracy_score(y_train, model.predict(X_train))))
# predict validation
print('Accuracy on validation: {}'.format(accuracy_score(y_val, model.predict(X_val))))



[LibLinear]Accuracy on train: 0.1313873926335711
Accuracy on validation: 0.06594606398078393
Wall time: 2min 34s


### Test Data Preparation

In [21]:
%%time
# directory to hold flattened test data (without subfolders)
dir_test_transformed = os.path.join('data', 'test_transformed')

# transform test data and re-save it in transformed folder
data_handler(
    path_to_folder=dir_test,
    subfolders=False,
    output_dimension=(64,64),
    save_as_path=dir_test_transformed
)

Transforming files from data\test...
Wall time: 46.7 s


In [22]:
# form X_test matrix
test_img_qty = len(os.listdir(dir_test_transformed))
X_test = np.empty(shape=(test_img_qty, img_shape_flattened), dtype=img_dtype)

# read images
for i,f_name in enumerate(os.listdir(dir_test_transformed)):
    if i % 2500 == 0:
        print('{:6d}/{:6d} images loaded'.format(i, test_img_qty))
    
    img_path = os.path.join(dir_test_transformed, f_name)
    X_test[i, :] = misc.imread(img_path).flatten('C') # since img is np.ndarray, flatten in row-style

     0/  7710 images loaded
  2500/  7710 images loaded
  5000/  7710 images loaded
  7500/  7710 images loaded


### Make Prediction

In [27]:
# make prediction
y_test_pred = model.predict(X_test)

# create submission dataframe (in specified kaggle format)
sub = pd.DataFrame(
    data=[row for row in zip(os.listdir(dir_test_transformed), y_test_pred.astype(int).tolist())],
    columns=['image', 'class']
)

# save it to .csv
sub.to_csv('lr_baseline.csv', encoding='utf-8', index=False)

