# todo
- multiprocessing


In [9]:
from collections import defaultdict
from scipy import ndimage as ndi
from skimage import feature
from PIL import Image as IMG
import numpy as np
import pandas as pd 
import operator
import cv2
import os 
import gc
from IPython.core.display import HTML 
from IPython.display import Image
from tqdm import tqdm
tqdm.pandas()

from myutils import timer, reduce_mem_usage

# global vars
images_path = '../input/ants/'

## const.
features_path = '../features/'
input_path = '../input/'



  0%|                                                                        | 1224/2011862 [01:40<41:45:19, 13.38it/s]

In [2]:
# define functions
def check_imgpath(img):
    return os.path.isfile(img)

def load_image(img, usecv2=False):
    path = img
    try:
        im = cv2.imread(path) if usecv2==True else IMG.open(path)
    except Exception as e:
        print('Cannot open img: ', img)    
    return im

def crop_horizontal(im):
    # cut the images into two halves as complete average may give bias results
    size = im.size
    halves = (size[0]/2, size[1]/2)
    im1 = im.crop((0, 0, size[0], halves[1]))
    im2 = im.crop((0, halves[1], size[0], size[1]))
    return im1, im2

def color_analysis(img):
    # obtain the color palatte of the image 
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1

    # sort the colors present in the image 
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse = True)
    light_shade, dark_shade, shade_count, pixel_limit = 0, 0, 0, 25
    for i, x in enumerate(sorted_x[:pixel_limit]):
        if all(xx <= 20 for xx in x[0][:3]): ## dull : too much darkness 
            dark_shade += x[1]
        if all(xx >= 240 for xx in x[0][:3]): ## bright : too much whiteness 
            light_shade += x[1]
        shade_count += x[1]
        
    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

def perform_color_analysis_black(img):
    if check_imgpath(img) ==False:
        return -1

    im = load_image(img)
    im1, im2 = crop_horizontal(im)

    try:
        light_percent1, dark_percent1 = color_analysis(im1)
        light_percent2, dark_percent2 = color_analysis(im2)
    except Exception as e:
        print('Calculation error')
        return None

    dark_percent = (dark_percent1 + dark_percent2)/2 
    return dark_percent

def perform_color_analysis_white(img):
    if check_imgpath(img) ==False:
        return -1

    im = load_image(img)
    im1, im2 = crop_horizontal(im)

    try:
        light_percent1, dark_percent1 = color_analysis(im1)
        light_percent2, dark_percent2 = color_analysis(im2)
    except Exception as e:
        print('Calculation error')
        return None

    light_percent = (light_percent1 + light_percent2)/2 
    return light_percent

def average_pixel_width(img):
    if check_imgpath(img) ==False:
        return -1

    im = load_image(img)    
    im_array = np.asarray(im.convert(mode='L'))
    edges_sigma1 = feature.canny(im_array, sigma=3)
    apw = (float(np.sum(edges_sigma1)) / (im.size[0]*im.size[1]))
    return apw*100

def get_dominant_color(img):
    if check_imgpath(img) ==False:
        return [-1, -1, -1]

    im = load_image(img, usecv2=True)
    arr = np.float32(im)
    pixels = arr.reshape((-1, 3))

    n_colors = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)
    quantized = palette[labels.flatten()]
    quantized = quantized.reshape(im.shape)

    dominant_color = palette[np.argmax(np.unique(labels, return_counts=True)[1: ])]
    return dominant_color

def get_average_color(img):
    if check_imgpath(img) ==False:
        return [-1, -1, -1]

    im = load_image(img, usecv2=True)
    average_color = [im[:, :, i].mean() for i in range(im.shape[-1])]
    return average_color

def get_size(img):
    if check_imgpath(img) ==False:
        return -1

    st = os.stat(img)
    return st.st_size

def get_dimensions(img):
    if check_imgpath(img) ==False:
        return [-1, -1]

    im = load_image(img)
    img_size = im.size
    return img_size

def get_blurrness_score(img):
    if check_imgpath(img) ==False:
        return -1

    im = load_image(img, usecv2=True)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    fm = cv2.Laplacian(im, cv2.CV_64F).var()
    return fm

def get_imagefeatures(features, imgcol, prefix=''):
    with timer('Image features extraction'):
        prefix = prefix + '_'
        features[prefix+'dullness'] = features[imgcol].progress_apply(perform_color_analysis_black)
        features[prefix+'whiteness'] = features[imgcol].progress_apply(perform_color_analysis_white)
        features[prefix+'average_pixel_width'] = features[imgcol].progress_apply(average_pixel_width)
        #features[prefix+'dominant_color'] = features[imgcol].progress_apply(get_dominant_color)
        #features[prefix+'dominant_red'] = features[prefix+'dominant_color'].progress_apply(lambda x: x[0]) / 255
        #features[prefix+'dominant_green'] = features[prefix+'dominant_color'].progress_apply(lambda x: x[1]) / 255
        #features[prefix+'dominant_blue'] = features[prefix+'dominant_color'].progress_apply(lambda x: x[2]) / 255
        #features.drop(prefix+'dominant_color', axis=1, inplace=True)
        features[prefix+'average_color'] = features[imgcol].progress_apply(get_average_color)
        features[prefix+'average_red'] = features[prefix+'average_color'].progress_apply(lambda x: x[0]) / 255
        features[prefix+'average_green'] = features[prefix+'average_color'].progress_apply(lambda x: x[1]) / 255
        features[prefix+'average_blue'] = features[prefix+'average_color'].progress_apply(lambda x: x[2]) / 255
        features.drop(prefix+'average_color', axis=1, inplace=True)
        features[prefix+'image_size'] = features[imgcol].progress_apply(get_size)
        features[prefix+'temp_size'] = features[imgcol].progress_apply(get_dimensions)
        features[prefix+'width'] = features[prefix+'temp_size'].progress_apply(lambda x : x[0])
        features[prefix+'height'] = features[prefix+'temp_size'].progress_apply(lambda x : x[1])
        features.drop(prefix+'temp_size', axis=1, inplace=True)
        features[prefix+'blurrness'] = features[imgcol].progress_apply(get_blurrness_score)
    
    gc.collect()
    return features

def save_features(features, filename):
    filepath = features_path + filename
    features.to_feather(filepath)


In [22]:
# Example: Calculate features
imgs = os.listdir(images_path)
features = pd.DataFrame()
features['imagepath'] = imgs
features['imagepath'] = features['imagepath'].progress_apply(lambda x: images_path+x)
features['imagepath'][0] = ""

features = get_imagefeatures(features, imgcol='imagepath', prefix='debug')

gc.collect()
features.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 13.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 13.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 19.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 208.11it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 11029.73it/s]
100%|███████████████████████████████████

[Image features extraction] done in 2 s


Unnamed: 0,imagepath,debug_dullness,debug_whiteness,debug_average_pixel_width,debug_average_red,debug_average_green,debug_average_blue,debug_image_size,debug_width,debug_height,debug_blurrness
0,,-1.0,-1.0,-1.0,-0.003922,-0.003922,-0.003922,-1,-1,-1,-1.0
1,../input/ants/11381045_b352a47d8c.jpg,0.0,0.0,1.934535,0.196553,0.399769,0.424831,36120,500,333,891.795181
2,../input/ants/119785936_dd428e40c3.jpg,0.0,3.235,2.52515,0.345592,0.450838,0.500565,57087,500,334,193.591465
3,../input/ants/147542264_79506478c2.jpg,0.0,4.22,2.411797,0.287481,0.561964,0.439412,54044,400,462,132.693869
4,../input/ants/17081114_79b9a27724.jpg,0.0,7.53,3.8992,0.555366,0.587042,0.596563,94240,375,500,10261.052169


In [4]:
# Example: Save featuers
features.drop('imagepath', axis=1, inplace=True)
save_features(features, filename='debug.feather')


In [33]:
trlist = os.listdir('../input/train_jpg/')
telist = os.listdir('../input/test_jpg/')

trlist = [str(t).split('.')[-1] for t in trlist]
telist = [str(t).split('.')[-1] for t in telist]

print(set(trlist))
print(set(telist))

{'jpg'}
{'jpg'}


In [3]:
train = pd.read_csv('../input/train.csv', usecols=['image'])
test = pd.read_csv('../input/test.csv', usecols=['image'])
lentrain = train.shape[0]

train['image'] = train['image'].fillna('')
test['image'] = test['image'].fillna('')
train['image'] = train['image'].apply(lambda x: input_path+'train_jpg/'+str(x)+'.jpg')
test['image'] = test['image'].apply(lambda x: input_path+'test_jpg/'+str(x)+'.jpg')

test.head()

Unnamed: 0,image
0,../input/test_jpg/a8b57acb5ab304f9c331ac7a0742...
1,../input/test_jpg/.jpg
2,../input/test_jpg/8c361112cb049745ef2d1b0ae735...
3,../input/test_jpg/.jpg
4,../input/test_jpg/bc3cf6deef10840fc302e38eb48f...


In [4]:
debugtrain = pd.DataFrame()
debugtrain['image'] = list(train.image[:5])
debugtest = pd.DataFrame()
debugtest['image'] = list(test.image[:5])
debugtrain = get_imagefeatures(debugtrain, imgcol='image', prefix='train')
debugtest = get_imagefeatures(debugtest, imgcol='image', prefix='test')

debugtrain.head()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 16.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 19.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 21.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 250.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5014.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5013.51it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5014.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5014.71it/s]
100%|███████████████████████████████████

[Image features extraction] done in 1 s


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 21.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 21.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 30.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 334.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]
100%|███████████████████████████████████

[Image features extraction] done in 1 s


Unnamed: 0,image,train_dullness,train_whiteness,train_average_pixel_width,train_average_red,train_average_green,train_average_blue,train_image_size,train_width,train_height,train_blurrness
0,../input/train_jpg/d10c7e016e03247a3bf2d13348f...,0.0,1.965,2.328911,0.360951,0.320277,0.612833,27039,358,480,398.428606
1,../input/train_jpg/79c9392cc51a9c81c6eb91eceb8...,6.335,0.0,3.273727,0.297394,0.366578,0.422213,30385,360,480,1014.477548
2,../input/train_jpg/b7f250ee3f39e1fedd77c141f27...,0.0,72.02,2.649518,0.703338,0.703571,0.703576,18681,392,360,493.921065
3,../input/train_jpg/e6ef97e0725637ea84e3d203e82...,0.0,94.325,1.54784,0.851712,0.846464,0.846093,13656,360,360,377.127677
4,../input/train_jpg/54a687a3a0fc1d68aed99bdaaf5...,7.715,1.32,2.469618,0.449327,0.512993,0.485592,36710,640,360,557.361892


In [5]:
debugtest.head()

Unnamed: 0,image,test_dullness,test_whiteness,test_average_pixel_width,test_average_red,test_average_green,test_average_blue,test_image_size,test_width,test_height,test_blurrness
0,../input/test_jpg/a8b57acb5ab304f9c331ac7a0742...,25.64,23.38,3.840856,0.408582,0.418109,0.416671,41520,480,360,313.459142
1,../input/test_jpg/.jpg,-1.0,-1.0,-1.0,-0.003922,-0.003922,-0.003922,-1,-1,-1,-1.0
2,../input/test_jpg/8c361112cb049745ef2d1b0ae735...,93.715,6.285,4.925781,0.341105,0.302781,0.27744,61898,640,360,755.577396
3,../input/test_jpg/.jpg,-1.0,-1.0,-1.0,-0.003922,-0.003922,-0.003922,-1,-1,-1,-1.0
4,../input/test_jpg/bc3cf6deef10840fc302e38eb48f...,53.82,0.0,3.765625,0.233821,0.273105,0.361182,49484,480,360,344.7516


In [6]:
del debugtrain, debugtest;gc.collect()

lentrain

1503424

In [7]:
df = pd.concat([train, test])
del train, test; gc.collect()
df.shape


(2011862, 1)

In [8]:
df = get_imagefeatures(df, imgcol='image', prefix='imgfeatures_')


  0%|                                                                        | 1223/2011862 [01:26<41:45:19, 13.38it/s]

KeyboardInterrupt: 