In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 28 16:29:22 2018

@author: ldong
"""

from PIL import Image
from zipfile import ZipFile
import matplotlib.pyplot as plt
from skimage import feature
from multiprocessing import Pool
import numpy as np
import pandas as pd
from collections import defaultdict
import operator
import os, time
from scipy import ndimage as ndi
import cv2
os.environ['OMP_NUM_THREADS'] = '40'

In [2]:
num_cores = 8
num_partitions = num_cores

import gc

In [3]:
images_path = '../input/train_jpg_0'

files = os.listdir(images_path)
feats = pd.DataFrame({'image':files[:2000]})

In [4]:
def parallelize_dataframe(df, func):
    '''
    paralell processing
    '''
    df_list = np.array_split(df, num_partitions)
    pool = Pool(processes=num_cores)
    df = pd.concat(pool.map(func, df_list))
    pool.close()
    pool.join()
    return df

In [5]:
def parallelize_list(ls, func):
    ls_list = np.array_split(ls, num_partitions)
    pool = Pool(processes=num_cores)
    ls = np.concatenate(pool.map(func,ls_list))
    pool.close()
    pool.join()
    return ls

In [6]:
def load_img(img_names):
    imgs = []
    for i in img_names:
        with open(images_path+'/'+i,'rb') as f:
            tmp = Image.open(f)
            imgs.append(tmp.copy()) 
            f.close()
    return imgs

In [7]:
def flatten(l):
    try:
        return flatten(l[0]) + (flatten(l[1:]) if len(l) > 1 else []) if type(l) is list else [l]
    except IndexError:
        return []

In [8]:
ls_list = np.array_split(feats.image, num_partitions)
pool = Pool(processes=num_cores)
imgs = flatten(pool.map(load_img,ls_list))
feats['img_mat'] = imgs

In [9]:
gc.collect()

14

### Dullness

In [10]:
def color_analysis(img):
#    img = Image.open(images_path+'/'+img)
    
    # obtain the color palatte of the image 
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1
    
    # sort the colors present in the image 
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse = True)
    light_shade, dark_shade, shade_count, pixel_limit = 0, 0, 0, 25
    for i, x in enumerate(sorted_x[:pixel_limit]):
        if all(xx <= 20 for xx in x[0][:3]): ## dull : too much darkness 
            dark_shade += x[1]
        if all(xx >= 240 for xx in x[0][:3]): ## bright : too much whiteness 
            light_shade += x[1]
        shade_count += x[1]
        
    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

def perform_color_analysis(im, flag):
#    im = Image.open(images_path+'/'+im)
    
    # cut the images into two halves as complete average may give bias results
    size = im.size
    halves = (size[0]/2, size[1]/2)
    im1 = im.crop((0, 0, size[0], halves[1]))
    im2 = im.crop((0, halves[1], size[0], size[1]))

    try:
        light_percent1, dark_percent1 = color_analysis(im1)
        light_percent2, dark_percent2 = color_analysis(im2)
    except Exception as e:
        return None
    
    light_percent = (light_percent1 + light_percent2)/2 
    dark_percent = (dark_percent1 + dark_percent2)/2 
    
    if flag == 'black':
        return dark_percent
    elif flag == 'white':
        return light_percent
    else:
        return None

### Uniformness

In [11]:
def average_pixel_width(im):
    im_array = np.asarray(im.convert(mode='L'))
    edges_sigma1 = feature.canny(im_array, sigma=3)
    apw = (float(np.sum(edges_sigma1)) / (im.size[0]*im.size[1]))
    return apw*100

In [12]:
def get_dominant_color(img):
    img = np.float32(img)
    pixels = img.reshape((-1, 3))

    n_colors = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)
    quantized = palette[labels.flatten()]
    quantized = quantized.reshape(img.shape)

    dominant_color = palette[np.argmax(np.unique(labels)[1])]
    return dominant_color

In [13]:
def get_average_color(img): 
    img = np.float32(img)
    average_color = [img[:, :, i].mean() for i in range(img.shape[-1])]
    return average_color

In [14]:
def getSize(filename):
    filename = images_path + '/' + filename
    st = os.stat(filename)
    return st.st_size


In [15]:
def getDimensions(image):
    return image.size

In [16]:
def get_blurrness(image):
    image = np.float32(image)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    fm = cv2.Laplacian(image, cv2.CV_32F).var()
    return fm

In [17]:
def score_feats(data):
    data['dullness'] = data['img_mat'].apply(lambda x : perform_color_analysis(x, 'black'))
    data['whiteness'] = data['img_mat'].apply(lambda x : perform_color_analysis(x, 'white'))   
    data['apw'] = data['img_mat'].apply(lambda x : average_pixel_width(x))   
#     data['dominant_color'] = data['img_mat'].apply(lambda x : get_dominant_color(x))
    data['average_color'] = data['img_mat'].apply(lambda x : get_average_color(x))   
    data['size'] = data['image'].apply(getSize)   
    data['dim'] = data['img_mat'].apply(getDimensions)      
    data['blurrness'] = data['img_mat'].apply(lambda x: get_blurrness(x))   
    return data

In [18]:
start = time.time()
feats = parallelize_dataframe(feats, score_feats)
print("Feature creation time: %0.2f Minutes"%((time.time() - start)/60))

Feature creation time: 1.43 Minutes


In [19]:
gc.collect()

0

In [20]:
feats.head()

Unnamed: 0,image,img_mat,dullness,whiteness,apw,average_color,size,dim,blurrness
0,657ab4edbc0afc5b3d4ca734ab995d338fa6d726a1b3c2...,<PIL.Image.Image image mode=RGB size=480x360 a...,9.28,18.6,2.704861,"[103.723145, 92.70534, 87.92938]",37196,"(480, 360)",248.50618
1,0dd8c8d13d4be982b0341a484ad1cfae4212a3bc481bc0...,<PIL.Image.Image image mode=RGB size=480x360 a...,0.0,50.0,3.644097,"[162.63553, 167.62607, 183.44379]",26286,"(480, 360)",577.679504
2,c96081c247df86f9749c1571a5ac9c5ef551e156343563...,<PIL.Image.Image image mode=RGB size=640x360 a...,94.66,0.0,3.437066,"[94.5495, 98.09114, 92.416916]",67852,"(640, 360)",1219.286987
3,0abf6d598659d5d6169871b00970eb393ffe7d5d865252...,<PIL.Image.Image image mode=RGB size=480x360 a...,3.785,10.78,5.652199,"[122.74912, 138.33653, 127.19715]",67164,"(480, 360)",1802.658936
4,64f32128404401ae8ced3ebd451dd30b29be2ca6d5f730...,<PIL.Image.Image image mode=RGB size=270x480 a...,0.0,0.0,1.750772,"[88.18731, 129.46936, 160.99812]",34600,"(270, 480)",264.471283


In [21]:
feats.describe()

Unnamed: 0,dullness,whiteness,apw,size,blurrness
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,11.874273,14.322235,2.8163,37937.374,810.225216
std,22.576197,27.817311,1.502814,16053.144325,1062.095273
min,0.0,0.0,0.052126,3038.0,35.251225
25%,0.0,0.0,1.744925,26162.0,275.58905
50%,0.0,0.0,2.529173,36200.0,511.719788
75%,12.78125,12.73,3.593097,47688.75,936.872955
max,100.0,100.0,10.847222,120676.0,18602.091797


In [22]:
img_df = feats.drop(['img_mat'], axis= 1)

In [23]:
img_df.to_csv('img_df_0')

In [26]:
del img_df
gc.collect()

0