In [9]:
from collections import defaultdict
from scipy.stats import itemfreq
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from skimage import feature
from PIL import Image as IMG
import numpy as np
import pandas as pd 
import operator
import cv2
import os 

from IPython.core.display import HTML 
from IPython.display import Image

import multiprocessing
from multiprocessing import Pool

images_path = "/Users/um003580/projects/kaggle/avito/input/train_jpg_0/"
imgs = os.listdir(images_path)

features = pd.DataFrame()
features['image'] = imgs
features.head()

Unnamed: 0,image
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg


In [3]:
pd.set_option("display.max_colwidth", 80)

In [4]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278166 entries, 0 to 278165
Data columns (total 1 columns):
image    278166 non-null object
dtypes: object(1)
memory usage: 2.1+ MB


In [5]:
def color_analysis(img):
    # obtain the color palatte of the image 
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1
    
    # sort the colors present in the image 
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse = True)
    light_shade, dark_shade, shade_count, pixel_limit = 0, 0, 0, 25
    for i, x in enumerate(sorted_x[:pixel_limit]):
        if all(xx <= 20 for xx in x[0][:3]): ## dull : too much darkness 
            dark_shade += x[1]
        if all(xx >= 240 for xx in x[0][:3]): ## bright : too much whiteness 
            light_shade += x[1]
        shade_count += x[1]
        
    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

In [6]:
def perform_color_analysis(img, flag):
    path = images_path + img 
    im = IMG.open(path) #.convert("RGB")
    
    # cut the images into two halves as complete average may give bias results
    size = im.size
    halves = (size[0]/2, size[1]/2)
    im1 = im.crop((0, 0, size[0], halves[1]))
    im2 = im.crop((0, halves[1], size[0], size[1]))

    try:
        light_percent1, dark_percent1 = color_analysis(im1)
        light_percent2, dark_percent2 = color_analysis(im2)
    except Exception as e:
        return None

    light_percent = (light_percent1 + light_percent2)/2 
    dark_percent = (dark_percent1 + dark_percent2)/2 
    
    if flag == 'black':
        return dark_percent
    elif flag == 'white':
        return light_percent
    else:
        return None

In [16]:
num_partitions = 4
num_cores = multiprocessing.cpu_count()

def parallelize_dataframe(df, func):
    a,b,c,d = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, [a,b,c,d]))
    pool.close()
    pool.join()
    return df

In [17]:
num_cores

4

In [7]:
from tqdm import tqdm
tqdm.pandas()

In [18]:
def score_dullness(data):
    data['dullness'] = data['image'].progress_apply(lambda x : perform_color_analysis(x, 'black'))   
    return data

In [None]:
features = parallelize_dataframe(features, score_dullness)
features.head()

 69%|██████▉   | 48203/69541 [3:07:30<2:30:24,  2.36it/s]      

In [75]:
topdull = features.sort_values('dullness', ascending = False)
topdull.tail(5)

Unnamed: 0,image,dullness
113677,c69d6e79327d1ed1055d91bc7a4d8c1cffbb650084522fabe4fbddd173a83e64.jpg,0.0
113678,bdbea3e68f5bb8a46158b0053217e816ec76552bb78607f749a986e5933377b1.jpg,0.0
113679,20b25362c501e18560a31810d360cef124c6bf2c25eca8ebfe813fb34af99b05.jpg,0.0
113680,59de230d71716b289d305f1cdbcbb8a531f69d06dbec7fd32b866b9750d24839.jpg,0.0
278165,f6ed0f43be37261fe6184eb8f07bf1d76165a33df2bffd52498ba27c254bb1e3.jpg,0.0


In [93]:
features['size'] = features['image'].progress_apply(lambda x : extract_size(x))



  0%|          | 0/278168 [00:00<?, ?it/s][A[A

  0%|          | 230/278168 [00:00<02:01, 2296.76it/s][A[A

  0%|          | 542/278168 [00:00<01:51, 2492.80it/s][A[A

  0%|          | 877/278168 [00:00<01:42, 2698.06it/s][A[A

  0%|          | 1234/278168 [00:00<01:35, 2907.92it/s][A[A

  1%|          | 1598/278168 [00:00<01:29, 3092.60it/s][A[A

  1%|          | 1965/278168 [00:00<01:25, 3243.11it/s][A[A

  1%|          | 2272/278168 [00:00<01:28, 3134.17it/s][A[A

  1%|          | 2609/278168 [00:00<01:26, 3197.01it/s][A[A

  1%|          | 2938/278168 [00:00<01:25, 3217.96it/s][A[A

  1%|          | 3255/278168 [00:01<01:26, 3176.52it/s][A[A

  1%|▏         | 3583/278168 [00:01<01:25, 3204.31it/s][A[A

  1%|▏         | 3938/278168 [00:01<01:23, 3298.96it/s][A[A

  2%|▏         | 4326/278168 [00:01<01:19, 3453.89it/s][A[A

  2%|▏         | 4673/278168 [00:01<01:19, 3423.49it/s][A[A

  2%|▏         | 5027/278168 [00:01<01:20, 3410.70it/s][A[A

  2%|▏

 81%|████████  | 224279/278168 [00:27<00:04, 11102.76it/s][A[A

 81%|████████  | 225420/278168 [00:27<00:05, 10273.42it/s][A[A

 82%|████████▏ | 226907/278168 [00:28<00:04, 11322.91it/s][A[A

 82%|████████▏ | 228376/278168 [00:28<00:04, 12157.15it/s][A[A

 83%|████████▎ | 229649/278168 [00:28<00:03, 12278.23it/s][A[A

 83%|████████▎ | 230987/278168 [00:28<00:03, 12588.12it/s][A[A

 84%|████████▎ | 232397/278168 [00:28<00:03, 13005.15it/s][A[A

 84%|████████▍ | 233722/278168 [00:28<00:03, 11413.01it/s][A[A

 84%|████████▍ | 235026/278168 [00:28<00:03, 11854.89it/s][A[A

 85%|████████▌ | 236590/278168 [00:28<00:03, 12781.99it/s][A[A

 86%|████████▌ | 237918/278168 [00:28<00:03, 12431.26it/s][A[A

 86%|████████▌ | 239198/278168 [00:29<00:03, 11924.74it/s][A[A

 86%|████████▋ | 240421/278168 [00:29<00:03, 11891.49it/s][A[A

 87%|████████▋ | 241632/278168 [00:29<00:03, 11737.64it/s][A[A

 87%|████████▋ | 243180/278168 [00:29<00:02, 12395.23it/s][A[A

 88%|█████

In [94]:
features.head()

Unnamed: 0,image,size
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,31533
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,32325
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,35761
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,78887
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,36678


In [95]:
topsize = features.sort_values('size', ascending = False)

In [96]:
topsize.tail(5)

Unnamed: 0,image,size
84398,e9088247624a33792dbb29c4ce150bb6108c3ed22f7087f0b8b502a9d0572983.jpg,1462
24917,b4d9887b7239303eca77aaff49ffa42f3a2511729a22b9f22dd16bdd115ca43d.jpg,1408
202531,8b2855782b344605d7a2a3006395ed066ca8e5b24b77137268d9b4e4d897be9b.jpg,1060
159067,4f029e2a00e892aa2cac27d98b52ef8b13d91471f613c8d3c38e3f29d4da0b0c.jpg,0
93595,8513a91e55670c709069b5f85e12a59095b802877715903abef16b7a6f306e58.jpg,0


In [97]:
topsize.head(5)

Unnamed: 0,image,size
90095,4e62dd16fcff67dcfa993c0b7fdb296cacf4585769e782fb1c8a7025f5ef6f52.jpg,151868
7372,6737a6daaafe7b0c5b9aa4649f3ac8e6c357b234bee1b497d6efd6e5cd15a9df.jpg,148820
68783,0d95687c981988d112a336dbe62e4b38f448fa246c9f01fc479c13afc924b586.jpg,145864
43296,8c67544118c7267ab162add52d759cff2f65acca3cbc2efee7ce5368a0113851.jpg,140123
204033,8cba19095491c9769830650aed77632b1b7ef9f8e1e4c44012384eb6addaae0d.jpg,139848


In [98]:
features.to_csv('train_jpg_0_size.csv')