In [28]:
from collections import defaultdict
from scipy.stats import itemfreq
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from skimage import feature
from PIL import Image as IMG
import numpy as np
import pandas as pd 
import operator
import cv2
import os 

from IPython.core.display import HTML 
from IPython.display import Image

# Directory holding the training JPEGs. The default is the location used when
# this notebook was first run; set the AVITO_IMAGES_PATH environment variable
# to point somewhere else without editing the notebook.
# Joining with "" guarantees a trailing path separator, because code elsewhere
# in the notebook builds file paths as `images_path + filename`.
images_path = os.path.join(
    os.environ.get(
        "AVITO_IMAGES_PATH",
        "/Users/um003580/projects/kaggle/avito/input/train_jpg_0/",
    ),
    "",
)
imgs = os.listdir(images_path)

# One row per file found in the directory; feature columns are added to this
# frame by later cells.
features = pd.DataFrame({'image': imgs})
features.head()

Unnamed: 0,image
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a...
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c379092...
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac...
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04...
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855...


In [29]:
# Summarise the feature table: row count, column dtypes and memory footprint.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278166 entries, 0 to 278165
Data columns (total 1 columns):
image    278166 non-null object
dtypes: object(1)
memory usage: 2.1+ MB


In [30]:
def color_analysis(img, pixel_limit=25, dark_max=20, light_min=240):
    """Measure how much of an image's dominant palette is near-white or near-black.

    Only the ``pixel_limit`` most frequent distinct pixel values are examined,
    and both percentages are relative to the number of pixels covered by those
    values (not to the total pixel count of the image).

    Parameters
    ----------
    img : object with a ``getdata()`` method
        ``getdata()`` must return an iterable of pixel values. A pixel is
        either a tuple of channel values (only the first three are inspected)
        or a bare number, as produced by single-band images; a bare number is
        treated as a one-channel colour.
    pixel_limit : int, optional
        How many of the most frequent colours to inspect.
    dark_max : int, optional
        A colour counts as dark when every inspected channel is <= this value.
    light_min : int, optional
        A colour counts as light when every inspected channel is >= this value.

    Returns
    -------
    (light_percent, dark_percent) : tuple of float
        Percentages rounded to two decimals. ``(0.0, 0.0)`` is returned when
        the image contains no pixels.
    """
    # Tally how often each distinct pixel value occurs.
    palette = defaultdict(int)
    for pixel in img.getdata():
        palette[pixel] += 1

    # Most frequent colours first; only the leading `pixel_limit` are analysed.
    dominant = sorted(palette.items(), key=operator.itemgetter(1), reverse=True)[:pixel_limit]

    light_shade = dark_shade = shade_count = 0
    for colour, count in dominant:
        # Single-band images yield bare ints rather than tuples; wrap them so
        # the per-channel threshold tests below work instead of raising.
        channels = colour[:3] if isinstance(colour, (tuple, list)) else (colour,)
        if all(value <= dark_max for value in channels):  # dull: too much darkness
            dark_shade += count
        if all(value >= light_min for value in channels):  # bright: too much whiteness
            light_shade += count
        shade_count += count

    # An image with no pixels (e.g. a zero-height crop) has nothing to measure;
    # report zero instead of dividing by zero.
    if shade_count == 0:
        return 0.0, 0.0

    light_percent = round((float(light_shade) / shade_count) * 100, 2)
    dark_percent = round((float(dark_shade) / shade_count) * 100, 2)
    return light_percent, dark_percent

In [31]:
def perform_color_analysis(img, flag):
    """Return the dark or light percentage of one image file.

    The image is split into a top and a bottom half, each half is passed to
    ``color_analysis`` and the two results are averaged (the original author's
    rationale: an average over the complete image may give biased results).

    Parameters
    ----------
    img : str
        File name of the image, resolved against the notebook-level
        ``images_path``.
    flag : str
        ``'black'`` to get the averaged dark percentage, ``'white'`` to get
        the averaged light percentage.

    Returns
    -------
    float or None
        The requested percentage; ``None`` when ``flag`` is neither
        ``'black'`` nor ``'white'`` or when ``color_analysis`` raises.

    Notes
    -----
    Errors raised while opening or decoding the file are not caught here.
    """
    path = os.path.join(images_path, img)

    # The context manager closes the underlying file handle; without it one
    # handle per image stays open when this runs over a whole directory.
    with IMG.open(path) as im:
        # Grayscale / CMYK / palette images do not yield (R, G, B) pixel
        # tuples, so normalise to RGB before counting colours.
        rgb = im if im.mode == "RGB" else im.convert("RGB")

        width, height = rgb.size
        # Floor division keeps the crop box integral (plain `/` yields floats).
        mid_row = height // 2
        top_half = rgb.crop((0, 0, width, mid_row))
        bottom_half = rgb.crop((0, mid_row, width, height))

        try:
            light_top, dark_top = color_analysis(top_half)
            light_bottom, dark_bottom = color_analysis(bottom_half)
        except Exception:
            # Deliberate best-effort: one unanalysable image must not abort a
            # multi-hour apply over the whole data set.
            return None

    light_percent = (light_top + light_bottom) / 2
    dark_percent = (dark_top + dark_bottom) / 2

    if flag == 'black':
        return dark_percent
    if flag == 'white':
        return light_percent
    return None

In [80]:
def extract_size(img):
    """Return the on-disk size, in bytes, of the image file named ``img``.

    The file name is appended to the notebook-level ``images_path`` string to
    form the full path.
    """
    return os.path.getsize(images_path + img)

In [None]:
# tqdm.pandas() registers `progress_apply` on pandas objects so the long-running
# apply below reports progress.
from tqdm import tqdm
tqdm.pandas()
# 'dullness' is the averaged dark-pixel percentage of each image (None when the
# analysis fails). The recorded run took about 4.5 hours for 278,166 images.
features['dullness'] = features['image'].progress_apply(lambda x : perform_color_analysis(x, 'black'))

100%|██████████| 278166/278166 [4:36:19<00:00, 16.64it/s]  


In [75]:
# Rank images from highest to lowest dullness, then look at the bottom of the
# ranking (the least dull images).
topdull = features.sort_values('dullness', ascending = False)
topdull.tail(5)

Unnamed: 0,image,dullness
113677,c69d6e79327d1ed1055d91bc7a4d8c1cffbb650084522fabe4fbddd173a83e64.jpg,0.0
113678,bdbea3e68f5bb8a46158b0053217e816ec76552bb78607f749a986e5933377b1.jpg,0.0
113679,20b25362c501e18560a31810d360cef124c6bf2c25eca8ebfe813fb34af99b05.jpg,0.0
113680,59de230d71716b289d305f1cdbcbb8a531f69d06dbec7fd32b866b9750d24839.jpg,0.0
278165,f6ed0f43be37261fe6184eb8f07bf1d76165a33df2bffd52498ba27c254bb1e3.jpg,0.0


In [72]:
# Look up the image file name stored under row label 36487.
topdull.query('index==36487').image

36487    ca2476d3a9122bdb08538c1ff3755f1eea174d592ac31603d552e6891b86e5af.jpg
Name: image, dtype: object

In [70]:
# Allow up to 80 characters per column so the long image file names are shown
# in full instead of being truncated with "...".
pd.set_option("display.max_colwidth", 80)

In [78]:
# Spot-check the on-disk size (in bytes) of one zero-dullness image.
# images_path already ends with '/', so the leading '/' in the file name
# produces a double slash in the path; the recorded run shows it still resolves.
print(os.path.getsize(images_path + '/c69d6e79327d1ed1055d91bc7a4d8c1cffbb650084522fabe4fbddd173a83e64.jpg'))

60306


In [82]:
# File size in bytes for every image; extract_size is passed directly since it
# already takes the file name as its only argument.
features['size'] = features['image'].progress_apply(extract_size)



  0%|          | 0/278166 [00:00<?, ?it/s][A[A

  0%|          | 478/278166 [00:00<00:58, 4757.16it/s][A[A

  0%|          | 845/278166 [00:00<01:03, 4368.15it/s][A[A

  0%|          | 1277/278166 [00:00<01:03, 4353.37it/s][A[A

  1%|          | 1723/278166 [00:00<01:03, 4379.58it/s][A[A

  1%|          | 2138/278166 [00:00<01:04, 4307.16it/s][A[A

  1%|          | 2572/278166 [00:00<01:03, 4311.83it/s][A[A

  1%|          | 3016/278166 [00:00<01:03, 4349.28it/s][A[A

  1%|▏         | 3571/278166 [00:00<00:59, 4646.79it/s][A[A

  1%|▏         | 4033/278166 [00:00<00:59, 4638.02it/s][A[A

  2%|▏         | 4545/278166 [00:01<00:57, 4769.18it/s][A[A

  2%|▏         | 5039/278166 [00:01<00:56, 4818.79it/s][A[A

  2%|▏         | 5638/278166 [00:01<00:53, 5118.99it/s][A[A

  2%|▏         | 6276/278166 [00:01<00:49, 5438.06it/s][A[A

  2%|▏         | 6826/278166 [00:01<00:53, 5039.84it/s][A[A

  3%|▎         | 7440/278166 [00:01<00:50, 5324.90it/s][A[A

  3%|

 43%|████▎     | 119428/278166 [00:13<00:09, 17075.28it/s][A[A

 44%|████▎     | 121140/278166 [00:13<00:09, 16821.85it/s][A[A

 44%|████▍     | 122826/278166 [00:13<00:09, 16771.98it/s][A[A

 45%|████▍     | 124556/278166 [00:13<00:09, 16924.54it/s][A[A

 45%|████▌     | 126251/278166 [00:13<00:09, 16774.86it/s][A[A

 46%|████▌     | 127966/278166 [00:13<00:08, 16883.98it/s][A[A

 47%|████▋     | 129656/278166 [00:14<00:09, 16244.69it/s][A[A

 47%|████▋     | 131315/278166 [00:14<00:08, 16345.35it/s][A[A

 48%|████▊     | 133013/278166 [00:14<00:08, 16528.74it/s][A[A

 48%|████▊     | 134786/278166 [00:14<00:08, 16870.38it/s][A[A

 49%|████▉     | 136508/278166 [00:14<00:08, 16973.27it/s][A[A

 50%|████▉     | 138344/278166 [00:14<00:08, 17365.49it/s][A[A

 50%|█████     | 140156/278166 [00:14<00:07, 17584.13it/s][A[A

 51%|█████     | 141919/278166 [00:14<00:07, 17356.30it/s][A[A

 52%|█████▏    | 143658/278166 [00:14<00:07, 17194.09it/s][A[A

 52%|█████

In [83]:
# Preview the first ten rows with the newly added 'dullness' and 'size' columns.
# (The cell previously held the incomplete expression `features.`, which is a
# syntax error; the recorded output shows rows 0-9.)
features.head(10)

Unnamed: 0,image,dullness,size
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,61.565,31533
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,84.270,32325
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,30.185,35761
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,0.000,78887
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,38.270,36678
5,61b407b43978f90609db5b6a48d55333398aba100b3dacad7cfa91b746b2e7a7.jpg,50.000,38842
6,defec5b12169a830f6958af1801a79277ea4839143e668b8b8c76af87da27fe0.jpg,0.000,33858
7,532ab2a59c747e23b5d9f3308c0dccbbb458650fcae538892ca05cc2751942fd.jpg,0.000,31485
8,8ef80e090272458bdca3704a5e53a2bf94d57be5d36c7302472199a6cfc24d76.jpg,0.000,56500
9,dc8be85be59d1f2e48613733bf1d79087a3132c94678815f6cd68b1ad02ea175.jpg,0.000,26833


In [86]:
# Rank images by file size, largest first; the tail shows the five smallest files.
topsize = features.sort_values('size', ascending = False)
topsize.tail(5)

Unnamed: 0,image,dullness,size
210862,c06eb832491f185b89ca551848289457d122bab78b3f3088ef8b84a5907db9bb.jpg,0.0,1524
252544,3b744be50ae6619a8c6ca769d6d98913cbd7e2120f27152056dd6804031227db.jpg,0.0,1484
84398,e9088247624a33792dbb29c4ce150bb6108c3ed22f7087f0b8b502a9d0572983.jpg,0.0,1462
24917,b4d9887b7239303eca77aaff49ffa42f3a2511729a22b9f22dd16bdd115ca43d.jpg,0.0,1408
202529,8b2855782b344605d7a2a3006395ed066ca8e5b24b77137268d9b4e4d897be9b.jpg,0.0,1060


In [87]:
# The five largest files by on-disk size.
topsize.head(5)

Unnamed: 0,image,dullness,size
90095,4e62dd16fcff67dcfa993c0b7fdb296cacf4585769e782fb1c8a7025f5ef6f52.jpg,28.38,151868
7372,6737a6daaafe7b0c5b9aa4649f3ac8e6c357b234bee1b497d6efd6e5cd15a9df.jpg,0.0,148820
68783,0d95687c981988d112a336dbe62e4b38f448fa246c9f01fc479c13afc924b586.jpg,0.0,145864
43296,8c67544118c7267ab162add52d759cff2f65acca3cbc2efee7ce5368a0113851.jpg,0.0,140123
204031,8cba19095491c9769830650aed77632b1b7ef9f8e1e4c44012384eb6addaae0d.jpg,0.0,139848
