In [10]:
from collections import defaultdict
from scipy.stats import itemfreq
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from skimage import feature
from PIL import Image as IMG
import numpy as np
import pandas as pd 
import operator
import cv2
import os 

from IPython.core.display import HTML 
from IPython.display import Image

import multiprocessing
from multiprocessing import Pool

images_path = "/Users/um003580/projects/kaggle/avito/input/train_jpg_4/"

In [11]:
imgs = os.listdir(images_path)

features = pd.DataFrame()
features['image'] = imgs
features.head()

Unnamed: 0,image
0,b85f7e2088321d34aff002e5dd6cde4dce5b186a2f048feff606141e4b0ab500.jpg
1,15a19f383570561ce90d110afc0bd7f13123cf885d3f72a9028b5da6e99ecfed.jpg
2,6addef7755e5ae0305d0da5c684c8e819ac176a0f6cc826bca7aae2918bba643.jpg
3,611d12b4e684a4b889fe38e6a44a0ef44c09495688e4768d9bc55b9b9f792fa4.jpg
4,e4043a70d6eac75f031c44ea97557a5d7e6297d962adbb74c799cf9b56e69218.jpg


In [12]:
pd.set_option("display.max_colwidth", 80)

In [13]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278167 entries, 0 to 278166
Data columns (total 1 columns):
image    278167 non-null object
dtypes: object(1)
memory usage: 2.1+ MB


In [14]:
def color_analysis(img):
    # obtain the color palatte of the image 
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1
    
    # sort the colors present in the image 
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse = True)
    light_shade, dark_shade, shade_count, pixel_limit = 0, 0, 0, 25
    for i, x in enumerate(sorted_x[:pixel_limit]):
        if all(xx <= 20 for xx in x[0][:3]): ## dull : too much darkness 
            dark_shade += x[1]
        if all(xx >= 240 for xx in x[0][:3]): ## bright : too much whiteness 
            light_shade += x[1]
        shade_count += x[1]
        
    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

In [15]:
def perform_color_analysis(img, flag):
    path = images_path + img 
    im = IMG.open(path) #.convert("RGB")
    
    # cut the images into two halves as complete average may give bias results
    size = im.size
    halves = (size[0]/2, size[1]/2)
    im1 = im.crop((0, 0, size[0], halves[1]))
    im2 = im.crop((0, halves[1], size[0], size[1]))

    try:
        light_percent1, dark_percent1 = color_analysis(im1)
        light_percent2, dark_percent2 = color_analysis(im2)
    except Exception as e:
        return None

    light_percent = (light_percent1 + light_percent2)/2 
    dark_percent = (dark_percent1 + dark_percent2)/2 
    
    if flag == 'black':
        return dark_percent
    elif flag == 'white':
        return light_percent
    else:
        return None

In [16]:
num_partitions = 4
num_cores = multiprocessing.cpu_count()

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [17]:
num_cores

4

In [18]:
from tqdm import tqdm
tqdm.pandas()

In [19]:
def score_dullness(data):
    data['dullness'] = data['image'].progress_apply(lambda x : perform_color_analysis(x, 'black'))   
    return data
def score_whiteness(data):
    data['whiteness'] = data['image'].progress_apply(lambda x : perform_color_analysis(x, 'white'))   
    return data

In [None]:
features = parallelize_dataframe(features, score_dullness)
features.head()

 14%|█▍        | 9952/69542 [22:20<2:09:17,  7.68it/s]]

In [20]:
topdull = features.sort_values('dullness', ascending = False)
topdull.tail(5)

Unnamed: 0,image,dullness
113677,c69d6e79327d1ed1055d91bc7a4d8c1cffbb650084522fabe4fbddd173a83e64.jpg,0.0
113678,bdbea3e68f5bb8a46158b0053217e816ec76552bb78607f749a986e5933377b1.jpg,0.0
113679,20b25362c501e18560a31810d360cef124c6bf2c25eca8ebfe813fb34af99b05.jpg,0.0
113680,59de230d71716b289d305f1cdbcbb8a531f69d06dbec7fd32b866b9750d24839.jpg,0.0
278165,f6ed0f43be37261fe6184eb8f07bf1d76165a33df2bffd52498ba27c254bb1e3.jpg,0.0


In [22]:
features.to_csv('train_jpg_0.csv')

In [21]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278167 entries, 0 to 278166
Data columns (total 1 columns):
image    278167 non-null object
dtypes: object(1)
memory usage: 2.1+ MB


In [None]:
features = parallelize_dataframe(features, score_whiteness)

100%|██████████| 69542/69542 [3:10:52<00:00,  6.07it/s]      
100%|██████████| 69542/69542 [3:10:53<00:00,  8.86it/s]
100%|██████████| 69541/69541 [3:10:56<00:00, 12.16it/s]
100%|██████████| 69541/69541 [3:10:59<00:00, 14.66it/s]


In [26]:
features.head(30)

Unnamed: 0,image,dullness,whiteness
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,61.565,0.0
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,84.27,0.0
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,30.185,4.405
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,0.0,60.445
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,38.27,0.0
5,61b407b43978f90609db5b6a48d55333398aba100b3dacad7cfa91b746b2e7a7.jpg,50.0,0.0
6,defec5b12169a830f6958af1801a79277ea4839143e668b8b8c76af87da27fe0.jpg,0.0,0.0
7,532ab2a59c747e23b5d9f3308c0dccbbb458650fcae538892ca05cc2751942fd.jpg,0.0,0.0
8,8ef80e090272458bdca3704a5e53a2bf94d57be5d36c7302472199a6cfc24d76.jpg,0.0,11.755
9,dc8be85be59d1f2e48613733bf1d79087a3132c94678815f6cd68b1ad02ea175.jpg,0.0,4.41


In [49]:
features.to_csv('train_jpg_0.csv')

In [28]:
def average_pixel_width(img):
    path = images_path + img 
    im = IMG.open(path)    
    im_array = np.asarray(im.convert(mode='L'))
    edges_sigma1 = feature.canny(im_array, sigma=3)
    apw = (float(np.sum(edges_sigma1)) / (im.size[0]*im.size[1]))
    return apw*100

In [30]:
def score_pixel(data):
    data['average_pixel_width'] = data['image'].progress_apply(lambda x : average_pixel_width(x))   
    return data

In [31]:
features = parallelize_dataframe(features, score_pixel)

100%|██████████| 69541/69541 [1:34:05<00:00, 12.32it/s]  
100%|██████████| 69542/69542 [1:34:08<00:00, 12.31it/s]
100%|██████████| 69541/69541 [1:34:09<00:00, 12.31it/s]
100%|██████████| 69542/69542 [1:34:10<00:00, 18.90it/s]


In [32]:
features.head()

Unnamed: 0,image,dullness,whiteness,average_pixel_width
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,61.565,0.0,4.760995
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,84.27,0.0,2.172454
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,30.185,4.405,5.549972
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,0.0,60.445,3.159288
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,38.27,0.0,3.647377


In [34]:
def get_average_color(img):
    path = images_path + img 
    img = cv2.imread(path)
    average_color = [img[:, :, i].mean() for i in range(img.shape[-1])]
    return average_color
def score_color(data):
    data['average_color'] = data['image'].progress_apply(lambda x : get_average_color(x))   
    return data

In [35]:
features = parallelize_dataframe(features, score_color)

100%|██████████| 69542/69542 [12:14<00:00, 94.64it/s]]
100%|██████████| 69541/69541 [12:15<00:00, 94.51it/s]]
100%|██████████| 69542/69542 [12:17<00:00, 94.27it/s] 
100%|██████████| 69541/69541 [12:23<00:00, 93.53it/s] 


In [36]:
features.head()

Unnamed: 0,image,dullness,whiteness,average_pixel_width,get_average_color
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,61.565,0.0,4.760995,"[97.46791666666667, 95.61545717592593, 137.73020833333334]"
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,84.27,0.0,2.172454,"[31.59418402777778, 33.46808449074074, 38.28046875]"
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,30.185,4.405,5.549972,"[137.6286199516099, 132.28333798622742, 157.06306997952726]"
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,0.0,60.445,3.159288,"[140.3214236111111, 124.87200954861112, 110.75948350694445]"
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,38.27,0.0,3.647377,"[37.204675925925926, 36.42141975308642, 68.42638117283951]"


In [38]:
features = features.rename(columns={'get_average_color': 'average_color'})

In [39]:
features['average_red'] = features['average_color'].apply(lambda x: x[0]) / 255
features['average_green'] = features['average_color'].apply(lambda x: x[1]) / 255
features['average_blue'] = features['average_color'].apply(lambda x: x[2]) / 255
features[['average_red', 'average_green', 'average_blue']].head(5)

Unnamed: 0,average_red,average_green,average_blue
0,0.382227,0.374963,0.540118
1,0.123899,0.131247,0.150119
2,0.53972,0.518758,0.615934
3,0.55028,0.489694,0.434351
4,0.145901,0.142829,0.268339


In [40]:
def getSize(filename):
    filename = images_path + filename
    st = os.stat(filename)
    return st.st_size

def getDimensions(filename):
    filename = images_path + filename
    img_size = IMG.open(filename).size
    return img_size 

In [41]:
def score_dimention(data):
    data['image_size'] = data['image'].progress_apply(getSize)
    data['temp_size'] = data['image'].progress_apply(getDimensions)
    data['width'] = data['temp_size'].progress_apply(lambda x : x[0])
    data['height'] = data['temp_size'].progress_apply(lambda x : x[1])
    return data

In [42]:
features = parallelize_dataframe(features, score_dimention)

100%|██████████| 69542/69542 [00:06<00:00, 10691.30it/s]
100%|██████████| 69542/69542 [00:06<00:00, 10960.33it/s]
100%|██████████| 69541/69541 [00:05<00:00, 11766.47it/s]
100%|██████████| 69541/69541 [00:05<00:00, 12550.76it/s]
100%|██████████| 69542/69542 [06:19<00:00, 183.10it/s]
100%|██████████| 69542/69542 [00:00<00:00, 586807.48it/s]
100%|██████████| 69542/69542 [00:00<00:00, 744545.81it/s]
100%|██████████| 69541/69541 [06:22<00:00, 182.02it/s]
100%|██████████| 69541/69541 [00:00<00:00, 807490.58it/s]
100%|██████████| 69541/69541 [00:00<00:00, 807083.92it/s]
100%|██████████| 69542/69542 [06:23<00:00, 181.11it/s]
100%|██████████| 69542/69542 [00:00<00:00, 746084.62it/s]
100%|██████████| 69542/69542 [00:00<00:00, 766538.83it/s]
100%|██████████| 69541/69541 [06:25<00:00, 180.38it/s]
100%|██████████| 69541/69541 [00:00<00:00, 937991.93it/s]
100%|██████████| 69541/69541 [00:00<00:00, 896885.38it/s]


In [43]:
features = features.drop(['temp_size', 'average_color'], axis=1)
features.head()

Unnamed: 0,image,dullness,whiteness,average_pixel_width,average_red,average_green,average_blue,image_size,width,height
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,61.565,0.0,4.760995,0.382227,0.374963,0.540118,31533,480,480
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,84.27,0.0,2.172454,0.123899,0.131247,0.150119,32325,360,360
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,30.185,4.405,5.549972,0.53972,0.518758,0.615934,35761,597,597
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,0.0,60.445,3.159288,0.55028,0.489694,0.434351,78887,640,640
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,38.27,0.0,3.647377,0.145901,0.142829,0.268339,36678,270,270


In [7]:
def get_blurrness_score(image):
    path =  images_path + image 
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    fm = cv2.Laplacian(image, cv2.CV_64F).var()
    return fm
def score_blurrness(data):
    data['blurrness'] = data['image'].progress_apply(get_blurrness_score)
    return data

In [8]:
features = parallelize_dataframe(features, score_blurrness)

100%|██████████| 69542/69542 [14:49<00:00, 78.14it/s]  
100%|██████████| 69541/69541 [14:58<00:00, 77.36it/s] 
100%|██████████| 69542/69542 [14:59<00:00, 77.34it/s]
100%|██████████| 69541/69541 [15:04<00:00, 121.40it/s]


In [4]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 278166 entries, 0 to 278165
Data columns (total 10 columns):
image                  278166 non-null object
dullness               278166 non-null float64
whiteness              278166 non-null float64
average_pixel_width    278166 non-null float64
average_red            278166 non-null float64
average_green          278166 non-null float64
average_blue           278166 non-null float64
image_size             278166 non-null int64
width                  278166 non-null int64
height                 278166 non-null int64
dtypes: float64(6), int64(3), object(1)
memory usage: 23.3+ MB


In [3]:
features = pd.read_csv('train_jpg_0.csv',index_col=0)
features.head()

Unnamed: 0,image,dullness,whiteness,average_pixel_width,average_red,average_green,average_blue,image_size,width,height
0,845defffcb3876df3c272bbe807e21d6b028a67dafb68a4ef3e81b81a89164b9.jpg,61.565,0.0,4.760995,0.382227,0.374963,0.540118,31533,480,480
1,86c463c915b2a3474e909a619d9b09cfd3c8bb4c3790920d7722d8fb79e59495.jpg,84.27,0.0,2.172454,0.123899,0.131247,0.150119,32325,360,360
2,053126195bf9301ea10f0eb3b2c8f09c0f17c94aa99aac6484da49c0dd57eea0.jpg,30.185,4.405,5.549972,0.53972,0.518758,0.615934,35761,597,597
3,dd112eba060070f610d89677a8987ffa3e7f95cb301b04a0817297ef0213c406.jpg,0.0,60.445,3.159288,0.55028,0.489694,0.434351,78887,640,640
4,a49d4f24f0de7e542161a1932e50666fc40bddcdecc855b066a6ee34369e1926.jpg,38.27,0.0,3.647377,0.145901,0.142829,0.268339,36678,270,270


In [9]:
features.to_csv('train_jpg_0.csv',index=False)