In [9]:
import multiprocessing
import time
from collections import Counter
from multiprocessing import Pool
from zipfile import ZipFile

import numpy as np
import pandas as pd
from dask import bag, threaded
from dask.diagnostics import ProgressBar
from PIL import Image
from skimage import feature

In [10]:
# Size of the worker pool: one process per logical CPU reported by the OS.
num_cores = multiprocessing.cpu_count()
# Number of row-wise chunks a DataFrame is split into before being dispatched.
num_partitions = 2

def parallelize_dataframe(df, func, n_partitions=None, n_workers=None):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and concatenate the results.

    Args:
        df: DataFrame to process.
        func: Callable taking a DataFrame chunk and returning a DataFrame.
            It must be picklable whenever more than one worker is used.
        n_partitions: Number of chunks to split ``df`` into. Defaults to the
            module-level ``num_partitions``.
        n_workers: Upper bound on worker processes. Defaults to the
            module-level ``num_cores``.

    Returns:
        ``pd.concat`` of the per-chunk results, in chunk order.

    Raises:
        ValueError: If ``n_partitions`` is smaller than 1.
    """
    if n_partitions is None:
        n_partitions = num_partitions
    if n_workers is None:
        n_workers = num_cores
    if n_partitions < 1:
        raise ValueError("n_partitions must be at least 1")

    # Slice with iloc instead of np.array_split(df, ...): the latter goes through
    # the deprecated DataFrame.swapaxes. Chunk sizes follow the array_split rule
    # (the first ``len(df) % n_partitions`` chunks get one extra row).
    base, extra = divmod(len(df), n_partitions)
    chunks = []
    start = 0
    for i in range(n_partitions):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    # Never start more processes than there are chunks to hand out.
    workers = min(n_workers, len(chunks))
    if workers <= 1:
        # A single worker gains nothing from a pool. Pass copies so that, as with
        # the pickled chunks a pool would receive, func cannot mutate ``df``.
        results = [func(chunk.copy()) for chunk in chunks]
    else:
        pool = Pool(workers)
        try:
            results = pool.map(func, chunks)
        finally:
            # Release the worker processes even when func raises.
            pool.close()
            pool.join()
    return pd.concat(results)

In [None]:
def color_analysis(img, pixel_limit=25, dark_max=20, light_min=240):
    """Measure how much of an image's dominant palette is near-white or near-black.

    Only the ``pixel_limit`` most frequent colors are inspected; the returned
    percentages are relative to the number of pixels those colors cover.

    Args:
        img: Object whose ``getdata()`` yields one hashable channel sequence per
            pixel (e.g. a PIL image with multi-band pixels). Only the first
            three channels of each pixel are examined.
        pixel_limit: How many of the most frequent colors to consider.
        dark_max: A color is dark when each examined channel is <= this value.
        light_min: A color is light when each examined channel is >= this value.

    Returns:
        Tuple ``(light_percent, dark_percent)``, each rounded to two decimals.

    Raises:
        ValueError: If no pixels are selected (e.g. the image is empty).
        TypeError: If pixels are not sliceable (e.g. plain ints).
    """
    # Histogram of the colors present in the image.
    palette = Counter(img.getdata())

    light_shade = dark_shade = shade_count = 0
    # most_common(n) yields the n highest counts in descending order with the
    # same tie order as a stable reverse sort, without sorting the whole palette.
    for color, count in palette.most_common(pixel_limit):
        channels = color[:3]
        if all(c <= dark_max for c in channels):  # dull: too much darkness
            dark_shade += count
        if all(c >= light_min for c in channels):  # bright: too much whiteness
            light_shade += count
        shade_count += count

    if shade_count == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("image contains no pixels to analyse")

    light_percent = round((float(light_shade) / shade_count) * 100, 2)
    dark_percent = round((float(dark_shade) / shade_count) * 100, 2)
    return light_percent, dark_percent

def perform_color_analysis(path, flag, archive=None):
    """Score how dark or how light one image stored in a zip archive is.

    The image is split into a top and a bottom half, each half is scored with
    ``color_analysis`` and the two scores are averaged, because a single
    whole-image average may give biased results.

    Args:
        path: Name of the image member inside the archive.
        flag: ``'black'`` to get the dark percentage, ``'white'`` to get the
            light percentage.
        archive: Optional already-open ``ZipFile`` to read from. When omitted,
            the archive at the module-level ``zip_path`` is opened (and closed)
            for this single call; pass an open archive to avoid reopening it
            for every image.

    Returns:
        The averaged percentage for ``flag``, or ``None`` when ``flag`` is
        unknown or a half cannot be scored.
    """
    if archive is None:
        with ZipFile(zip_path) as myzip:
            return perform_color_analysis(path, flag, archive=myzip)

    with archive.open(path) as myfile:
        im = Image.open(myfile)  # .convert("RGB")
        # Image.open() is lazy: force the pixel data to be decoded while the
        # archive member is still open, otherwise crop() reads a closed file.
        im.load()

    # Integer midpoint so both crop boxes use whole pixel coordinates.
    width, height = im.size
    mid = height // 2
    top_half = im.crop((0, 0, width, mid))
    bottom_half = im.crop((0, mid, width, height))

    try:
        light_top, dark_top = color_analysis(top_half)
        light_bottom, dark_bottom = color_analysis(bottom_half)
    except (TypeError, ValueError, ZeroDivisionError):
        # Data-dependent failures only (empty half, single-band pixels); other
        # exceptions such as NameError are programming errors and must surface.
        return None

    if flag == 'black':
        return (dark_top + dark_bottom) / 2
    if flag == 'white':
        return (light_top + light_bottom) / 2
    return None

In [None]:
def score_dullness(data):
    """Attach a ``dullness`` column with the 'black' color score of every image.

    Walks ``data['image_path']`` in order, scores each path with
    ``perform_color_analysis(path, 'black')`` and prints a progress line plus
    the elapsed minutes on every tenth image. The scores are written onto
    ``data`` itself, which is also the return value.
    """
    started = time.time()
    scores = []
    for position, image_path in enumerate(data['image_path']):
        if position % 10 == 0:
            print("Image %d of %d" % (position, data.shape[0]))
            print("\nImage Runtime: %0.2f Minutes"%((time.time() - started)/60))
        scores.append(perform_color_analysis(image_path, 'black'))
    data['dullness'] = scores
    return data

# Archive holding the images; perform_color_analysis reads this module-level name.
zip_path = '/home/g492652607/data/train_jpg_0.zip'
with ZipFile(zip_path) as myzip:
    files_in_zip = myzip.namelist()

# One row per archive member, skipping the first listed entry
# (presumably the archive's directory entry -- TODO confirm).
image_paths = files_in_zip[1:]
features = pd.DataFrame({
    'image_path': image_paths,
    # Image id: the member's file name up to its first dot.
    'image': [member.split('/')[-1].split('.')[0] for member in image_paths],
})

# Score every image in worker processes, then preview the resulting frame.
features = parallelize_dataframe(features, score_dullness)
features.head()

Image 0 of 139084


Image Runtime: 0.00 MinutesImage 0 of 139083

Image Runtime: 0.00 Minutes
Image 10 of 139084

Image Runtime: 0.83 Minutes
Image 10 of 139083

Image Runtime: 0.83 Minutes
Image 20 of 139084

Image Runtime: 1.64 Minutes
Image 20 of 139083

Image Runtime: 1.64 Minutes
Image 30 of 139084

Image Runtime: 2.47 Minutes
Image 30 of 139083

Image Runtime: 2.49 Minutes
