# Pré-processamento das Imagens


# Imports

In [1]:
from collections import defaultdict
from scipy.stats import itemfreq
from skimage import feature
from PIL import Image as IMG
import numpy as np
import pandas as pd 
import operator
import cv2
import os
import time
import re
import multiprocessing
from threading import Thread

# Get the list of image files in the images directory
images_path = './images/'
imgs = os.listdir(images_path)
# Match the full '.jpg' extension (not just a trailing 'jpg') so the selection
# agrees with the '.jpg' suffix that is stripped from the ids before saving.
imgs_names = [image_name for image_name in imgs if image_name.endswith('.jpg')]

In [2]:
# One partition of image names per CPU core; each worker thread is later
# handed the partition whose index equals its thread id.
num_partitions = num_cores = multiprocessing.cpu_count()
imgs_names_splitted = np.array_split(imgs_names, num_partitions)

# Search for corrupted images
# There are about 12 corrupted images in the Kaggle data
def search_corrupted_images(id_thread):
    """Blank out the names of unreadable images in one partition.

    Tries to open every image of partition ``id_thread`` of
    ``imgs_names_splitted`` with PIL. When opening raises, the error and the
    file name are printed and the matching entry of the global ``imgs_names``
    list is replaced by an empty string.

    Args:
        id_thread: index of the partition (and of the worker) to scan.
    """
    imgs_names_th = imgs_names_splitted[id_thread]

    # Global index of this partition's first image: the total length of all
    # partitions that come before it.
    start_point = sum(len(part) for part in imgs_names_splitted[:id_thread])

    for i, img_name in enumerate(imgs_names_th):
        try:
            # The context manager closes the file handle right away instead of
            # leaving one open handle per scanned image.
            with IMG.open(images_path + img_name):
                pass
        except Exception as e:
            print(e)
            print(imgs_names[start_point+i] + ' removed from processing\n')
            imgs_names[start_point+i] = ""

# Run the corrupted-image search in parallel, one thread per partition
start = time.time()
threads = [
    Thread(target=search_corrupted_images, args=(id_thread,))
    for id_thread in range(num_cores)
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
end = time.time()

# Drop the entries that the search blanked out
imgs_names = [image_name for image_name in imgs_names if image_name != '']

elapsed = "%.2f" % (end - start)
print('\ntime looking for corrupted images: '+str(elapsed)+'s\n')

# Re-split the cleaned list so the partitions match the surviving names
imgs_names_splitted = np.array_split(imgs_names, num_cores)

cannot identify image file './images/4f029e2a00e892aa2cac27d98b52ef8b13d91471f613c8d3c38e3f29d4da0b0c.jpg'
4f029e2a00e892aa2cac27d98b52ef8b13d91471f613c8d3c38e3f29d4da0b0c.jpg removed from processing

cannot identify image file './images/8513a91e55670c709069b5f85e12a59095b802877715903abef16b7a6f306e58.jpg'
8513a91e55670c709069b5f85e12a59095b802877715903abef16b7a6f306e58.jpg removed from processing


time looking for corrupted images: 0.08s



# Criando estruturas de dados para armazenar resultado do processamento de imagens

In [3]:
# One slot per remaining image; every feature vector starts zero-filled and
# is written by index while the images are processed.
num_imgs = len(imgs_names)

# File and geometry features
feature_size   = np.zeros(num_imgs, dtype=np.float64)
feature_width  = np.zeros(num_imgs, dtype=np.float64)
feature_height = np.zeros(num_imgs, dtype=np.float64)

# Brightness / darkness features
feature_dullness  = np.zeros(num_imgs, dtype=np.float64)
feature_whiteness = np.zeros(num_imgs, dtype=np.float64)

# Color features
feature_dominant_red   = np.zeros(num_imgs, dtype=np.float64)
feature_dominant_green = np.zeros(num_imgs, dtype=np.float64)
feature_dominant_blue  = np.zeros(num_imgs, dtype=np.float64)
feature_average_red    = np.zeros(num_imgs, dtype=np.float64)
feature_average_green  = np.zeros(num_imgs, dtype=np.float64)
feature_average_blue   = np.zeros(num_imgs, dtype=np.float64)

# Edge and sharpness features
feature_average_pixel_width = np.zeros(num_imgs, dtype=np.float64)
feature_blurrness_score     = np.zeros(num_imgs, dtype=np.float64)

# Métodos para processamento 
Métodos para calcular features das imagens, fornecidos pelo usuário sban.
Foram produzidas versões otimizadas destes métodos, tornando o processamento mais rápido.
Ref.: https://www.kaggle.com/shivamb/ideas-for-image-features-and-image-quality

In [1]:
def color_analysis(img):
    """Measure how bright and how dark the most frequent colors of an image are.

    The pixels returned by ``img.getdata()`` are counted per distinct value and
    only the 25 most frequent values are considered. A value is "dark" when
    each of its first three channels is <= 20 and "light" when each is >= 240.

    Args:
        img: object exposing ``getdata()``, which yields one value per pixel:
            a tuple of channel values, or a bare number for single-band images.

    Returns:
        Tuple ``(light_percent, dark_percent)``: the share of light and of dark
        pixels among the pixels covered by the 25 most frequent values, as
        percentages rounded to two decimals. ``(0.0, 0.0)`` when the image has
        no pixels.
    """
    # Obtain the color palette of the image: occurrences per distinct value
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1

    # Sort the colors present in the image, most frequent first
    pixel_limit = 25
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse=True)

    light_shade, dark_shade, shade_count = 0, 0, 0
    for color, count in sorted_x[:pixel_limit]:
        # Single-band images yield bare numbers instead of tuples; wrap them
        # so they are judged as a one-channel color rather than raising.
        channels = color[:3] if isinstance(color, tuple) else (color,)
        if all(value <= 20 for value in channels):   # dull: too much darkness
            dark_shade += count
        if all(value >= 240 for value in channels):  # bright: too much whiteness
            light_shade += count
        shade_count += count

    # An image without pixels has nothing to measure; avoid dividing by zero.
    if shade_count == 0:
        return 0.0, 0.0

    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

In [5]:
def perform_color_analysis(img):
    """Average the light/dark analysis of the top and bottom halves of an image.

    The image is analysed as given (no RGB conversion is applied). It is cut
    horizontally at half of its height because a whole-image average may give
    biased results; each half goes through ``color_analysis``.

    Args:
        img: image exposing ``size`` (width, height) and ``crop(box)``.

    Returns:
        Tuple ``(dark_percent, light_percent)`` holding the mean of the two
        halves, or ``None`` when analysing either half raises.
    """
    dims = img.size
    width, height = dims[0], dims[1]
    middle = height/2

    upper_half = img.crop((0, 0, width, middle))
    lower_half = img.crop((0, middle, width, height))

    try:
        upper_light, upper_dark = color_analysis(upper_half)
        lower_light, lower_dark = color_analysis(lower_half)
    except Exception:
        # Best effort: signal the failure to the caller instead of raising
        return None

    mean_light = (upper_light + lower_light)/2
    mean_dark = (upper_dark + lower_dark)/2
    return mean_dark, mean_light

In [6]:
def average_pixel_width(img):
    """Return the percentage of pixels marked as edges by a Canny filter.

    The image is converted to mode 'L' (grayscale) and passed to
    ``feature.canny`` with ``sigma=3``; the number of edge pixels is divided
    by ``width * height`` and scaled to a percentage.
    """
    grayscale = np.asarray(img.convert(mode='L'))
    edge_mask = feature.canny(grayscale, sigma=3)
    edge_pixels = float(np.sum(edge_mask))
    total_pixels = img.size[0]*img.size[1]
    return (edge_pixels / total_pixels)*100

In [7]:
def get_dominant_color(img):
    """Return the dominant color of an image using k-means clustering.

    The pixels are clustered into 5 colors with ``cv2.kmeans`` (10 attempts,
    random initial centers) and the centroid of the cluster holding the most
    pixels is returned.

    Args:
        img: array-like of pixels whose data can be reshaped to ``(-1, 3)``,
            i.e. three channels per pixel.

    Returns:
        ``uint8`` array with the three channel values of the dominant
        centroid, in the same channel order as ``img``.
    """
    arr = np.float32(img)
    pixels = arr.reshape((-1, 3))

    n_colors = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)

    # Count the pixels assigned to every cluster label. bincount is indexed by
    # the label itself, so the argmax is a valid row of `palette` even when a
    # cluster ends up empty (a frequency table of only the labels present
    # would shift the indices in that case). It also avoids
    # scipy.stats.itemfreq, which no longer exists in current SciPy releases.
    counts = np.bincount(labels.ravel(), minlength=n_colors)
    dominant_color = palette[np.argmax(counts)]
    return dominant_color


In [8]:
def get_average_color(img):
    """Return the mean value of every channel of an image array.

    Args:
        img: array indexed as ``img[:, :, channel]``; the number of channels
            is taken from the size of its last axis.

    Returns:
        List with one mean per channel, in channel order.
    """
    channel_means = []
    for channel in range(img.shape[-1]):
        channel_means.append(img[:, :, channel].mean())
    return channel_means

In [9]:
def getSize(file_path):
    """Return the size in bytes that ``os.stat`` reports for ``file_path``."""
    return os.stat(file_path).st_size

def getDimensions(img):
    """Return the ``size`` attribute of ``img`` unchanged."""
    return img.size

In [10]:
def get_blurrness_score(img):
    """Return the variance of the Laplacian of the image.

    The image is converted from BGR to grayscale, the Laplacian is computed
    with 64-bit float output and its variance is returned.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return laplacian.var()

In [11]:
# Compute and store the features of the images of one partition
def compute(id_thread):
    """Fill the global feature arrays for every image of one partition.

    For each image of partition ``id_thread`` of ``imgs_names_splitted`` the
    file size, width, height, dullness, whiteness, average pixel width,
    blurriness score and average color are computed and written to the
    ``feature_*`` arrays at the image's global index.

    Args:
        id_thread: index of the partition (and of the worker) to process.
            Worker 0 also prints progress messages.
    """
    imgs_names_th = imgs_names_splitted[id_thread]
    num_imgs_th   = len(imgs_names_th)

    print ('thread '+str(id_thread)+' started')

    # Global index of this partition's first image: the total length of all
    # partitions that come before it.
    start_point = sum(len(part) for part in imgs_names_splitted[:id_thread])

    for i, img_name in enumerate(imgs_names_th):

        if(id_thread == 0 and (i*100)/num_imgs_th % 10 == 0):
            print('Progress: '+str((i*100)/num_imgs_th)+'%\n')

        img_path = images_path+img_name
        j = start_point+i

        feature_size[j] = getSize(img_path)

        # The context manager closes the image file as soon as the PIL-based
        # features are done instead of relying on garbage collection.
        with IMG.open(img_path) as img:
            dimensions = getDimensions(img)
            feature_width[j]  = dimensions[0]
            feature_height[j] = dimensions[1]

            # perform_color_analysis returns None when the analysis fails;
            # record NaN for that image instead of crashing on the unpacking.
            color_result = perform_color_analysis(img)
            if color_result is None:
                dark_percent, light_percent = np.nan, np.nan
            else:
                dark_percent, light_percent = color_result
            feature_dullness [j]          = dark_percent
            feature_whiteness[j]          = light_percent

            feature_average_pixel_width[j] = average_pixel_width(img)

        img_cv2 = cv2.imread(img_path)

        feature_blurrness_score[j] = get_blurrness_score(img_cv2)

        # Dominant color is left out of the analysis because of its high
        # processing cost. To re-enable it (indices follow OpenCV's BGR order):
        #   dominant_color = get_dominant_color(img_cv2)
        #   feature_dominant_blue[j]  = dominant_color[0]/255
        #   feature_dominant_green[j] = dominant_color[1]/255
        #   feature_dominant_red[j]   = dominant_color[2]/255

        # cv2.imread stores the channels in B, G, R order, so channel 0 is
        # blue and channel 2 is red.
        average_color = get_average_color(img_cv2)
        feature_average_blue[j]  = average_color[0]/255
        feature_average_green[j] = average_color[1]/255
        feature_average_red[j]   = average_color[2]/255

    print ('thread '+str(id_thread)+' finished')


# Processando imagens paralelamente

In [12]:
# Process the images in parallel, one thread per partition
start = time.time()
threads = [
    Thread(target=compute, args=(id_thread,))
    for id_thread in range(num_cores)
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
end = time.time()


# benchmark: report the parallel run, then repeat the same work serially
parallel_elapsed = "%.2f" % (end - start)
print('\ntime processing parallel: '+str(parallel_elapsed)+'s\n')

start = time.time()

for partition in range(num_cores):
    compute(partition)
end = time.time()

serial_elapsed = "%.2f" % (end - start)
print('\ntime processing serial: '+str(serial_elapsed)+'s')


thread 0 started
Progress: 0.0%

thread 1 started
thread 2 started
thread 3 started
Progress: 20.0%

Progress: 40.0%

Progress: 60.0%

Progress: 80.0%

thread 3 finished
thread 0 finished
thread 1 finished
thread 2 finished

time processing parallel: 2.11s

thread 0 started
Progress: 0.0%

Progress: 20.0%

Progress: 40.0%

Progress: 60.0%

Progress: 80.0%

thread 0 finished
thread 1 started
thread 1 finished
thread 2 started
thread 2 finished
thread 3 started
thread 3 finished

time processing serial: 2.56s


# Salvando os resultados

In [14]:
# save to file

# Remove the .jpg suffix so the ids are the bare file names. The pattern is a
# raw string: in a normal string literal '\.' is an invalid escape sequence.
imgs_names = [re.sub(r'\.jpg$', '', image_name) for image_name in imgs_names]

# Column name -> feature values, one entry per processed image.
# The dominant color columns stay disabled because that feature is not computed.
data = {
     'id'    : imgs_names,
     'width' : feature_width,
     'height': feature_height,
     'size'  : feature_size,
     'dullness' : feature_dullness,
     'whiteness': feature_whiteness,
     #'dominant_red'  : feature_dominant_red,
     #'dominant_green': feature_dominant_green,
     #'dominant_blue' : feature_dominant_blue,
     'average_red'   : feature_average_red,
     'average_green' : feature_average_green,
     'average_blue'  : feature_average_blue,
     'average_pixel_width' : feature_average_pixel_width,
     'blurrness_score'     : feature_blurrness_score,
    }

# Explicit column order of the output file
columns = [
            'id',
            'width',
            'height',
            'size',
            'dullness',
            'whiteness',
            #'dominant_red',
            #'dominant_green',
            #'dominant_blue',
            'average_red',
            'average_green',
            'average_blue',
            'average_pixel_width',
            'blurrness_score',
          ]


df = pd.DataFrame(data=data, columns=columns)

df.to_csv('train_jpg.csv', encoding='utf-8', index=False)