In [24]:
from PIL import Image
import os
import pandas as pd
import math
import warnings

In [25]:
# Create function that returns the RGB values of the center pixel of an image
def get_RGB(image_path):
    """Return the (r, g, b) values of the center pixel of an image.

    The image is converted to RGB before sampling, so grayscale, palette and
    RGBA files also yield a 3-tuple instead of failing to unpack.

    Parameters
    ----------
    image_path : str or path-like
        Path of the image file to read.

    Returns
    -------
    tuple of int
        ``(r, g, b)`` of the pixel at ``(width // 2, height // 2)``.
    """
    # The context manager releases the file handle that Image.open holds.
    with Image.open(image_path) as im:
        rgb = im.convert("RGB")
        width, height = rgb.size
        r, g, b = rgb.getpixel((width // 2, height // 2))
    return r, g, b

In [26]:
# Create function that counts the number of white pixels in an image
def count_white_pixels(image_path):
    """Count the pure-white pixels of an image.

    A pixel counts as white only when it is exactly ``(255, 255, 255)`` after
    the image has been converted to RGB (so grayscale, palette and RGBA files
    are handled as well).

    Parameters
    ----------
    image_path : str or path-like
        Path of the image file to read.

    Returns
    -------
    int
        Number of pixels equal to ``(255, 255, 255)``.
    """
    white = (255, 255, 255)
    # The context manager releases the file handle that Image.open holds.
    with Image.open(image_path) as img:
        rgb = img.convert("RGB")
        width, height = rgb.size
        # Pixel-access object: indexed as pixels[x, y], avoids a method call per pixel.
        pixels = rgb.load()
        return sum(
            1
            for x in range(width)
            for y in range(height)
            if pixels[x, y] == white
        )

In [27]:
# Create function that counts the number of color pixels in an image
def count_color_pixels(image_path):
    """Count the pixels whose color exactly matches the image's center pixel.

    The reference color is the pixel at ``(width // 2, height // 2)``. It is
    read from the same opened image that is being scanned, so the file is
    opened only once. The image is converted to RGB first, so grayscale,
    palette and RGBA files are handled as well.

    Parameters
    ----------
    image_path : str or path-like
        Path of the image file to read.

    Returns
    -------
    int
        Number of pixels equal to the center pixel's ``(r, g, b)`` value
        (the center pixel itself included).
    """
    # The context manager releases the file handle that Image.open holds.
    with Image.open(image_path) as img:
        rgb = img.convert("RGB")
        width, height = rgb.size
        pixels = rgb.load()
        center_color = pixels[width // 2, height // 2]
        return sum(
            1
            for x in range(width)
            for y in range(height)
            if pixels[x, y] == center_color
        )

In [28]:
# Create function that counts the number of white and color pixels in an image
def count_pixels(image_path):
    """Classify every pixel of an image as "white" or "color" and count both.

    The reference color is the center pixel at ``(width // 2, height // 2)``.
    To tolerate JPEG compression noise, a pixel is compared by its mean channel
    value: it counts as white when it is exactly ``(255, 255, 255)`` or when
    its mean is strictly closer to 255 than to the center pixel's mean;
    otherwise (ties included) it counts as color.

    The image is converted to RGB first, so grayscale, palette and RGBA files
    are handled as well, and the file is opened only once.

    Parameters
    ----------
    image_path : str or path-like
        Path of the image file to read.

    Returns
    -------
    tuple of (int, int)
        ``(white_pixel_count, color_pixel_count)``; the two always sum to
        ``width * height``.
    """
    white_pixel_count = 0
    color_pixel_count = 0

    # The context manager releases the file handle that Image.open holds.
    with Image.open(image_path) as img:
        rgb = img.convert("RGB")
        width, height = rgb.size
        pixels = rgb.load()
        avg_mid_rgb = sum(pixels[width // 2, height // 2]) / 3

        for x in range(width):
            for y in range(height):
                r, g, b = pixels[x, y]
                avg_rgb = (r + g + b) / 3
                diff_mid_rgb = abs(avg_rgb - avg_mid_rgb)
                diff_white = abs(avg_rgb - 255)
                # The explicit pure-white test matters when the center pixel is
                # itself white: both distances are then 0 and the strict "<" fails.
                if (r == 255 and g == 255 and b == 255) or diff_white < diff_mid_rgb:
                    white_pixel_count += 1
                else:
                    color_pixel_count += 1

    return white_pixel_count, color_pixel_count

In [29]:
# Create function to generate tabular dataset with relevant numerical values relating to the images
def generate_dataset(class_ids, folder_path):
    """Build a table of per-image numeric features for a labelled image folder.

    For every regular file in ``folder_path/<class_id>`` one row is produced
    with the image size, the white/color pixel counts from ``count_pixels``,
    the center-pixel color from ``get_RGB`` and the class label.

    Parameters
    ----------
    class_ids : iterable of str
        Names of the sub-folders of ``folder_path`` to read; each name is also
        stored as the row's ``class_id``.
    folder_path : str or path-like
        Folder that contains one sub-folder per class id.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns ``total_pixel_count``,
        ``color_pixel_count``, ``white_pixel_count``, ``color_ratio``, ``r``,
        ``g``, ``b``, ``width``, ``color_width`` and ``class_id``. The frame is
        empty (but has these columns) when no image is found.
    """
    # Originally declared columns first, followed by the extra per-row fields.
    columns = ['total_pixel_count', 'color_pixel_count', 'white_pixel_count',
               'color_ratio', 'r', 'g', 'b', 'width', 'color_width', 'class_id']

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic in a loop.
    records = []
    for class_id in class_ids:
        source_subdir = os.path.join(folder_path, class_id)
        # Sorted so the row order does not depend on the filesystem.
        for filename in sorted(os.listdir(source_subdir)):
            file_path = os.path.join(source_subdir, filename)
            # Skip sub-directories such as .ipynb_checkpoints.
            if not os.path.isfile(file_path):
                continue

            # Only the size is needed here; close the handle right away.
            with Image.open(file_path) as img:
                width, height = img.size

            total_pixel_count = width * height
            white_pixel_count, color_pixel_count = count_pixels(file_path)
            r, g, b = get_RGB(file_path)

            records.append({'width': width,
                            'total_pixel_count': total_pixel_count,
                            'white_pixel_count': white_pixel_count,
                            'color_pixel_count': color_pixel_count,
                            # Side length of a square holding all color pixels.
                            'color_width': math.sqrt(color_pixel_count),
                            'color_ratio': color_pixel_count / total_pixel_count,
                            'r': r,
                            'g': g,
                            'b': b,
                            'class_id': class_id})

    return pd.DataFrame(records, columns=columns)

In [30]:
# Data locations: the training split lives in <SOURCE>/train, the test split in <SOURCE>/val
SOURCE = 'squares'
TRAIN, TEST = (os.path.join(SOURCE, split) for split in ("train", "val"))
# Class labels are the entry names of the training folder, in sorted order
class_ids = os.listdir(TRAIN)
class_ids.sort()
print(class_ids)

['a', 'b', 'c']


In [31]:
# Silence only pandas' DataFrame.append deprecation FutureWarning;
# any other FutureWarning stays visible.
warnings.filterwarnings(
    action='ignore',
    # Regex matched (case-insensitively) against the start of the warning text.
    message=r'The frame\.append method is deprecated',
    category=FutureWarning,
)

In [32]:
# Build the feature table for every image of the training split
training_df = generate_dataset(class_ids=class_ids, folder_path=TRAIN)

In [34]:
# Write the training feature table to a CSV file, leaving out the row index
training_df.to_csv(path_or_buf='training_data.csv', index=False)

In [35]:
# Build the feature table for every image of the test split
test_df = generate_dataset(class_ids=class_ids, folder_path=TEST)

In [36]:
# Write the test feature table to a CSV file, leaving out the row index
test_df.to_csv(path_or_buf='test_data.csv', index=False)