In [1]:
from PIL import Image
import os
import pandas as pd
import math
import warnings

In [2]:
# Function that returns the RGB values of the center pixel of an image
def get_RGB(image_path):
    """Return the (r, g, b) values of the pixel at the centre of an image.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to read.

    Returns
    -------
    tuple
        The red, green and blue components of the pixel located at
        (width // 2, height // 2).
    """
    # The context manager releases the underlying file handle as soon as the
    # pixel has been read, which matters when thousands of images are scanned.
    with Image.open(image_path) as im:
        # Normalise to RGB so that RGBA, greyscale and palette images also
        # yield exactly three channel values instead of failing to unpack.
        rgb_image = im.convert('RGB')
        width, height = rgb_image.size
        r, g, b = rgb_image.getpixel((width // 2, height // 2))
    return r, g, b

In [3]:
# Function that counts the number of white pixels in an image
def count_white_pixels(image_path):
    """Count the pixels of an image whose colour is pure white (255, 255, 255).

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to read.

    Returns
    -------
    int
        Number of pixels that are exactly (255, 255, 255) once the image is
        viewed as RGB. Any alpha channel is dropped by the conversion, so a
        transparent white pixel still counts as white.
    """
    # The context manager releases the file handle once the tally is computed.
    with Image.open(image_path) as img:
        # Normalise to RGB so that RGBA, greyscale and palette images are
        # handled instead of failing on a three-way unpack.
        rgb_image = img.convert('RGB')
        width, height = rgb_image.size
        # getcolors returns (count, colour) pairs, or None when the image has
        # more than `maxcolors` distinct colours; using the pixel count as the
        # limit means that can never happen. This replaces a per-pixel
        # getpixel loop, which is very slow in pure Python.
        color_counts = rgb_image.getcolors(maxcolors=max(1, width * height))

    return sum(count for count, color in color_counts if color == (255, 255, 255))

In [4]:
# Function to generate tabular dataset with relevant numerical values relating to the images
def generate_dataset(class_ids, folder_path):
    """Build a table of per-image pixel statistics for a labelled image folder.

    Every file found in ``folder_path/<class_id>`` is treated as an image and
    contributes one row.

    Parameters
    ----------
    class_ids : iterable of str
        Names of the class sub-folders inside ``folder_path``; each name is
        also stored as the row's ``class_id``.
    folder_path : str
        Folder that contains one sub-folder per class.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns ``total_pixel_count``,
        ``color_pixel_count``, ``white_pixel_count``, ``color_ratio``, ``r``,
        ``g``, ``b``, ``width``, ``color_width`` and ``class_id``. Rows are
        ordered by class (in the order given) and then by file name.
    """
    # Explicit column order so the exported CSV layout stays stable.
    columns = ['total_pixel_count', 'color_pixel_count', 'white_pixel_count',
               'color_ratio', 'r', 'g', 'b', 'width', 'color_width', 'class_id']

    # Rows are gathered in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies the whole table on every iteration.
    records = []
    for class_id in class_ids:
        source_subdir = os.path.join(folder_path, class_id)
        # os.listdir gives no ordering guarantee; sorting keeps the output
        # reproducible between runs and machines.
        for filename in sorted(os.listdir(source_subdir)):
            file_path = os.path.join(source_subdir, filename)

            # Only the dimensions are needed here, so close the file at once.
            with Image.open(file_path) as img:
                width, height = img.size

            # width * height is also correct for images that are not square.
            total_pixel_count = width * height
            white_pixel_count = count_white_pixels(file_path)
            color_pixel_count = total_pixel_count - white_pixel_count
            # Side length of a square that would hold all non-white pixels.
            color_width = math.sqrt(color_pixel_count)
            color_ratio = color_pixel_count / total_pixel_count
            r, g, b = get_RGB(file_path)

            records.append({'width': width,
                            'total_pixel_count': total_pixel_count,
                            'white_pixel_count': white_pixel_count,
                            'color_pixel_count': color_pixel_count,
                            'color_width': color_width,
                            'color_ratio': color_ratio,
                            'r': r,
                            'g': g,
                            'b': b,
                            'class_id': class_id})

    return pd.DataFrame(records, columns=columns)

In [5]:
# Data locations: the training and validation images live in the "train" and
# "val" sub-folders of SOURCE
SOURCE = 'squares'
TRAIN, TEST = (os.path.join(SOURCE, split) for split in ("train", "val"))

# The class labels are the entry names of the training folder, in sorted order
class_ids = sorted(os.listdir(TRAIN))
print(class_ids)

['a', 'b', 'c']


In [6]:
# Silence only the pandas deprecation notice for DataFrame.append. Every other
# FutureWarning stays visible so that upcoming breaking changes are not hidden.
# `message` is a regular expression matched against the start of the warning text.
# NOTE: DataFrame.append no longer exists in pandas >= 2.0.
warnings.filterwarnings(
    action='ignore',
    message=r'The frame\.append method is deprecated',
    category=FutureWarning,
)

In [7]:
# Build the feature table from every image under the training folder
training_df = generate_dataset(class_ids=class_ids, folder_path=TRAIN)

In [8]:
# Write the training table to disk as CSV, leaving out the DataFrame index
training_df.to_csv(path_or_buf='training_data.csv', index=False)

In [9]:
# Build the feature table from every image under the test (validation) folder
test_df = generate_dataset(class_ids=class_ids, folder_path=TEST)

In [10]:
# Write the test table to disk as CSV, leaving out the DataFrame index
test_df.to_csv(path_or_buf='test_data.csv', index=False)