# Preprocessing Phase #2 - Equalization of Image Sizes

### CTCB (Classification of Toxigenic CyanoBacterial genera) project

* ***Author : Iman Kianian***
* Paper Link : 

--------------------

#### Import the necessary libraries

In [30]:
import os
import glob
import cv2
import numpy as np
from PIL import Image
import uuid

---------------

## Defining 2 functions:

In [31]:
def calculatebackground(img):
    """
        this function estimates the background color of an image.
        
        img (nd.array): an 2d numpy array (an image)
    """
    args_r = (img[:,0][:,0] ,img[:,-1][:,0],img[0,:][:,0] , img[-1,:][:,0] ) # collecting R channel of all surrounding pixels of the image
    args_g = (img[:,0][:,1] ,img[:,-1][:,1],img[0,:][:,1] , img[-1,:][:,1] ) # collecting G channel of all surrounding pixels of the image
    args_b = (img[:,0][:,2] ,img[:,-1][:,2],img[0,:][:,2] , img[-1,:][:,2] ) # collecting B channel of all surrounding pixels of the image
    
    args_r = np.concatenate(args_r)
    args_g = np.concatenate(args_g)
    args_b = np.concatenate(args_b)

    r = np.mean(args_r,axis=None)  # calculating mean on all R channels of all surrounding pixels of the image
    g = np.mean(args_g,axis=None)  # calculating mean on all G channels of all surrounding pixels of the image
    b = np.mean(args_b,axis=None)  # calculating mean on all B channels of all surrounding pixels of the image
    return int(np.round(r)),int(np.round(g)),int(np.round(b))

The background color from images will be inferred using the mean of the surrounding pixels in this function. 

In [32]:
def expand2square(pil_img, background_color,target_size=150):
    """
        this function convert a image to its square size of size = target_size
        
        pil_img (nd.array): an 2d numpy array (an image)
        background_color (tuple): a tuple of 3 components for (R,G,B) channels
        target_size (int): result image will be of size target_size x target_size
    """
    pil_img = Image.fromarray(pil_img)
    width, height = pil_img.size
    if width == height and width == target_size:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        height = width
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        width = height
        
    if width>target_size:
        dim = (target_size,target_size)
        resized = cv2.resize(np.array(result), dim, interpolation = cv2.INTER_AREA)
        return resized
    elif width<target_size:
        resized = Image.new(result.mode, (target_size, target_size), background_color)
        resized.paste(result, ((target_size - width) // 2, (target_size - height) // 2))
        return resized
    else:
        return result
        

In [33]:
def expand2square(pil_img, background_color,target_size=150):
    """
        this function convert a image to its square size of size = target_size
        
        pil_img (nd.array): an 2d numpy array (an image)
        background_color (tuple): a tuple of 3 components for (R,G,B) channels
        target_size (int): result image will be of size target_size x target_size
    """
    pil_img = Image.fromarray(pil_img)
    width, height = pil_img.size
    if width == height and width == target_size:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        height = width
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        width = height
        
    if width>target_size:
        dim = (target_size,target_size)
        resized = cv2.resize(np.array(result), dim, interpolation = cv2.INTER_AREA)
        return resized
    elif width<target_size:
        resized = Image.new(result.mode, (target_size, target_size), background_color)
        resized.paste(result, ((target_size - width) // 2, (target_size - height) // 2))
        return resized
    else:
        return result
        

This function converts images to a specific size regardless of their initial size. If the initial size is small or large, we use upscaling or downscaling. we fill the empty pixels with a color that we derive from each image as its background.

# Load Dataset

#### *Training Dataset*

In [34]:
Categories = list(os.listdir(r"G:\data_pic\rapho"))
Categories

['trainx']

In [35]:
print(f'There are {len(Categories)} categories of oxigenic cyanobacterial genera.')

There are 1 categories of oxigenic cyanobacterial genera.


In [36]:
images = {}
images_directory = {}
for cls in Categories:
    directory = r"G:\data_pic\rapho/"+cls
    if os.path.isdir(directory):
        imags = []
        imgs_directory = []
        for i, filename in enumerate(os.listdir(directory)):
            image_format = filename.split('.')[-1]  # get image format (eg. png or jpg, etc.)
            if image_format in ('jpg','png'):
                addr = directory+"/"+filename
                image = cv2.imread(addr)
                imags.append(image)
                imgs_directory.append(addr)
        images[cls] = imags
        images_directory[cls] = imgs_directory

In [37]:
images_directory

{'trainx': ['G:\\data_pic\\rapho/trainx/raphidiopsis-549.jpg',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011118.png',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011137.png',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011151.png',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011209.png',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011232.png',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011250.png',
  'G:\\data_pic\\rapho/trainx/Screenshot 2024-08-10 011311.png']}

In [38]:
images

{'trainx': [array([[[203, 203, 203],
          [203, 203, 203],
          [203, 203, 203],
          ...,
          [203, 203, 203],
          [203, 203, 203],
          [203, 203, 203]],
  
         [[203, 203, 203],
          [203, 203, 203],
          [203, 203, 203],
          ...,
          [203, 203, 203],
          [203, 203, 203],
          [203, 203, 203]],
  
         [[203, 203, 203],
          [203, 203, 203],
          [203, 203, 203],
          ...,
          [203, 203, 203],
          [203, 203, 203],
          [203, 203, 203]],
  
         ...,
  
         [[209, 204, 205],
          [209, 204, 205],
          [209, 204, 205],
          ...,
          [206, 201, 202],
          [206, 201, 202],
          [206, 201, 202]],
  
         [[209, 204, 205],
          [209, 204, 205],
          [209, 204, 205],
          ...,
          [206, 201, 202],
          [206, 201, 202],
          [206, 201, 202]],
  
         [[209, 204, 205],
          [209, 204, 205],
          [209

In [39]:
countofallimages = 0
for cls in Categories:
  print(f'Number of Images in {cls} class is = {len(images[cls])}')
  countofallimages += len(images[cls])

Number of Images in trainx class is = 8


In [41]:
import cv2
import numpy as np

X = []       # X stores images of train dataset
y = []       # y stores labels of each image 
X_addr = []  # X_addr stores path of each image

desired_width = 150  # Set your desired width for the images
desired_height = 150 # Set your desired height for the images

# Resize images and populate X, y
for cls in Categories:
    for img in images[cls]:
        # Resize the image to a uniform shape
        resized_img = cv2.resize(img, (desired_width, desired_height))
        X.append(resized_img)
        y.append(cls)

# Populate X_addr
for cls in Categories:
    for img_path in images_directory[cls]:
        X_addr.append(img_path)

# Convert lists to numpy arrays
y = np.array(y)
X = np.array(X)  # All images should now have the same shape
X_addr = np.array(X_addr)



In [44]:
import os
import cv2
import numpy as np

# Assuming X, y, and X_addr are already populated

for i, img in enumerate(X):
    # Create the directory if it doesn't exist
    directory = f'G:\data_pic/rapho/Trainyy/{y[i]}'
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    
    # Extract the filename and ensure it has an extension
    filename = os.path.basename(X_addr[i])  # Extracts just the filename with extension
    address = os.path.join(directory, filename)  # Combine directory and filename

    # Process the image
    new_X = expand2square(img, calculatebackground(img))
    
    # Save the image and handle any errors
    success = cv2.imwrite(address, np.array(new_X))
    if success:
        print(f"Saved: {address}")
    else:
        print(f"Failed to save: {address}")


Saved: G:\data_pic/rapho/Trainyy/trainx\raphidiopsis-549.jpg
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011118.png
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011137.png
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011151.png
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011209.png
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011232.png
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011250.png
Saved: G:\data_pic/rapho/Trainyy/trainx\Screenshot 2024-08-10 011311.png


------------------

#### *Test Dataset*

In [12]:
images = {}
images_directory = {}
for cls in Categories:
    directory = "../dataset/Test/"+cls
    if os.path.isdir(directory):
        imags = []
        imgs_directory = []
        for i, filename in enumerate(os.listdir(directory)):
            image_format = filename.split('.')[-1]  # get image format (eg. png or jpg, etc.)
            if image_format in ('jpg','png'):
                addr = directory+"/"+filename
                image = cv2.imread(addr)
                imags.append(image)
                imgs_directory.append(addr)
        images[cls] = imags
        images_directory[cls] = imgs_directory

In [13]:
countofallimages = 0
for cls in Categories:
  print(f'Number of Images in {cls} class is = {len(images[cls])}')
  countofallimages += len(images[cls])

Number of Images in Anabaena class is = 57
Number of Images in Aphanizomenon class is = 4
Number of Images in Cylindrospermopsis class is = 3
Number of Images in Dolichospermum class is = 33
Number of Images in Microcystis class is = 235
Number of Images in Nostoc class is = 73
Number of Images in Oscillatoria class is = 32
Number of Images in Phormidium class is = 61
Number of Images in Planktothrix class is = 19
Number of Images in Raphidiopsis class is = 2


In [14]:
X = []   # X stores images of test dataset
y = []   # y stores labels of each image 
X_addr = []  # X_addr stores path of each image
for cls in Categories:
  for img in images[cls]:
    X.append(img)
    y.append(cls)
for cls in Categories:
  for img in images_directory[cls]:
    X_addr.append(img)
y = np.array(y)
X = np.array(X)
X_addr = np.array(X_addr)

  X = np.array(X)


In [15]:
for i , img in enumerate(X):
    try:
        os.mkdir(f'../dataset-processed/Test/{y[i]}')
    except:
        pass
    address = f'../dataset-processed/Test/{y[i]}/{X_addr[i][17:].split("/")[1]}'
    new_X = expand2square(img,calculatebackground(img))
    cv2.imwrite(address, np.array(new_X))

-----------