# Data preprocessing

This notebook covers the steps used to generate the image dataset. It uses a pre-existing dataset containing images of stumps and, for each image, a corresponding mask labelling areas as background (black), stump (white) and RBR (red). The process assumes that each mask shares its filename with the image it is associated with.

The following libraries are used.

In [None]:
# File manipulation
import os

# Data manipulation
import numpy as np
import pandas as pd

# Image manipulation
import cv2
from PIL import Image

## Image cropping

Define a function to return the bounding box from a mask.

In [None]:
def bbox(mask):
    """Return the bounding box of the non-zero region of a mask.

    Parameters
    ----------
    mask : array with at least two dimensions
        Axis 0 is treated as rows and axis 1 as columns; any further axes
        (e.g. colour channels) only contribute to deciding which pixels
        are non-zero.

    Returns
    -------
    tuple
        (row_min, row_max, col_min, col_max). The max values are the
        indices of the last non-zero row/column, i.e. they are inclusive.

    Raises
    ------
    ValueError
        If the mask contains no non-zero values.
    """
    # One index array per axis, covering every non-zero element
    hits = np.nonzero(mask != 0)
    top, bottom = hits[0].min(), hits[0].max()
    left, right = hits[1].min(), hits[1].max()
    return top, bottom, left, right

Crop each image and its mask to the bounding box of the mask.

In [None]:
# Get a list of the images and masks. os.listdir returns entries in arbitrary
# order, so sort them to make the numbering of the output files reproducible.
img_list = sorted(os.listdir('data/orig/'))
msk_list = sorted(os.listdir('data/orig_mask/'))

# Make sure the output folders exist before saving into them
os.makedirs('data/crop_mask/', exist_ok=True)
os.makedirs('data/crop/', exist_ok=True)

for idx, mask_name in enumerate(msk_list):
    # Read in mask and image as Numpy arrays (the image is looked up by the mask's filename)
    msk = np.array(Image.open('data/orig_mask/' + mask_name))
    img = np.array(Image.open('data/orig/' + mask_name))

    # Get stump boundaries from mask. The result is unpacked into its own
    # names so that the bbox function is not overwritten and the cell can be re-run.
    row_min, row_max, col_min, col_max = bbox(msk)

    # bbox returns the index of the last non-zero row/column, whereas a slice
    # excludes its end point, hence the + 1 to keep that last row/column.
    rows = slice(row_min, row_max + 1)
    cols = slice(col_min, col_max + 1)

    # Crop mask and image
    msk_crp = Image.fromarray(msk[rows, cols])
    img_crp = Image.fromarray(img[rows, cols])

    # Save cropped mask and image to file under a shared zero-padded name
    out_name = str(idx).zfill(4) + '.jpg'
    msk_crp.save('data/crop_mask/' + out_name)
    img_crp.save('data/crop/' + out_name)

## Apply histogram equalisation

Create a new image dataset which has been normalised using histogram equalisation.

In [None]:
# cv2.imwrite does not create missing folders, so make sure the target exists
os.makedirs('data/equ_crop/', exist_ok=True)

crp_img_list = os.listdir('data/crop/')

# Iterate through image list applying histogram equalisation
for image in crp_img_list:
    # cv2.imread returns None rather than raising when a file cannot be read
    img = cv2.imread('data/crop/' + image)
    if img is None:
        raise IOError('Could not read image: data/crop/' + image)

    # Convert to yuv format (OpenCV loads colour images in BGR channel order)
    img_yuv = cv2.cvtColor(img, cv2.COLOR_BGR2YUV)

    # Equalise the histogram of the y channel only; the u and v channels are left as they are
    img_yuv[:,:,0] = cv2.equalizeHist(img_yuv[:,:,0])

    # Convert back to BGR, the channel order cv2.imwrite expects
    img_bgr = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2BGR)

    # Save histogram equalised image; cv2.imwrite reports failure through its return value
    if not cv2.imwrite('data/equ_crop/' + image, img_bgr):
        raise IOError('Could not write image: data/equ_crop/' + image)

## Target extraction

Extract the target values - presence of RBR and %RBR - from the cropped mask.

In [None]:
# Keep the on-disk names for reading and the lower-cased names for the index,
# so that reading still works on case-sensitive file systems.
msk_files = sorted(os.listdir('data/crop_mask/'))
crp_msk_list = [s.lower() for s in msk_files]

columns = ['rbr', 'percentage']

# Iterate though the masks list extracting %RBR, collecting one record per mask
records = []
for msk_file, mask in zip(msk_files, crp_msk_list):
    # cv2.imread returns None rather than raising when a file cannot be read
    msk = cv2.imread('data/crop_mask/' + msk_file)
    if msk is None:
        raise IOError('Could not read mask: data/crop_mask/' + msk_file)

    # Threshold the mask to remove aliasing: channel values above 100 become 200, the rest 0
    _, thresh = cv2.threshold(msk, 100, 200, cv2.THRESH_BINARY)

    # Count the number of stump (white) and RBR (red) pixels.
    # OpenCV stores channels as BGR, so red is (0, 0, 200) after thresholding.
    clear = np.count_nonzero(cv2.inRange(thresh, (200,200,200), (255,255,255)))
    rbr = np.count_nonzero(cv2.inRange(thresh, (0,0,200), (20,20,255)))

    # Calculate %RBR as a fraction (0-1) of the stump + RBR area.
    # A mask with no stump or RBR pixels would give 0/0 = NaN, and NaN != 0
    # is True, which would wrongly flag the mask as containing RBR; record 0 instead.
    labelled = rbr + clear
    percentage = np.round(rbr / labelled, 4) if labelled else 0.0

    records.append({'filename': mask, 'percentage': percentage})

# Create a dataframe to store the results, built in one step from the records
df = pd.DataFrame(records, columns=['filename', 'percentage'])
df = df.set_index('filename').sort_index()

# Use %RBR to establish the presence of RBR
df['rbr'] = df['percentage'] != 0
df = df[columns]

# Save to file
os.makedirs('dataframes', exist_ok=True)
df.to_csv('dataframes/dataset.csv')