# Data preprocessing

## Imports and helper functions

In [23]:
import imgaug as ia
import imgaug.augmenters as iaa
import skimage.io as io
import os
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import skimage.io as ioimage_stack
import glob
import uuid

from skimage import data, color
from skimage.transform import rescale, resize, downscale_local_mean
from PIL import Image
from tqdm import tqdm


AUTOTUNE = tf.data.experimental.AUTOTUNE
data_dir = "E:/Datasets/coral-classifier/few-v-rest"
augmented_data_dir = "E:/Datasets/coral-classifier/augmented_dataset-v2"
CLASS_NAMES = os.listdir(data_dir)
dirs = [data_dir + "/" + c for c in CLASS_NAMES ]

BATCH_SIZE = 32
IMG_HEIGHT = 224
IMG_WIDTH = 224


def to_jpg_from(dateset_dir, classes, from_extension="jfif"):
    for cl in classes:
        for file in tqdm(glob.glob(f"{dateset_dir}/{cl}/*.{from_extension}")):
            im = Image.open(file)
            rgb_im = im.convert('RGB')
            rgb_im.save(file.replace("."+from_extension, "_" + str(uuid.uuid4()) + ".jpg"), quality=100)


def rename_files(dateset_dir, classes, extension="jpg"):
    for cl in classes:
        for i, file in tqdm(enumerate(glob.glob(f"{dateset_dir}/{cl}/*.{extension}"))):
            num = str(i).rjust(6, "0")
            cl_name = cl.replace(" ", "")
            os.rename(file, f"{dateset_dir}/{cl}/{cl_name}_{num}.{extension}")
            
    
def show_batch(image_batch, label_batch):
    plt.figure(figsize=(15,15))
    for n in range(25):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch[n])
        plt.title(CLASS_NAMES[label_batch[n]==1][0].title())
        plt.axis('off')
        
        
def get_label(file_path):
    # convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The second to last is the class-directory
    return parts[-2] == CLASS_NAMES


def decode_img(img):
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # Use `convert_image_dtype` to convert to floats in the [0,1] range.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize the image to the desired size.
    return tf.image.resize(img, [IMG_WIDTH, IMG_HEIGHT])


def process_path(file_path):
    label = get_label(file_path)
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label


def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    # This is a small dataset, only load it once, and keep it in memory.
    # use `.cache(filename)` to cache preprocessing work for datasets that don't
    # fit in memory.
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # Repeat forever
    ds = ds.repeat()
    ds = ds.batch(BATCH_SIZE)
    # `prefetch` lets the dataset fetch batches in the background while the model
    # is training.
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

## Unify extensions

In [24]:
to_jpg_from(data_dir, CLASS_NAMES, from_extension="png")

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]

processing class 'lps acanthastrea'
processing class 'lps chalice'
processing class 'lps euphyllia'
processing class 'other'
processing class 'sps acropora'
processing class 'sps montipora'
processing class 'zoa'





## Unify filenames

In [20]:
rename_files(data_dir, CLASS_NAMES)

2061it [00:00, 4700.00it/s]
2353it [00:00, 4701.06it/s]
1910it [00:00, 4610.14it/s]
3735it [00:00, 5117.44it/s]
6865it [00:01, 3974.72it/s]
2173it [00:01, 1685.59it/s]
2542it [00:00, 5738.15it/s]


## Train, Test and Validation split

In [47]:
class_images = {}

for cl, data_path in zip(CLASS_NAMES, dirs):
    class_images[cl] = np.array(io.collection.glob(data_path + "/*.jpg"))

counts = [len(value) for key, value in class_images.items()]
min_count = min(counts)

from sklearn.model_selection import train_test_split
sets = {}
for key, value in class_images.items():
    X = value
    y = [key for item in value]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1755, random_state=1)
        
    sets[key + "_train"] = X_train
    sets[key + "_test"] = X_test
    sets[key + "_val"] = X_val

In [48]:
for key, value in sets.items():
    print(key, "->", len(value))

lps acanthastrea_train -> 1443
lps acanthastrea_test -> 310
lps acanthastrea_val -> 308
lps chalice_train -> 1649
lps chalice_test -> 353
lps chalice_val -> 351
lps euphyllia_train -> 1338
lps euphyllia_test -> 287
lps euphyllia_val -> 285
other_train -> 2616
other_test -> 561
other_val -> 558
sps acropora_train -> 4810
sps acropora_test -> 1030
sps acropora_val -> 1025
sps montipora_train -> 1522
sps montipora_test -> 326
sps montipora_val -> 325
zoa_train -> 1780
zoa_test -> 382
zoa_val -> 380


## Move files to train, test and val dirs

In [53]:
output_dir = "E:/Datasets/coral-classifier/train_ready_dataset"

set_folders = ["train", "test", "val"]

# make sure that dirs exist

try:
    os.mkdir(output_dir)
except Exception:
    print(f"'{output_dir}' already exists")

for set_folder in set_folders:
    p = f"{output_dir}/{set_folder}"
    try:
        os.mkdir(p)
    except Exception:
        print(f"'{p}' already exists")
    for cl in CLASS_NAMES:
        p = f"{output_dir}/{set_folder}/{cl}"
        try:
            os.mkdir(p)
        except Exception:
            print(f"'{p}' already exists")

'E:/Datasets/coral-classifier/train_ready_dataset' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/lps acanthastrea' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/lps chalice' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/lps euphyllia' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/other' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/sps acropora' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/sps montipora' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/train/zoa' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/test' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/test/lps acanthastrea' already exists
'E:/Datasets/coral-classifier/train_ready_dataset/test/lps chalice' already exists
'E:/Datasets/coral-classifier/train_

In [60]:
from shutil import copy
dest_path_base = "E:/Datasets/coral-classifier/train_ready_dataset"


for key, value in sets.items():
    splitted = key.split("_")
    cl = splitted[0]
    set_folder = splitted[1]
    
    for file_path in tqdm(value):
        file_name = file_path.split("\\")[-1]
        copy(file_path, f"{dest_path_base}/{set_folder}/{cl}")

100%|████████████████████████████████████████████████████████████████████████████| 1443/1443 [00:00<00:00, 1779.19it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 310/310 [00:02<00:00, 111.38it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 308/308 [00:02<00:00, 106.97it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1649/1649 [00:26<00:00, 62.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 353/353 [00:05<00:00, 67.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 351/351 [00:05<00:00, 63.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1338/1338 [00:21<00:00, 62.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 287/287 [00:04<00:00, 66.47it/s]
100%|███████████████████████████████████

## Data augmentation