# Preprocess an image and randomly select cropped images to test

*The first part of this notebook prepares images of vases located in the `data/test` folder for the image recognition process. The second part of this notebook selects the images that will be used for model performance demonstration.*

In [None]:
# import necessary libraries
import cv2
import glob
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
import csv
import shutil

### Part I: The following function chops up each image into multiple equally-sized, square pictures and resizes them to suit the trained image recognition model

*Since we don't know **if** there is a kithara in the image, nor, if there is one, **how big** it might be:*
* Identify the decile and quartile breakdowns of the proportion of image width and height that kitharai take up on the labeled dataset
* Based on a square box sized to the 25th, 50th, and 75th percentile of the `proportion of image width * total image width` and `proportion of image height * total image height`, using whichever of the two is larger (see `demo-00_analysis-notes` notebook for percentile breakdowns), chop each image up starting from the top-left corner (`multiply_image`)
* Shift the x and y start coordinates by 1/2 of the calculated image box size at a time

*To clean up the data generated:*
* Save cropped images where the x and y axis are the same length (i.e. remove 'remainder' images)
* Resize all of the square images to the same dimensions (specifically, upscale and downscale images to 558 pixels - the size of the training images)

In [None]:
# for each image in the folder
# get the height and width of each image
# iterate over the image and create equally-sized squares
# resize the images to the model input size
# write cropped images out to separate folders

def multiply_image(name):
    """Tile one test image into square crops and save each one at 558x558.

    Reads ``../data/test/<name>.jpg`` and, for each of three window sizes,
    slides a square window across the image from the top-left corner in
    steps of half the window size. Windows that run past an image edge are
    clipped; a clipped window is kept only if it still happens to be square.
    Every kept crop is resized to 558x558 pixels and written to
    ``../data/test/processed/<name>-<size>-<start_y>-<start_x>.jpg``.

    Args:
        name: File name of the source image without the ``.jpg`` extension.

    Raises:
        OSError: If the source image is missing or cannot be decoded.
    """
    # side length, in pixels, that every saved crop is resized to
    target_side = 558
    out_dir = "../data/test/processed"

    # read in the image; cv2.imread returns None instead of raising on failure
    src_path = "../data/test/{}.jpg".format(name)
    img = cv2.imread(src_path)
    if img is None:
        raise OSError("could not read image: {}".format(src_path))
    # get the size of the image (only the first two axes are needed)
    y_max, x_max = img.shape[:2]

    # cv2.imwrite does not create missing folders, so make sure it exists
    os.makedirs(out_dir, exist_ok=True)

    # proportion of image height / width at the 25th, 50th and 75th %ile
    # (the 10th %ile values, y 0.034375 / x 0.02795573, and the 90th %ile
    # values, y 0.13705848 / x 0.11984522, are intentionally left unused)
    y_props = (0.047212, 0.061202, 0.093640)
    x_props = (0.037402, 0.05514323, 0.081055)
    # set each window size based on the larger of x or y
    sizes = [
        max(int(x_max * x_prop), int(y_max * y_prop))
        for x_prop, y_prop in zip(x_props, y_props)
    ]

    for size in sizes:
        if size < 1:
            # image too small for this percentile to yield a usable window
            continue
        # move forward by 1/2 the window size; never let the step reach 0,
        # which range() rejects
        inc = max(1, size // 2)
        for start_y in range(0, y_max, inc):
            # clip the window at the bottom edge
            end_y = min(start_y + size, y_max)
            for start_x in range(0, x_max, inc):
                # clip the window at the right edge
                end_x = min(start_x + size, x_max)
                # only square crops are saved ('remainder' strips are dropped)
                if (end_y - start_y) != (end_x - start_x):
                    continue
                cropped_img = img[start_y:end_y, start_x:end_x]
                # the crop is square, so resize straight to the target size
                # rather than scaling each side by a float ratio, which can
                # truncate to one pixel short of 558
                cropped_img = cv2.resize(
                    cropped_img,
                    (target_side, target_side),
                    interpolation=cv2.INTER_AREA,
                )
                # write the resized image out
                cv2.imwrite(
                    "../data/test/processed/{}-{}-{}-{}.jpg".format(
                        name, size, start_y, start_x
                    ),
                    cropped_img,
                )

#### Run the function over the image test data

In [None]:
# tile every .jpg found directly inside the test image folder
for path in glob.glob("../data/test/*.jpg"):
    # multiply_image expects the bare file name, without folder or extension
    name, _ = os.path.splitext(os.path.basename(path))
    multiply_image(name)

### Part II: Randomly select 10% of the images for demo testing

In [None]:
# write out the list of files in the processed images dataset in the shell:
# navigate to ../data/test/processed and 'ls' >> ../processed-image-list.csv
# A plain 'ls' listing has no header row, so read it with header=None;
# otherwise the first file name is consumed as the column label and is
# silently left out of the data.
demo_ds = pd.read_csv(
    '../data/test/processed-image-list.csv',
    header=None,
    names=['filename'],
)

In [None]:
# display the loaded table of processed image file names as a quick visual check
demo_ds

In [None]:
# hold out 10% of the rows for the demo; the fixed random_state keeps the
# selection reproducible between runs
# NOTE(review): the output file name says "20pct" although test_size is 0.1;
# the name is kept because the copy step reads this exact path
demo_ds1, demo_ds2 = tts(
    demo_ds,
    test_size=0.1,
    random_state=123,
)
demo_ds2.to_csv(
    '../data/test/processed_images_20pct.csv',
    index=False,
)

In [None]:
# copy the demo images to a new folder
# build both folder paths once, using os.path.join so the separators match
# the host operating system instead of being hard-coded for Windows
dir_src = os.path.join(os.path.dirname(os.getcwd()), 'data', 'test', 'processed')
dir_dst = os.path.join(dir_src, 'demo')
# shutil.copy does not create the destination folder
os.makedirs(dir_dst, exist_ok=True)

with open('../data/test/processed_images_20pct.csv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    # the CSV is written with DataFrame.to_csv, so its first row is a column
    # header rather than one of the selected images
    next(reader, None)
    for img in reader:
        if not img:
            # skip blank lines instead of failing on img[0]
            continue
        name = img[0]
        src_file = os.path.join(dir_src, name)
        dst_file = os.path.join(dir_dst, name)
        try:
            shutil.copy(src_file, dst_file)
        except OSError as err:
            # report the file that could not be copied and carry on with the rest
            print('could not copy {}: {}'.format(src_file, err))

print('preprocessing complete')