# Define parameters for preprocessing new images

*This notebook includes a short analysis of the images used in the model-building process to determine the parameters for preprocessing new images for the image recognition process.*  

In [None]:
# import necessary libraries
import cv2
import glob
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# lift pandas' display truncation limits (column width, row count, column count)
for option_name in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

### Analysis of kithara sizes in proportion to image sizes

In [None]:
# load the listing of image sizes and kithara sizes from CSV
listing_csv = '../data/kithara_listing.csv'
kitharai = pd.read_csv(listing_csv)
# summarize the columns, dtypes and non-null counts
kitharai.info()

In [None]:
# Drop the rows whose image dimensions are recorded as '?' instead of a number.
# Rows are selected by content rather than by hard-coded position (345-348), so
# re-running this cell cannot silently drop additional, valid rows, and the cell
# keeps working if rows are added to or removed from the CSV.
# NOTE(review): assumes the '?' placeholders sit in image_height / image_width
# (the two columns that are converted with pd.to_numeric afterwards) -- confirm
# against the CSV. Rows with missing values in those columns are dropped as well.
height_is_number = pd.to_numeric(kitharai['image_height'], errors='coerce').notna()
width_is_number = pd.to_numeric(kitharai['image_width'], errors='coerce').notna()
# .copy() so that later column assignments act on an independent frame, not a slice
kitharai = kitharai.loc[height_is_number & width_is_number].copy()
# check to make sure the data are cleaned
kitharai.loc[344:]

In [None]:
# Cast the image dimensions to a numeric dtype, then derive the labeled kithara
# width and height as proportions of the full image dimensions.
kitharai = kitharai.assign(
    image_height=lambda df: pd.to_numeric(df['image_height']),
    image_width=lambda df: pd.to_numeric(df['image_width']),
    width_proportion=lambda df: df['kithara_width'] / df['image_width'],
    height_proportion=lambda df: df['kithara_height'] / df['image_height'],
)

#### Take a look at the distribution of sizes and proportions

In [None]:
# Histogram of the kithara_width column; a title and axis labels are set so the
# figure can be read on its own.
ax = kitharai.kithara_width.hist(bins=20)
ax.set(title='Distribution of kithara widths', xlabel='kithara_width', ylabel='Number of rows');

In [None]:
# Histogram of the kithara_height column; a title and axis labels are set so the
# figure can be read on its own.
ax = kitharai.kithara_height.hist(bins=20)
ax.set(title='Distribution of kithara heights', xlabel='kithara_height', ylabel='Number of rows');

In [None]:
# Histogram of kithara width as a fraction of image width; a title and axis
# labels are set so the figure can be read on its own.
ax = kitharai.width_proportion.hist(bins=10)
ax.set(
    title='Distribution of kithara width proportions',
    xlabel='kithara_width / image_width',
    ylabel='Number of rows',
);

In [None]:
# Histogram of kithara height as a fraction of image height; a title and axis
# labels are set so the figure can be read on its own.
ax = kitharai.height_proportion.hist(bins=10)
ax.set(
    title='Distribution of kithara height proportions',
    xlabel='kithara_height / image_height',
    ylabel='Number of rows',
);

*To normalize for the wide range of kithara and image sizes across the dataset, let's use the kithara:image width and height proportions.*

In [None]:
# print summary statistics for the height proportions, then the width proportions
for proportion_col in ('height_proportion', 'width_proportion'):
    print(kitharai[proportion_col].describe())

In [None]:
# find the bin edges at every 5th percentile (20 equal-frequency bins);
# only the edges returned via retbins are kept, the binned series is discarded
_, hbins20 = pd.qcut(kitharai.height_proportion, 20, retbins=True)
_, wbins20 = pd.qcut(kitharai.width_proportion, 20, retbins=True)

In [None]:
# show each heading followed, on the next line, by its 5% bin edges
print('height 5% bins:', hbins20, sep='\n')
print('width 5% bins:', wbins20, sep='\n')

#### Based on the quartile and decile breakdowns of kithara `width_proportion` and `height_proportion`, the percentages used will be the averages of the height (h) and width (w) proportions at each percentile:
* 10th %ile h: 3.4%, w: 2.8%, average: 3.1%
* 25th %ile h: 4.7%, w: 3.7%, average: 4.2%
* 50th %ile h: 6.1%, w: 5.5%, average: 5.8%
* 75th %ile h: 9.4%, w: 8.1%, average: 8.8%
* 90th %ile h: 13.7%, w: 12%, average: 12.9%