### Data processing

[Image data munging](#a) | [Image preprocessing](#b) 

In [22]:
# standard imports
import os, glob, fnmatch
import pandas as pd
import numpy as np

# image processing imports
import cv2
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

---
## <a name="a">Image data munging</a>
Creates a CSV file of per-image information — mean RGB and HSV values, plus image height & width.

In [6]:
# Root folder of the image dataset; every *.jpg file beneath it is collected.
DATA_DIR = '/Users/VanessaG/Desktop/pizza_class_data/'

# Create the list of all image filepaths.
# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order, so the paths are sorted to keep the row order of everything derived
# from `images` reproducible between runs and machines.
images = sorted(
    os.path.join(root, filename)
    for root, dirnames, filenames in os.walk(DATA_DIR)
    for filename in fnmatch.filter(filenames, '*.jpg')
)

In [7]:
# Load every image and min-max scale it to float32 values in [0, 1].
data = []
for image_path in images:
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable/corrupt file by returning None instead
        # of raising. Fail loudly with the offending path: silently skipping the
        # file would leave the per-image lists shorter than `images`.
        raise IOError('cv2.imread could not read image: {}'.format(image_path))
    # `dst` is a required positional argument in OpenCV >= 3; passing None lets
    # OpenCV allocate the output array itself.
    data.append(cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F))

# create lists for dataframe of image info - rgb, hsv, image height & width
rgb_means = []
hsv_means = []
img_height = []
img_width = []

for normalized in data:
    # cv2.mean returns one value per channel padded to four entries;
    # only the first three (the colour channels) are kept
    rgb_means.append(cv2.mean(cv2.cvtColor(normalized, cv2.COLOR_BGR2RGB))[:3])
    hsv_means.append(cv2.mean(cv2.cvtColor(normalized, cv2.COLOR_BGR2HSV))[:3])
    # array shape is (rows, columns, channels) -> (height, width, ...)
    img_height.append(normalized.shape[0])
    img_width.append(normalized.shape[1])

In [8]:
# build one dataframe per colour space from the per-image mean lists
rgb_columns = ['red', 'green', 'blue']
hsv_columns = ['hue', 'sat', 'val']
df1 = pd.DataFrame(data=rgb_means, columns=rgb_columns)
df2 = pd.DataFrame(data=hsv_means, columns=hsv_columns)

# place the RGB and HSV columns side by side (column-wise concatenation)
df = pd.concat((df1, df2), axis='columns')

In [9]:
# keep each image's full file path next to its stats - nice to have for displaying images in EDA
df = df.assign(full_path=images)

In [10]:
# binary target derived from the file path: 0 is not pizza, 1 is pizza
def _label_from_path(path):
    """Return 0 when the path contains a '/not_pizza/' folder, otherwise 1."""
    if '/not_pizza/' in path:
        return 0
    return 1


df['label'] = df['full_path'].map(_label_from_path)

In [11]:
# strip the dataset root from each path, leaving a shorter relative path
# (presumably to make it easy to eyeball that labels match their folders)
df['short_path'] = df['full_path'].replace(
    to_replace='/Users/VanessaG/Desktop/pizza_class_data/',
    value='',
    regex=True,
)

In [12]:
# attach the image dimensions, then derive the pixel count from them
df = df.assign(img_height=img_height, img_width=img_width)
df['total_px'] = df['img_height'] * df['img_width']

# final column layout: label first, then colour stats, dimensions and paths
column_order = [
    'label',
    'red', 'green', 'blue',
    'hue', 'sat', 'val',
    'img_height', 'img_width', 'total_px',
    'short_path', 'full_path',
]
df = df[column_order]

In [13]:
# write the image-info table to disk, relative to the current working directory
csv_path = 'image_info.csv'
df.to_csv(csv_path)

---
## <a name="b">Image preprocessing</a>
An example of image preprocessing using the Keras `ImageDataGenerator` class.

In [19]:
# example of transformations, gathered as keyword options for ImageDataGenerator
augmentation_options = dict(
    # integer range in degrees (0-180) within which images are randomly rotated
    rotation_range=40,

    # factor applied to the pixel values
    rescale=1./255,

    # float ranges (as a fraction of total width or height) within which
    # images are randomly translated/shifted
    width_shift_range=0.2,
    height_shift_range=0.2,

    # randomly apply shearing transformations
    shear_range=0.2,

    # randomly zoom inside pictures
    zoom_range=0.2,

    # randomly flip half of the images horizontally - relevant when there is
    # no assumption of horizontal asymmetry, i.e. real-world pictures
    horizontal_flip=True,

    # strategy for filling newly created pixels
    fill_mode='nearest',
)

datagen = ImageDataGenerator(**augmentation_options)

In [20]:
# read the sample image from disk as a PIL image (keras helper function)
img = load_img('/Users/VanessaG/Desktop/pizza_class_data/train/pizza/40449.jpg')

# convert it to a numpy array and prepend a batch axis of size 1 -
# keras expects a leading batch dimension
x = np.expand_dims(img_to_array(img), axis=0)

In [21]:
# .flow() generates batches of randomly transformed images and saves the resulting images to the specified directory
output_dir = './transformations/'

# the images are written into save_to_dir, which has to exist already -
# create it up front so the cell also works on a fresh checkout
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

i = 0
for batch in datagen.flow(x, batch_size=1, save_to_dir=output_dir, save_prefix='pizza', save_format='jpg'):
    i += 1
    if i > 20:
        # otherwise the generator would loop indefinitely
        # (the check runs after the increment, so 21 batches are drawn in total)
        break