# Apples, Plums, and Tomatoes! Oh My! - Image Classification Technology for Grocery Stores

In [11]:
#Load in the packages
import pandas as pd
import os, shutil
import time
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.image import imread
import scipy
import numpy as np
from PIL import Image
from scipy import ndimage
from keras.preprocessing.image import ImageDataGenerator, array_to_img
from keras import layers
from keras import models
from keras import optimizers
import datetime

#Set random state for reproducibility
np.random.seed(123)


### Data Cleaning 
###### The data cleaning stage consisted of 3 main steps. First, outside of the Jupyter Notebook, delete the subfolders labeled "Apple D", "Apple E", "Apple F", and "Total Number of Apples". This is done to fix any imbalance across the classes. Next, we must extract the individual raw image files from the data import. It's important to access individual filenames because it allows the images to be mapped to the correct class label in a Pandas DataFrame. This DataFrame is considered the "source of truth" for our model. Once we have the DataFrame established, we will set up a train, validate, test folder structure, with sub-directories based on class,  to form the foundation of our image classification model.

In [12]:
#Locate the root directory we will be working with
root_dir = os.getcwd()
root_dir

'C:\\Users\\acathcart\\Documents\\AI Academy\\Fruit Classification'

In [13]:
#Find the path to the raw image data folders
apple = root_dir+'\\apple'
plum = root_dir+'\\plum'
tomato = root_dir+'\\tomato'

In [14]:
#Access the contents of the respective raw image data folders
apple_dir = os.listdir(apple)
plum_dir = os.listdir(plum)
tomato_dir = os.listdir(tomato)

#Create a list for the respective fruits to hold the underlying image filenames
apple_fn = []
for i in apple_dir:
    apple_fn.append(i)
    
plum_fn = []
for i in plum_dir:
    plum_fn.append(i)

tomato_fn = []
for i in tomato_dir:
    tomato_fn.append(i)

#Combine filename lists into one comprehensive list
all_fn = apple_fn + plum_fn + tomato_fn

#Create a list for the respective fruits to hold the underlying correct classifications
apple_class = []
for i in apple_dir:
    apple_class.append('apple')
    
plum_class = []
for i in plum_dir:
    plum_class.append('plum')
    
tomato_class = []
for i in tomato_dir:
    tomato_class.append('tomato')
    
#Combine classification lists into one comprehensive list    
all_class = apple_class + plum_class + tomato_class

In [15]:
#Create a Pandas DataFrame to hold image IDs and correct classification
data_manual = pd.DataFrame()
data_manual['id'] = all_fn
data_manual['class'] = all_class

In [16]:
#Quickly spot check the DataFrame to see that the images have been stored correctly
data_manual

Unnamed: 0,id,class
0,102red applee00901102.png,apple
1,103red applee00916103.png,apple
2,107red applee01001107.png,apple
3,108red applee01006108.png,apple
4,109red applee01021109.png,apple
...,...,...
6898,Tamotoes00995.png,tomato
6899,Tamotoes00996.png,tomato
6900,Tamotoes00997.png,tomato
6901,Tamotoes00998.png,tomato


In [17]:
#Create directories for our train, validate, and test sets
dir_names = ['train', 'validate', 'test']
for group in dir_names:
    new_dir = os.path.join(root_dir, group)
    os.mkdir(new_dir)

for fruit in ['apple', 'plum', 'tomato']:
# Create sub_directories by fruit type
    for group in dir_names:
        new_dir = os.path.join(root_dir, group, fruit)
        os.mkdir(new_dir)

## Splitting Data Into Train, Validate, Test Sets
#### We will randomly assign the raw images into train, validate, and test sets to safeguard the model from overfitting and give a more accurate evaluation of the model's performance. To do so, we will distribute the images along an 80:10:10 train : validate : test ratio.

In [18]:
#Split the apple images into our train, validate, and test sets
print('Moving {} pictures.'.format('Apple'))
apple_df = data_manual[data_manual['class'] == 'apple']
train_apple, validate_apple, test_apple = np.split(apple_df.sample(frac=1), [int(.8*len(apple_df)), int(.9*len(apple_df))])
print('Split {} imgs into {} train, {} val, and {} test examples.'.format(len(apple_df),
                                                                              len(train_apple),
                                                                              len(validate_apple),
                                                                              len(test_apple)))

#Copy apple images to their correct directory & sub_directory   
for i, temp in enumerate([train_apple]):
    for row in train_apple.index:
        filename = apple_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'apple' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'train' + '\\' + 'apple' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([validate_apple]):
    for row in validate_apple.index:
        filename = apple_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'apple' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'validate' + '\\' + 'apple' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([test_apple]):
    for row in test_apple.index:
        filename = apple_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'apple' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'test' + '\\' + 'apple' + '\\' + filename)
        shutil.copy(origin, destination)

Moving Apple pictures.
Split 2434 imgs into 1947 train, 243 val, and 244 test examples.


In [19]:
#Split the plum images into our train, validate, and test sets
print('Moving {} pictures.'.format('Plum'))
plum_df = data_manual[data_manual['class'] == 'plum']
train_plum, validate_plum, test_plum = np.split(plum_df.sample(frac=1), [int(.8*len(plum_df)), int(.9*len(plum_df))])
print('Split {} imgs into {} train, {} val, and {} test examples.'.format(len(plum_df),
                                                                              len(train_plum),
                                                                              len(validate_plum),
                                                                              len(test_plum)))

#Copy plum images to their correct directory & sub_directory   
for i, temp in enumerate([train_plum]):
    for row in train_plum.index:
        filename = plum_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'plum' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'train' + '\\' + 'plum' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([validate_plum]):
    for row in validate_plum.index:
        filename = plum_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'plum' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'validate' + '\\' + 'plum' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([test_plum]):
    for row in test_plum.index:
        filename = plum_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'plum' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'test' + '\\' + 'plum' + '\\' + filename)
        shutil.copy(origin, destination)

Moving Plum pictures.
Split 2298 imgs into 1838 train, 230 val, and 230 test examples.


In [20]:
#Split the tomato images into our train, validate, and test sets
print('Moving {} pictures.'.format('Tomato'))
tomato_df = data_manual[data_manual['class'] == 'tomato']
train_tomato, validate_tomato, test_tomato = np.split(tomato_df.sample(frac=1), [int(.8*len(tomato_df)), int(.9*len(tomato_df))])
print('Split {} imgs into {} train, {} val, and {} test examples.'.format(len(plum_df),
                                                                              len(train_tomato),
                                                                              len(validate_tomato),
                                                                              len(test_tomato)))

#Copy tomato images to their correct directory & sub_directory   
for i, temp in enumerate([train_tomato]):
    for row in train_tomato.index:
        filename = tomato_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'tomato' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'train' + '\\' + 'tomato' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([validate_tomato]):
    for row in validate_tomato.index:
        filename = tomato_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'tomato' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'validate' + '\\' + 'tomato' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([test_tomato]):
    for row in test_tomato.index:
        filename = tomato_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'tomato' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'test' + '\\' + 'tomato' + '\\' + filename)
        shutil.copy(origin, destination)

Moving Tomato pictures.
Split 2298 imgs into 1736 train, 217 val, and 218 test examples.


In [21]:
#Define more convinient way of calling path to train, validate, and test directories
train_dir = '{}\\train\\'.format(root_dir)
validate_dir = '{}\\validate\\'.format(root_dir)
test_dir = '{}\\test\\'.format(root_dir)

## Building the Model
#### We will be employing a pre-trained CNN (VGG19) to form the base of our model. This base allows our model to be significantly more robust in that it is now built upon a 19-layer deep neural network. 

In [22]:
# Initialize Base
from keras.applications import VGG19
cnn_base = VGG19(weights='imagenet',
                 include_top=False,
                 input_shape=(240, 240, 3))

# Define Model Architecture (AI Academy / Modules / " " Lab)
model = models.Sequential()
model.add(cnn_base)
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

#Freeze the layers
cnn_base.trainable = False

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg19 (Functional)           (None, 7, 7, 512)         20024384  
_________________________________________________________________
flatten (Flatten)            (None, 25088)             0         
_________________________________________________________________
dense (Dense)                (None, 64)                1605696   
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 3

## Training the Model
#### This code chunk was inspired by the M4. Image Classification - Lab (https://github.com/learn-co-curriculum/dsc-image-classification-lab/tree/solution)

In [23]:
start = datetime.datetime.now()

# All images will be rescaled by 1/255
datagen = ImageDataGenerator(rescale=1/255)

#Generate batches of normalized data
train_generator = datagen.flow_from_directory(train_dir,
                                                    target_size=(240, 240),
                                                    batch_size=50,
                                                    class_mode='categorical')

validate_generator = datagen.flow_from_directory(validate_dir, 
                                                        target_size=(240, 240), 
                                                        batch_size=50, 
                                                        class_mode='categorical')

test_generator = datagen.flow_from_directory(test_dir,
                                                  target_size=(240, 240),
                                                  batch_size=692,
                                                  class_mode='categorical')

test_images, test_labels = next(test_generator)

#Compilation
model.compile(loss='categorical_crossentropy',
              optimizer = optimizers.RMSprop(lr=2e-5), #Learning rate: must be small to avoid overfitting
              metrics=['acc'])

#Fitting the Model
history = model.fit_generator(train_generator,
                             steps_per_epoch=10,
                             epochs=5,
                             validation_data=validate_generator,
                             validation_steps=10)
end = datetime.datetime.now()
duration = end - start
print('Training took {}'.format(duration))

Found 5521 images belonging to 3 classes.
Found 690 images belonging to 3 classes.
Found 692 images belonging to 3 classes.
Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training took 0:09:06.710486


In [24]:
#Check batch index
train_generator.class_indices

{'apple': 0, 'plum': 1, 'tomato': 2}

## Testing the Model

In [26]:
#Evaluating the model's performance using the test set
test_generator = datagen.flow_from_directory(test_dir,
                                                  target_size=(240, 240),
                                                  batch_size=50,
                                                  class_mode='categorical',
                                                  shuffle=False)

test_loss, test_acc = model.evaluate(test_generator, steps=10)
y_hat_test = model.predict(test_generator, steps=10)
print('Generated {} predictions'.format(len(y_hat_test)))
print('test acc:', test_acc)

Found 692 images belonging to 3 classes.
Generated 500 predictions
test acc: 0.8939999938011169
