In [1]:
#Load in the packages
import pandas as pd
import os, shutil
import time
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.image import imread
import scipy
import numpy as np
from PIL import Image
from scipy import ndimage
from keras.preprocessing.image import ImageDataGenerator, array_to_img

#Set random state for reproducibility
np.random.seed(123)


In [2]:
#Locate the root directory we will be working with
root_dir = os.getcwd()
root_dir

'C:\\Users\\acathcart\\Documents\\AI Academy\\Fruit Classification'

In [3]:
#Find the path to the raw image data folders
apple = root_dir+'\\apple'
plum = root_dir+'\\plum'
tomato = root_dir+'\\tomato'

In [4]:
#Access the contents of the respective raw image data folders
apple_dir = os.listdir(apple)
plum_dir = os.listdir(plum)
tomato_dir = os.listdir(tomato)

#Create a list for the respective fruits to hold the underlying image filenames
apple_fn = []
for i in apple_dir:
    apple_fn.append(i)
    
plum_fn = []
for i in plum_dir:
    plum_fn.append(i)

tomato_fn = []
for i in tomato_dir:
    tomato_fn.append(i)

#Combine filename lists into one comprehensive list
all_fn = apple_fn + plum_fn + tomato_fn

#Create a list for the respective fruits to hold the underlying correct classifications
apple_class = []
for i in apple_dir:
    apple_class.append('apple')
    
plum_class = []
for i in plum_dir:
    plum_class.append('plum')
    
tomato_class = []
for i in tomato_dir:
    tomato_class.append('tomato')
    
#Combine classification lists into one comprehensive list    
all_class = apple_class + plum_class + tomato_class

In [5]:
#Create a Pandas DataFrame to hold image IDs and correct classification
data_manual = pd.DataFrame()
data_manual['id'] = all_fn
data_manual['class'] = all_class

In [6]:
#Quickly spot check the DataFrame to see that the images have been stored correctly
data_manual

Unnamed: 0,id,class
0,102red applee00901102.png,apple
1,103red applee00916103.png,apple
2,107red applee01001107.png,apple
3,108red applee01006108.png,apple
4,109red applee01021109.png,apple
...,...,...
6898,Tamotoes00995.png,tomato
6899,Tamotoes00996.png,tomato
6900,Tamotoes00997.png,tomato
6901,Tamotoes00998.png,tomato


In [7]:
#Create directories for our train, validate, and test sets
dir_names = ['train', 'validate', 'test']
for group in dir_names:
    new_dir = os.path.join(root_dir, group)
    os.mkdir(new_dir)

for fruit in ['apple', 'plum', 'tomato']:
# Create sub_directories by fruit type
    for group in dir_names:
        new_dir = os.path.join(root_dir, group, fruit)
        os.mkdir(new_dir)

In [8]:
#Split the apple images into our train, validate, and test sets
print('Moving {} pictures.'.format('Apple'))
apple_df = data_manual[data_manual['class'] == 'apple']
train_apple, validate_apple, test_apple = np.split(apple_df.sample(frac=1), [int(.8*len(apple_df)), int(.9*len(apple_df))])
print('Split {} imgs into {} train, {} val, and {} test examples.'.format(len(apple_df),
                                                                              len(train_apple),
                                                                              len(validate_apple),
                                                                              len(test_apple)))

#Copy apple images to their correct directory & sub_directory   
for i, temp in enumerate([train_apple]):
    for row in train_apple.index:
        filename = apple_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'apple' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'train' + '\\' + 'apple' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([validate_apple]):
    for row in validate_apple.index:
        filename = apple_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'apple' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'validate' + '\\' + 'apple' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([test_apple]):
    for row in test_apple.index:
        filename = apple_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'apple' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'test' + '\\' + 'apple' + '\\' + filename)
        shutil.copy(origin, destination)

Moving Apple pictures.
Split 2434 imgs into 1947 train, 243 val, and 244 test examples.


In [9]:
#Split the plum images into our train, validate, and test sets
print('Moving {} pictures.'.format('Plum'))
plum_df = data_manual[data_manual['class'] == 'plum']
train_plum, validate_plum, test_plum = np.split(plum_df.sample(frac=1), [int(.8*len(plum_df)), int(.9*len(plum_df))])
print('Split {} imgs into {} train, {} val, and {} test examples.'.format(len(plum_df),
                                                                              len(train_plum),
                                                                              len(validate_plum),
                                                                              len(test_plum)))

#Copy plum images to their correct directory & sub_directory   
for i, temp in enumerate([train_plum]):
    for row in train_plum.index:
        filename = plum_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'plum' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'train' + '\\' + 'plum' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([validate_plum]):
    for row in validate_plum.index:
        filename = plum_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'plum' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'validate' + '\\' + 'plum' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([test_plum]):
    for row in test_plum.index:
        filename = plum_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'plum' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'test' + '\\' + 'plum' + '\\' + filename)
        shutil.copy(origin, destination)

Moving Plum pictures.
Split 2298 imgs into 1838 train, 230 val, and 230 test examples.


In [10]:
#Split the tomato images into our train, validate, and test sets
print('Moving {} pictures.'.format('Tomato'))
tomato_df = data_manual[data_manual['class'] == 'tomato']
train_tomato, validate_tomato, test_tomato = np.split(tomato_df.sample(frac=1), [int(.8*len(tomato_df)), int(.9*len(tomato_df))])
print('Split {} imgs into {} train, {} val, and {} test examples.'.format(len(plum_df),
                                                                              len(train_tomato),
                                                                              len(validate_tomato),
                                                                              len(test_tomato)))

#Copy tomato images to their correct directory & sub_directory   
for i, temp in enumerate([train_tomato]):
    for row in train_tomato.index:
        filename = tomato_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'tomato' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'train' + '\\' + 'tomato' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([validate_tomato]):
    for row in validate_tomato.index:
        filename = tomato_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'tomato' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'validate' + '\\' + 'tomato' + '\\' + filename)
        shutil.copy(origin, destination)
            
for i, temp in enumerate([test_tomato]):
    for row in test_tomato.index:
        filename = tomato_df['id'][row]
        origin = os.path.join(root_dir + '\\' + 'tomato' + '\\' + filename)
        destination = os.path.join(root_dir + '\\' + 'test' + '\\' + 'tomato' + '\\' + filename)
        shutil.copy(origin, destination)

Moving Tomato pictures.
Split 2298 imgs into 1736 train, 217 val, and 218 test examples.
