# Fruit and Vegetable Image Processing


In [2]:
#importing required libraries
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog
from skimage import exposure
import matplotlib.pyplot as plt
# Load the Drive helper and mount
#from google.colab import drive
import xarray as x

# 1. Loading and preprocessing<a class="anchor" id="1"></a>

In [3]:
import os

# Load the Drive helper and mount
from google.colab import drive

# Mounting Google Drive will prompt for authorization.
mountdir = '/content/drive'
drive.mount(mountdir, force_remount=True)

# Project folder inside the mounted Drive; edit w281_directory to match your own layout.
localdir = f'{mountdir}/MyDrive'
w281_directory = '/Berkeley/w281/Fruit-and-Vegetable-Classification/'
inputdir = f'{localdir}{w281_directory}'
# When working from a local folder instead, uncomment and adjust the line below
# (keep the trailing slash, later cells append relative paths to it):
# inputdir = "/path/to/Fruit-and-Vegetable-Classification/"

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
import cv2
#import tensorflow as tf

# Collect the .jpg file paths of every dataset split (searched recursively).
# NOTE(review): only the *.jpg extension is matched; confirm the dataset has
# no images stored under other extensions.
train_dir = Path(inputdir, './input/train')
train_filepaths = list(train_dir.glob(r'**/*.jpg'))

test_dir = Path(inputdir, './input/test')
test_filepaths = list(test_dir.glob(r'**/*.jpg'))

# Fixed: the validation list used to be globbed from test_dir, which made
# val_filepaths a copy of test_filepaths and left val_dir unused.
val_dir = Path(inputdir, './input/validation')
val_filepaths = list(val_dir.glob(r'**/*.jpg'))

def proc_img(filepath, random_state=None):
    """Build a shuffled DataFrame of image paths and their class labels.

    The label of an image is the name of the directory that directly
    contains it (``.../<label>/<file>``).

    Parameters
    ----------
    filepath : sequence of str or pathlib.Path
        Image file paths.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. The default (``None``) keeps the original unseeded
        shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns ``Filepath`` (str) and ``Label``; rows are shuffled and the
        index is reset to 0..n-1.
    """
    paths = [str(p) for p in filepath]

    # Use pathlib rather than splitting on "/" so the parent directory is
    # found regardless of the platform's path separator.
    labels = [Path(p).parent.name for p in paths]

    df = pd.DataFrame({'Filepath': paths, 'Label': labels})

    # Shuffle the rows and discard the pre-shuffle index.
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# One shuffled (Filepath, Label) table per split, built in train/test/validation order.
train_df, test_df, val_df = (
    proc_img(split_paths)
    for split_paths in (train_filepaths, test_filepaths, val_filepaths)
)

# Filter only the selected classes

In [5]:
# The 20 classes kept for this study: ten fruits and ten vegetables.
Fruits = [
    'banana', 'apple', 'pear', 'grapes', 'orange',
    'kiwi', 'watermelon', 'pomegranate', 'pineapple', 'mango',
]
Vegetables = [
    'bell pepper', 'cauliflower', 'chilli pepper', 'peas', 'corn',
    'spinach', 'turnip', 'garlic', 'ginger', 'cabbage',
]

# Keep only the rows whose label is one of the selected classes, split by split.
train_df, test_df, val_df = (
    split[split['Label'].isin(Fruits + Vegetables)]
    for split in (train_df, test_df, val_df)
)

In [6]:
# Report the size of the filtered training split and the distinct labels it contains.
print(
    '-- Training set --\n',
    f'Number of pictures: {train_df.shape[0]}\n',
    f'Number of different labels: {len(train_df.Label.unique())}\n',
    f'Labels: {train_df.Label.unique()}',
    sep='\n',
)

-- Training set --

Number of pictures: 1540

Number of different labels: 20

Labels: ['orange' 'pineapple' 'peas' 'garlic' 'kiwi' 'turnip' 'grapes'
 'bell pepper' 'ginger' 'pomegranate' 'mango' 'cabbage' 'banana' 'spinach'
 'apple' 'corn' 'watermelon' 'pear' 'chilli pepper' 'cauliflower']


# Feature Extraction

### Color and Edge Detection functions

In [67]:
# Seed used as random_state when sampling the training split.
RANDOM_SEED = 1234
# Side length, in pixels, of the square every image is resized to before
# features are extracted.
IMG_DIM = 512
# Side length of the square each feature map is shrunk to before it is
# flattened (PCA_DIM * PCA_DIM values per feature).
# NOTE(review): the name suggests a later PCA step; none is visible here.
PCA_DIM = 64

def find_edges(img):
  """Compute Laplacian and Sobel edge responses of a colour image.

  Parameters
  ----------
  img : array-like
      Colour image with channels last, in RGB order (the order produced by
      skimage's ``imread``/``resize``, which is how this function is called).

  Returns
  -------
  tuple of numpy.ndarray
      ``(laplacian, sobelx, sobely)``, each computed with ``cv2.CV_32F``
      output depth from the blurred grayscale image.
  """
  # The image is RGB, so convert with COLOR_RGB2GRAY. The previous
  # COLOR_BGR2GRAY applied the red weight to blue and vice versa.
  gray = cv2.cvtColor(np.float32(img), cv2.COLOR_RGB2GRAY)

  # Light 3x3 Gaussian blur to suppress noise before differentiating.
  smoothed = cv2.GaussianBlur(gray, (3, 3), 0)

  # Second-derivative (Laplacian) and first-derivative (Sobel) responses.
  laplacian = cv2.Laplacian(smoothed, cv2.CV_32F)
  sobelx = cv2.Sobel(smoothed, cv2.CV_32F, 1, 0, ksize=5)  # d/dx
  sobely = cv2.Sobel(smoothed, cv2.CV_32F, 0, 1, ksize=5)  # d/dy

  return laplacian, sobelx, sobely

def get_color_features(im, size=None):
  """Split an RGB image into its hue, saturation and value planes.

  Parameters
  ----------
  im : array-like
      Colour image with channels last, in RGB order.
  size : tuple of int, optional
      ``(rows, cols)`` the image is resized to before conversion. Defaults
      to ``(IMG_DIM, IMG_DIM)`` so it follows the notebook-wide image size
      instead of a separately hard-coded 512.

  Returns
  -------
  tuple of numpy.ndarray
      ``(hue, saturation, value)`` planes of the HSV image.
      NOTE(review): for float32 input OpenCV documents hue in degrees and
      saturation/value in [0, 1]; this assumes ``im`` is scaled to [0, 1].
  """
  if size is None:
    size = (IMG_DIM, IMG_DIM)

  # cv2.cvtColor does not accept float64, hence the cast to float32.
  rgb = resize(im, size, anti_aliasing=True).astype(np.float32)
  hsv_image = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)

  hue_channel = hsv_image[:, :, 0]
  saturation_channel = hsv_image[:, :, 1]
  value_channel = hsv_image[:, :, 2]

  return hue_channel, saturation_channel, value_channel


def image_processing(path):
    """Read one image and derive seven small feature maps from it.

    The image is resized to IMG_DIM x IMG_DIM, the feature maps are computed
    at that size, and each map is then resized to PCA_DIM x PCA_DIM.

    Returns a list in this order: HOG visualisation, hue, saturation, value,
    Laplacian, Sobel-x, Sobel-y.
    """
    picture = resize(imread(path), (IMG_DIM, IMG_DIM), anti_aliasing=True)

    # Only the rendered HOG image is used; the descriptor vector is discarded.
    _, hog_map = hog(
        picture,
        orientations=8,
        pixels_per_cell=(8, 8),
        cells_per_block=(4, 4),
        visualize=True,
        channel_axis=-1,
    )

    edge_maps = find_edges(picture)            # (laplacian, sobel-x, sobel-y)
    colour_maps = get_color_features(picture)  # (hue, saturation, value)

    full_size_maps = [hog_map, *colour_maps, *edge_maps]

    return [
        resize(feature_map, (PCA_DIM, PCA_DIM), anti_aliasing=True)
        for feature_map in full_size_maps
    ]

def process_img(df):
  """Extract flattened, mean-centred feature vectors for every image in `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide the columns ``Filepath`` and ``Label``.

  Returns
  -------
  list
      ``[hog_arr, hue_arr, sat_arr, lum_arr, lap_arr, sob_x_arr, sob_y_arr,
      final_labels]``. Each array has one row per successfully processed
      image; ``final_labels`` holds the matching labels in the same order.
      When no image could be processed (empty `df` or every image failed),
      each array is empty with shape ``(0, 0)`` and ``final_labels`` is
      ``[]`` rather than ``np.vstack`` raising on an empty list.

  Images whose processing raises are reported on stdout and skipped.
  """
  # One bucket per feature, in the order returned by image_processing.
  buckets = [[] for _ in range(7)]
  final_labels = []

  for i, (filepath, label) in enumerate(zip(df['Filepath'], df['Label'])):
    print(f"Processing image #: {i}")

    try:
      features = image_processing(filepath)
    except Exception as e:
      # Best effort: one unreadable image must not abort the whole run.
      print(f"There was an {e.__class__.__name__} error while trying to process an image... continuing")
      print(f"The error occurred at image #{i}, {filepath}")
      continue

    # Record the label only once the image is known to be usable, so labels
    # stay aligned with the feature rows.
    final_labels.append(label)

    for k, bucket in enumerate(buckets):
      flat = features[k].reshape(-1).astype(float)
      # Centre each feature vector on its own mean.
      bucket.append(flat - np.mean(flat))

  if not final_labels:
    return [np.empty((0, 0)) for _ in buckets] + [final_labels]

  return [np.vstack(bucket) for bucket in buckets] + [final_labels]



In [63]:
RANDOM_SEED = 1234

# 90% of the filtered training rows, drawn reproducibly, are used for fitting.
train_set = train_df.sample(frac=0.9, random_state=RANDOM_SEED)
# The rows that were not drawn form the validation split.
val_set = train_df.loc[~train_df.index.isin(train_set.index)]
# The frame named val_df is used as the test split.
test_set = val_df

In [68]:
# Run feature extraction over the training split; the result is seven feature
# matrices followed by the list of labels.
train_arrays = process_img(train_set)

Processing image #: 0
Processing image #: 1
Processing image #: 2
Processing image #: 3
Processing image #: 4
Processing image #: 5
Processing image #: 6
Processing image #: 7
Processing image #: 8
Processing image #: 9
Processing image #: 10
Processing image #: 11
Processing image #: 12
Processing image #: 13
Processing image #: 14
Processing image #: 15
Processing image #: 16
Processing image #: 17
Processing image #: 18
Processing image #: 19
Processing image #: 20
Processing image #: 21
Processing image #: 22
Processing image #: 23
Processing image #: 24
Processing image #: 25
Processing image #: 26
Processing image #: 27
Processing image #: 28
Processing image #: 29
Processing image #: 30
Processing image #: 31
Processing image #: 32
Processing image #: 33
Processing image #: 34
Processing image #: 35
Processing image #: 36
Processing image #: 37
Processing image #: 38
Processing image #: 39
Processing image #: 40
Processing image #: 41
Processing image #: 42
Processing image #: 4

In [69]:
# Sanity check: expect 8 entries (seven feature matrices plus the label list).
len(train_arrays)

8

In [76]:
# train_arrays layout:
# [hog_arr, hue_arr, sat_arr, lum_arr, lap_arr, sob_x_arr, sob_y_arr, final_labels]

def _labelled_frame(position):
    """Return feature matrix `position` of train_arrays as a DataFrame with a trailing 'label' column."""
    frame = pd.DataFrame(train_arrays[position])
    frame['label'] = train_arrays[-1]
    return frame

# NOTE(review): the last three files have no 'train_' prefix, unlike the first
# four; confirm whether downstream readers depend on these exact names.
hog_df = _labelled_frame(0)
hog_df.to_csv(inputdir+'modeling/train_hog_features.csv')

hue_df = _labelled_frame(1)
hue_df.to_csv(inputdir+'modeling/train_hue_features.csv')

sat_df = _labelled_frame(2)
sat_df.to_csv(inputdir+'modeling/train_saturation_features.csv')

lum_df = _labelled_frame(3)
lum_df.to_csv(inputdir+'modeling/train_luminance_features.csv')

lap_df = _labelled_frame(4)
lap_df.to_csv(inputdir+'modeling/laplacian_features.csv')

sob_x_df = _labelled_frame(5)
sob_x_df.to_csv(inputdir+'modeling/sobel_x_features.csv')

sob_y_df = _labelled_frame(6)
sob_y_df.to_csv(inputdir+'modeling/sobel_y_features.csv')

In [73]:
# Inspect the HOG feature table: mean-centred values per image plus the label column.
hog_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,...,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,-0.003218,garlic
1,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,...,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,-0.004439,pineapple
2,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,...,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,-0.002269,ginger
3,0.005423,0.001080,-0.003086,-0.003751,-0.001248,0.002969,0.004207,0.004905,-0.000917,-0.000811,...,-0.006114,-0.005862,-0.003421,0.000981,-0.004508,-0.002864,-0.000056,-0.001484,-0.000461,cabbage
4,0.007002,0.001025,-0.000970,-0.000683,0.008981,0.007897,0.001667,-0.007992,-0.011937,-0.009203,...,0.000908,-0.006668,-0.007572,-0.006567,0.005215,0.005217,0.007063,0.001813,-0.007700,chilli pepper
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,-0.001904,-0.001899,-0.001850,-0.001833,-0.001782,-0.001892,-0.001904,-0.001904,-0.001904,-0.001903,...,-0.001904,-0.001904,-0.001904,-0.001904,-0.001904,-0.001904,-0.001904,-0.001904,-0.001904,banana
1382,-0.001459,-0.000151,-0.000426,-0.002445,-0.002061,-0.002414,-0.000850,-0.002777,-0.002723,-0.001789,...,0.004487,-0.000744,-0.001825,-0.001682,-0.000443,0.003328,-0.002581,-0.003153,-0.003067,peas
1383,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,...,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,-0.010838,orange
1384,-0.003782,-0.003782,-0.003782,-0.003782,-0.003782,-0.003782,-0.003782,-0.003782,-0.003782,-0.003782,...,-0.000225,0.000644,0.001074,-0.000705,-0.001301,0.000384,0.001437,0.000140,0.001044,ginger
