<a href="https://colab.research.google.com/github/MichaelTay/w281-summer-2023-project/blob/main/image_processing_to_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fruit and Vegetable Image Processing


In [68]:
#importing required libraries
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog, daisy
from skimage.color import rgb2gray
from skimage import exposure
import matplotlib.pyplot as plt
import pickle
import gc

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the Drive helper and mount
#from google.colab import drive
import xarray as x
import sys

# 1. Loading and preprocessing<a class="anchor" id="1"></a><a class="anchor" id="1"></a>

In [2]:
import os

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
mountdir = '/content/drive'
drive.mount(mountdir, force_remount=True)

localdir = mountdir + '/MyDrive'
# Replace your folder here
w281_directory = '/Berkeley/w281/Fruit-and-Vegetable-Classification/'
inputdir = localdir + w281_directory
# Uncomment below if using local folder
# inputdir = "/Users/mcliston/Library/CloudStorage/GoogleDrive-michael.c.liston@gmail.com/My Drive/Berkeley/w281/Fruit-and-Vegetable-Classification/"

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
import cv2
#import tensorflow as tf

# Create a list with the filepaths for training and testing
train_dir = Path(inputdir, './input/train')
train_filepaths = list(train_dir.glob(r'**/*.jpg'))

test_dir = Path(inputdir, './input/test')
test_filepaths = list(test_dir.glob(r'**/*.jpg'))

val_dir = Path(inputdir, './input/validation')
val_filepaths = list(test_dir.glob(r'**/*.jpg'))

def proc_img(filepath):
    """ Create a DataFrame with the filepath and the labels of the pictures
    """

    labels = [str(filepath[i]).split("/")[-2] \
              for i in range(len(filepath))]

    filepath = pd.Series(filepath, name='Filepath').astype(str)
    labels = pd.Series(labels, name='Label')

    # Concatenate filepaths and labels
    df = pd.concat([filepath, labels], axis=1)

    # Shuffle the DataFrame and reset index
    df = df.sample(frac=1).reset_index(drop = True)

    return df

train_df = proc_img(train_filepaths)
test_df = proc_img(test_filepaths)
val_df = proc_img(val_filepaths)

# Filter only selected class

In [4]:
# Fruits - banana, apple, pear, grapes, orange, kiwi, watermelon, pomegranate, pineapple, mango.
# Vegetables - Bell Pepper, Cauliflower, Chilli Pepper, Peas, Corn, Spinach, Turnip, Garlic, Ginger, Cabbage
Fruits = ['banana', 'apple', 'pear', 'grapes', 'orange', 'kiwi', 'watermelon', 'pomegranate', 'pineapple', 'mango']
Vegetables = ['bell pepper', 'cauliflower', 'chilli pepper', 'peas', 'corn', 'spinach', 'turnip', 'garlic', 'ginger', 'cabbage']

train_df = train_df[train_df['Label'].isin(Fruits + Vegetables)]
test_df = test_df[test_df['Label'].isin(Fruits + Vegetables)]
val_df = val_df[val_df['Label'].isin(Fruits + Vegetables)]

In [5]:
print('-- Training set --\n')
print(f'Number of pictures: {train_df.shape[0]}\n')
print(f'Number of different labels: {len(train_df.Label.unique())}\n')
print(f'Labels: {train_df.Label.unique()}')

-- Training set --

Number of pictures: 1540

Number of different labels: 20

Labels: ['pomegranate' 'watermelon' 'mango' 'chilli pepper' 'bell pepper' 'pear'
 'garlic' 'cabbage' 'apple' 'grapes' 'kiwi' 'pineapple' 'turnip' 'spinach'
 'banana' 'cauliflower' 'ginger' 'orange' 'corn' 'peas']


# Feature Extraction

### Color and Edge Detection functions

In [6]:
RANDOM_SEED = 1234
IMG_DIM = 512
PCA_DIM = 64

def find_edges(img):

  # converting to gray scale
  gray = cv2.cvtColor(np.float32(img), cv2.COLOR_BGR2GRAY)

  # remove noise
  img = cv2.GaussianBlur(gray,(3,3),0)

  # convolve with proper kernels
  laplacian = cv2.Laplacian(img,cv2.CV_32F)
  sobelx = cv2.Sobel(img,cv2.CV_32F,1,0,ksize=5)  # x
  sobely = cv2.Sobel(img,cv2.CV_32F,0,1,ksize=5)  # y

  return laplacian, sobelx, sobely

def get_color_features(im):

  img_512 = resize(im, (512, 512), anti_aliasing=True).astype(np.float32)
  hsv_image = cv2.cvtColor(img_512, cv2.COLOR_RGB2HSV)

  hue_channel = hsv_image[:, :, 0]
  saturation_channel = hsv_image[:, :, 1]
  value_channel = hsv_image[:, :, 2]

  return hue_channel, saturation_channel, value_channel


def image_processing(path):

    img = imread(path)

    resized_img = resize(img, (IMG_DIM,IMG_DIM), anti_aliasing=True)

    #creating hog features

    fd, hog_image = hog(resized_img, orientations=8, pixels_per_cell=(8,8),
                        cells_per_block=(4,4), visualize=True, channel_axis=-1)


    # creating edge features
    laplacian, sobelx, sobely = find_edges(resized_img)

    # creating color features
    hue_channel, saturation_channel, luminance_channel = get_color_features(resized_img)

    gray_img = rgb2gray(resized_img)
    descs = daisy(gray_img, step=150, radius=40, rings=2, histograms=6, orientations=8, visualize=False)

    feature_lst = [hog_image, hue_channel, saturation_channel, luminance_channel, laplacian, sobelx, sobely, descs]

    resized_features = [resize(z, (PCA_DIM, PCA_DIM), anti_aliasing=True) for z in feature_lst]

    return resized_features

def process_img(df):

  hog_lst, hue_lst, sat_lst, lum_lst, lap_lst, sob_x_lst, sob_y_lst, daisy_lst = [],[],[],[],[],[],[],[]
  final_labels = []

  for i,j in enumerate(df.iterrows()):
    print(f"Processing image #: {i}")
    filepath,label = j[1]['Filepath'], j[1]['Label']

    try:

      features  = image_processing(filepath)
      final_labels.append(label)

    except Exception as e:

      print(f"There was an {e.__class__.__name__} error while trying to process an image... continuing")
      print(f"The error occurred at image #{i}, {filepath}")
      continue

    for k,l in enumerate([hog_lst,hue_lst,sat_lst,lum_lst,lap_lst,sob_x_lst,sob_y_lst,daisy_lst]):
      feature = features[k].reshape(-1).astype(float)
      feature = feature - np.mean(feature)
      l.append(feature)


  hog_arr = np.vstack(hog_lst)
  hue_arr = np.vstack(hue_lst)
  sat_arr = np.vstack(sat_lst)
  lum_arr = np.vstack(lum_lst)
  lap_arr = np.vstack(lap_lst)
  sob_x_arr = np.vstack(sob_x_lst)
  sob_y_arr = np.vstack(sob_y_lst)
  daisy_arr = np.vstack(daisy_lst)

  return [hog_arr, hue_arr, sat_arr, lum_arr, lap_arr, sob_x_arr, sob_y_arr, daisy_arr, final_labels]



In [7]:
RANDOM_SEED = 1234

# train_set = train_df.sample(frac=0.9, random_state=RANDOM_SEED)
# val_set = train_df[~(train_df.index.isin(train_set.index))]
# test_set = val_df

train_set = train_df
val_set = val_df
test_set = test_df

#### Making Training set

In [None]:
train_arrays = process_img(train_set)

In [9]:
#primary features: Daisy, Luminance, Saturation, Hue, Maybe (HOG)

# get training data
full_dataset = np.concatenate((train_arrays[7],
                               train_arrays[3],
                               train_arrays[2],
                               train_arrays[1]), axis=1)


In [37]:
## scale training set
scaler = StandardScaler()
scaled_full_dataset = scaler.fit_transform(full_dataset)

# save scaler
pickle_dir = ''
pickle.dump(scaler, open(pickle_dir+'standardscaler.pkl', 'wb'))

In [42]:
# pca training set
RANDOM_SEED = 1234
pca = PCA(n_components=50, random_state=RANDOM_SEED)
X = pca.fit_transform(scaled_full_dataset)

## save pca object
pickle_dir = ''
pickle.dump(pca, open(pickle_dir+'pca.pkl', 'wb'))

In [45]:
## combine training data with labels
X = np.concatenate((X,np.array((train_arrays[-1])).reshape(-1,1)), axis=1)
X_df = pd.DataFrame(X)

# Daisy, Luminance, Saturation, Hue
pca_dataset_dir = ''
X_df.to_csv(pca_dataset_dir+'training_daisy_lum_sat_hue.csv')

#### Making Validation set

In [None]:
validation_arrays = process_img(val_set)

In [83]:

#primary features: Daisy, Luminance, Saturation, Hue, Maybe (HOG)
## get validation dataset
full_validation_dataset = np.concatenate((validation_arrays[7],
                               validation_arrays[3],
                               validation_arrays[2],
                               validation_arrays[1]), axis=1)

In [84]:
## scale, apply pca and export
scaled_validation_dataset = scaler.transform(full_validation_dataset)
X_val = pca.transform(scaled_validation_dataset)
X_val = np.concatenate((X_val,np.array((validation_arrays[-1])).reshape(-1,1)), axis=1)
X_val_df = pd.DataFrame(X_val)
X_val_df.to_csv(pca_dataset_dir+'validation_daisy_lum_sat_hue.csv')

#### Making Testing set

In [None]:
test_arrays = process_img(test_set)

In [87]:
# hog_arr, hue_arr, sat_arr, lum_arr, lap_arr, sob_x_arr, sob_y_arr, final_labels

## get test dataset
full_test_dataset = np.concatenate((test_arrays[7],
                               test_arrays[3],
                               test_arrays[2],
                               test_arrays[1]), axis=1)

In [96]:
## scale, apply pca and export
scaled_test_dataset = scaler.transform(full_test_dataset)
X_test = pca.transform(scaled_test_dataset)
X_test = np.concatenate((X_test,np.array((test_arrays[-1])).reshape(-1,1)), axis=1)
X_test_df = pd.DataFrame(X_test)
X_test_df.to_csv(pca_dataset_dir+'test_daisy_lum_sat_hue.csv')

In [98]:
print(X_df.shape)
print(X_val_df.shape)
print(X_test_df.shape)

(1535, 51)
(187, 51)
(187, 51)
