<a href="https://colab.research.google.com/github/MichaelTay/w281-summer-2023-project/blob/main/image_processing_to_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fruit and Vegetable Image Processing


In [1]:
#importing required libraries
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog, daisy
from skimage.color import rgb2gray
from skimage import exposure
import matplotlib.pyplot as plt
import pickle
import gc

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the Drive helper and mount
#from google.colab import drive
import xarray as x
import sys

# 1. Loading and preprocessing<a class="anchor" id="1"></a>

In [11]:
import os

# Load the Drive helper and mount
from google.colab import drive

# Mount Google Drive inside the Colab runtime. This will prompt for authorization.
mountdir = '/content/drive'
drive.mount(mountdir, force_remount=True)

# Build the project paths by plain string concatenation. Each directory string
# ends with '/', because later cells append file names to them with `+`.
localdir = mountdir + '/MyDrive'
# Replace your folder here
w281_directory = '/Berkeley/w281/Fruit-and-Vegetable-Classification/'
inputdir = localdir + w281_directory
# Output folders for the PCA-reduced datasets (CSV) and the pickled scaler/PCA objects.
pca_dataset_dir = inputdir + 'modeling/pca_datasets/'
pca_pickle_dir = inputdir + 'modeling/pca_pickle/'
# Uncomment below if using local folder
# inputdir = "/Users/mcliston/Library/CloudStorage/GoogleDrive-michael.c.liston@gmail.com/My Drive/Berkeley/w281/Fruit-and-Vegetable-Classification/"

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
import cv2
#import tensorflow as tf

# Create a list with the filepaths for the training, testing and validation
# splits. Each split is searched recursively for .jpg files.
train_dir = Path(inputdir, './input/train')
train_filepaths = list(train_dir.glob(r'**/*.jpg'))

test_dir = Path(inputdir, './input/test')
test_filepaths = list(test_dir.glob(r'**/*.jpg'))

# Fix: the validation paths used to be globbed from `test_dir`, which made the
# validation split an exact copy of the test split.
val_dir = Path(inputdir, './input/validation')
val_filepaths = list(val_dir.glob(r'**/*.jpg'))

def proc_img(filepath, random_state=None):
    """Create a shuffled DataFrame with the filepaths and labels of the pictures.

    The label of a picture is the name of the directory that directly
    contains it (e.g. ``.../train/apple/img1.jpg`` -> ``apple``).

    Parameters
    ----------
    filepath : iterable of str or pathlib.Path
        Paths of the image files.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. The default ``None`` keeps the original unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns ``Filepath`` (str) and ``Label`` (str), rows in shuffled
        order with a fresh 0..n-1 index.
    """
    paths = list(filepath)

    # pathlib extracts the parent folder name without assuming "/" as the
    # separator, so this also works with Windows-style paths.
    labels = [Path(p).parent.name for p in paths]

    df = pd.DataFrame({
        'Filepath': [str(p) for p in paths],
        'Label': labels,
    })

    # Shuffle the DataFrame and reset index
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build one shuffled (Filepath, Label) DataFrame per split, in the order
# train -> test -> validation.
train_df, test_df, val_df = map(
    proc_img, (train_filepaths, test_filepaths, val_filepaths)
)

In [26]:
import gc

# Filter to the selected classes only

In [4]:
# Keep only the 10 fruit and 10 vegetable classes listed below; rows with any
# other label are dropped from all three splits.
Fruits = [
    'banana', 'apple', 'pear', 'grapes', 'orange',
    'kiwi', 'watermelon', 'pomegranate', 'pineapple', 'mango',
]
Vegetables = [
    'bell pepper', 'cauliflower', 'chilli pepper', 'peas', 'corn',
    'spinach', 'turnip', 'garlic', 'ginger', 'cabbage',
]

train_df = train_df.loc[train_df['Label'].isin(Fruits + Vegetables)]
test_df = test_df.loc[test_df['Label'].isin(Fruits + Vegetables)]
val_df = val_df.loc[val_df['Label'].isin(Fruits + Vegetables)]

In [5]:
# Report the size of the filtered training split and its distinct labels.
print('-- Training set --\n')
print(f'Number of pictures: {len(train_df)}\n')
print(f'Number of different labels: {len(train_df["Label"].unique())}\n')
print(f'Labels: {train_df["Label"].unique()}')

-- Training set --

Number of pictures: 1540

Number of different labels: 20

Labels: ['pear' 'peas' 'ginger' 'banana' 'corn' 'turnip' 'cabbage' 'pineapple'
 'bell pepper' 'cauliflower' 'orange' 'grapes' 'mango' 'chilli pepper'
 'garlic' 'spinach' 'pomegranate' 'watermelon' 'kiwi' 'apple']


# Feature Extraction

### Color and Edge Detection functions

In [6]:
# Seed passed as `random_state` to the PCA models fitted later in the notebook.
RANDOM_SEED = 1234
# Side length (in pixels) every image is resized to before feature extraction.
IMG_DIM = 512
# Side length every feature map is resized to before it is flattened for PCA.
PCA_DIM = 64

def find_edges(img):
  """Compute Laplacian and Sobel edge maps of a colour image.

  Parameters
  ----------
  img : array-like
      Colour image in RGB channel order (the order produced by
      ``skimage.io.imread``, which is how this notebook loads images).

  Returns
  -------
  tuple
      ``(laplacian, sobelx, sobely)``, each computed with ``cv2.CV_32F``
      output depth from the blurred grayscale image.
  """

  # converting to gray scale. The input is RGB, so the RGB conversion code is
  # required; COLOR_BGR2GRAY would swap the red and blue channel weights.
  gray = cv2.cvtColor(np.float32(img), cv2.COLOR_RGB2GRAY)

  # remove noise with a 3x3 Gaussian blur before differentiating
  blurred = cv2.GaussianBlur(gray,(3,3),0)

  # convolve with proper kernels
  laplacian = cv2.Laplacian(blurred,cv2.CV_32F)
  sobelx = cv2.Sobel(blurred,cv2.CV_32F,1,0,ksize=5)  # x
  sobely = cv2.Sobel(blurred,cv2.CV_32F,0,1,ksize=5)  # y

  return laplacian, sobelx, sobely

def get_color_features(im):
  """Return the hue, saturation and value channels of an image.

  The image is resized to 512x512 (anti-aliased), cast to float32 and
  converted with OpenCV's RGB -> HSV conversion. The three channels of the
  result are returned as a tuple ``(hue, saturation, value)``.
  """
  resized = resize(im, (512, 512), anti_aliasing=True)
  hsv = cv2.cvtColor(resized.astype(np.float32), cv2.COLOR_RGB2HSV)

  # Split the last axis into its three planes, in H, S, V order.
  hue, saturation, value = (hsv[:, :, idx] for idx in range(3))
  return hue, saturation, value


def image_processing(path):
    """Load one image and compute its eight feature maps.

    The image is read from ``path`` and resized to ``IMG_DIM`` x ``IMG_DIM``.
    The returned list holds, in this order: HOG visualisation image, hue,
    saturation, value, Laplacian, Sobel-x, Sobel-y and DAISY descriptors.
    Each entry has been passed through ``resize(..., (PCA_DIM, PCA_DIM))``.
    """
    resized_img = resize(imread(path), (IMG_DIM, IMG_DIM), anti_aliasing=True)

    # HOG: only the visualisation image is kept, the descriptor vector is unused.
    _, hog_image = hog(
        resized_img,
        orientations=8,
        pixels_per_cell=(8, 8),
        cells_per_block=(4, 4),
        visualize=True,
        channel_axis=-1,
    )

    # Edge maps come back as (laplacian, sobelx, sobely).
    edge_maps = find_edges(resized_img)

    # Colour channels come back as (hue, saturation, value).
    hsv_channels = get_color_features(resized_img)

    # DAISY descriptors are computed on the grayscale version of the image.
    daisy_descs = daisy(
        rgb2gray(resized_img),
        step=150,
        radius=40,
        rings=2,
        histograms=6,
        orientations=8,
        visualize=False,
    )

    feature_maps = [hog_image, *hsv_channels, *edge_maps, daisy_descs]

    resized_features = []
    for feature_map in feature_maps:
        resized_features.append(
            resize(feature_map, (PCA_DIM, PCA_DIM), anti_aliasing=True)
        )
    return resized_features

def process_img(df):
  """Extract the feature matrices for every image listed in ``df``.

  ``df`` must provide the columns ``Filepath`` and ``Label``. Each image is
  run through ``image_processing``. Every returned feature map is flattened,
  cast to float and mean-centred (its own mean is subtracted). Images that
  raise during processing are reported and skipped, so they contribute neither
  a feature row nor a label.

  Returns a list of nine items: eight 2-D arrays with one row per processed
  image (HOG, hue, saturation, value, Laplacian, Sobel-x, Sobel-y, DAISY),
  followed by the list of labels in the same row order.
  """
  n_feature_types = 8
  feature_rows = [[] for _ in range(n_feature_types)]
  final_labels = []

  for i, (_, row) in enumerate(df.iterrows()):
    print(f"Processing image #: {i}")
    filepath = row['Filepath']
    label = row['Label']

    try:
      features = image_processing(filepath)
      final_labels.append(label)
    except Exception as e:
      print(f"There was an {e.__class__.__name__} error while trying to process an image... continuing")
      print(f"The error occurred at image #{i}, {filepath}")
      continue

    for k, rows in enumerate(feature_rows):
      flat = features[k].reshape(-1).astype(float)
      rows.append(flat - np.mean(flat))

  # Stack the per-image vectors of each feature type into one matrix.
  stacked = [np.vstack(rows) for rows in feature_rows]
  return stacked + [final_labels]



In [7]:
RANDOM_SEED = 1234

# Disabled alternative: carve a validation set out of the training data.
# train_set = train_df.sample(frac=0.9, random_state=RANDOM_SEED)
# val_set = train_df[~(train_df.index.isin(train_set.index))]
# test_set = val_df

# Use the three pre-made splits as they are.
train_set, val_set, test_set = train_df, val_df, test_df

#### Making Training set

In [None]:
# Extract all feature matrices for the training split. The result is a list of
# eight feature arrays followed by the list of labels (see process_img).
train_arrays = process_img(train_set)

In [12]:

# Fit one StandardScaler per feature type on the TRAINING features and persist
# each fitted scaler. The files are written inside `with` blocks so every file
# handle is closed as soon as the pickle has been written.

## scale daisy feature
daisy_scaler = StandardScaler()
scaled_daisy = daisy_scaler.fit_transform(train_arrays[7])

# save daisy scaler
with open(pca_pickle_dir+'daisy_standardscaler.pkl', 'wb') as f:
    pickle.dump(daisy_scaler, f)

## scale luminance feature
lum_scaler = StandardScaler()
scaled_lum = lum_scaler.fit_transform(train_arrays[3])

# save scaler
with open(pca_pickle_dir+'lum_standardscaler.pkl', 'wb') as f:
    pickle.dump(lum_scaler, f)

## scale saturation feature
sat_scaler = StandardScaler()
scaled_sat = sat_scaler.fit_transform(train_arrays[2])

# save scaler
with open(pca_pickle_dir+'sat_standardscaler.pkl', 'wb') as f:
    pickle.dump(sat_scaler, f)

## scale hue feature
hue_scaler = StandardScaler()
scaled_hue = hue_scaler.fit_transform(train_arrays[1])

# save scaler
with open(pca_pickle_dir+'hue_standardscaler.pkl', 'wb') as f:
    pickle.dump(hue_scaler, f)

In [13]:
RANDOM_SEED = 1234

# Fit one 50-component PCA per feature type on the scaled TRAINING features and
# persist each fitted model. The files are written inside `with` blocks so
# every file handle is closed as soon as the pickle has been written.

# pca daisy feature
daisy_pca = PCA(n_components=50, random_state=RANDOM_SEED)
X_daisy = daisy_pca.fit_transform(scaled_daisy)

## save pca object
with open(pca_pickle_dir+'daisy_pca.pkl', 'wb') as f:
    pickle.dump(daisy_pca, f)

# pca luminance feature
lum_pca = PCA(n_components=50, random_state=RANDOM_SEED)
X_lum = lum_pca.fit_transform(scaled_lum)

## save pca object
with open(pca_pickle_dir+'lum_pca.pkl', 'wb') as f:
    pickle.dump(lum_pca, f)

# pca saturation feature
sat_pca = PCA(n_components=50, random_state=RANDOM_SEED)
X_sat = sat_pca.fit_transform(scaled_sat)

## save pca object
with open(pca_pickle_dir+'sat_pca.pkl', 'wb') as f:
    pickle.dump(sat_pca, f)

# pca hue feature
hue_pca = PCA(n_components=50, random_state=RANDOM_SEED)
X_hue = hue_pca.fit_transform(scaled_hue)

## save pca object
with open(pca_pickle_dir+'hue_pca.pkl', 'wb') as f:
    pickle.dump(hue_pca, f)

In [14]:
# Primary features and their index in train_arrays:
# Daisy (7), Luminance (3), Saturation (2), Hue (1), maybe HOG (0).

# Training matrix: the four PCA-reduced feature blocks placed side by side.
X = np.hstack((X_daisy, X_lum, X_sat, X_hue))


In [22]:
# List the files in the PCA dataset folder on Drive. The path is relative, so
# it resolves against the notebook's current working directory.
!ls drive/MyDrive/Berkeley/w281/Fruit-and-Vegetable-Classification/modeling/pca_datasets/

test_daisy_lum_sat_hue.csv	validation_daisy_lum_sat_hue.csv
training_daisy_lum_sat_hue.csv


In [24]:
## combine training data with labels
# The labelled matrix gets its own name instead of overwriting `X`. Rebinding
# `X` made this cell non-idempotent: every re-run appended one more label column.
X_ = np.concatenate((X, np.array(train_arrays[-1]).reshape(-1, 1)), axis=1)
X_df = pd.DataFrame(X_)

# Daisy, Luminance, Saturation, Hue

X_df.to_csv(pca_dataset_dir+'training_daisy_lum_sat_hue.csv')

In [63]:
# Sanity check: one row per successfully processed training image and
# 201 columns (4 feature types x 50 PCA components + 1 label column).
X_df.shape

(1535, 201)

#### Making Validation set

In [30]:
# Extract all feature matrices for the validation split. The result is a list
# of eight feature arrays followed by the list of labels (see process_img).
validation_arrays = process_img(val_set)

Processing image #: 0
Processing image #: 1
Processing image #: 2
Processing image #: 3
Processing image #: 4
Processing image #: 5
Processing image #: 6
Processing image #: 7
Processing image #: 8
Processing image #: 9
Processing image #: 10
Processing image #: 11
Processing image #: 12
Processing image #: 13
Processing image #: 14
Processing image #: 15
Processing image #: 16
Processing image #: 17
Processing image #: 18
Processing image #: 19
Processing image #: 20
Processing image #: 21
Processing image #: 22
Processing image #: 23
Processing image #: 24
Processing image #: 25
Processing image #: 26
Processing image #: 27
Processing image #: 28
Processing image #: 29
Processing image #: 30
Processing image #: 31
Processing image #: 32
Processing image #: 33
Processing image #: 34
Processing image #: 35
Processing image #: 36
Processing image #: 37
Processing image #: 38
Processing image #: 39
Processing image #: 40
Processing image #: 41
Processing image #: 42
Processing image #: 4



Processing image #: 50
Processing image #: 51
Processing image #: 52
Processing image #: 53
Processing image #: 54
Processing image #: 55
Processing image #: 56
Processing image #: 57
Processing image #: 58
Processing image #: 59
Processing image #: 60
Processing image #: 61
Processing image #: 62
Processing image #: 63
Processing image #: 64
Processing image #: 65
Processing image #: 66
Processing image #: 67
Processing image #: 68
Processing image #: 69
Processing image #: 70
Processing image #: 71
Processing image #: 72
Processing image #: 73
Processing image #: 74
Processing image #: 75
Processing image #: 76
Processing image #: 77
Processing image #: 78
Processing image #: 79
Processing image #: 80
Processing image #: 81
Processing image #: 82
Processing image #: 83
Processing image #: 84
Processing image #: 85
Processing image #: 86
Processing image #: 87
Processing image #: 88
Processing image #: 89
Processing image #: 90
Processing image #: 91
Processing image #: 92
Processing 

In [31]:

# Scale the validation features with the scalers that were fitted on the
# TRAINING data. `transform` (not `fit_transform`) is required here: re-fitting
# would standardise validation data with its own statistics and overwrite the
# fitted training scalers.

## scale daisy feature
scaled_val_daisy = daisy_scaler.transform(validation_arrays[7])

## scale luminance feature
scaled_val_lum = lum_scaler.transform(validation_arrays[3])

## scale saturation feature
scaled_val_sat = sat_scaler.transform(validation_arrays[2])

## scale hue feature
scaled_val_hue = hue_scaler.transform(validation_arrays[1])

In [32]:
# Project the validation features onto the principal components learned from
# the TRAINING data. `transform` (not `fit_transform`) is required here:
# re-fitting would produce different components, so validation columns would
# no longer mean the same thing as the training columns. Nothing is fitted in
# this cell, so no random seed is needed.

# pca daisy feature
X_val_daisy = daisy_pca.transform(scaled_val_daisy)

# pca luminance feature
X_val_lum = lum_pca.transform(scaled_val_lum)

# pca saturation feature
X_val_sat = sat_pca.transform(scaled_val_sat)

# pca hue feature
X_val_hue = hue_pca.transform(scaled_val_hue)


In [33]:

# Primary features: Daisy, Luminance, Saturation, Hue, maybe HOG.
# Validation matrix: the four PCA-reduced feature blocks placed side by side.
X_val = np.hstack((X_val_daisy, X_val_lum, X_val_sat, X_val_hue))

In [34]:
## combine validation data with labels
# The label list is appended as one extra column to the right of the features.
X_val_ = np.column_stack((X_val, validation_arrays[-1]))
X_val_df = pd.DataFrame(X_val_)

# Feature blocks in column order: Daisy, Luminance, Saturation, Hue
X_val_df.to_csv(pca_dataset_dir+'validation_daisy_lum_sat_hue.csv')

In [61]:
# Sanity check: one row per successfully processed validation image and
# 201 columns (4 feature types x 50 PCA components + 1 label column).
X_val_df.shape

(187, 201)

In [51]:
# List the files in the PCA dataset folder on Drive. The path is relative, so
# it resolves against the notebook's current working directory.
!ls drive/MyDrive/Berkeley/w281/Fruit-and-Vegetable-Classification/modeling/pca_datasets/

training_daisy_lum_sat_hue.csv	validation_daisy_lum_sat_hue.csv


#### Making Testing set

In [52]:
# Extract all feature matrices for the test split. The result is a list of
# eight feature arrays followed by the list of labels (see process_img).
test_arrays = process_img(test_set)

Processing image #: 0
Processing image #: 1
Processing image #: 2
Processing image #: 3
Processing image #: 4
Processing image #: 5
Processing image #: 6
Processing image #: 7
Processing image #: 8
Processing image #: 9
Processing image #: 10
Processing image #: 11
Processing image #: 12
Processing image #: 13
Processing image #: 14
Processing image #: 15
Processing image #: 16
Processing image #: 17
Processing image #: 18
Processing image #: 19
Processing image #: 20
Processing image #: 21
Processing image #: 22
Processing image #: 23
Processing image #: 24
Processing image #: 25
Processing image #: 26
Processing image #: 27
Processing image #: 28
Processing image #: 29
Processing image #: 30
Processing image #: 31
Processing image #: 32
Processing image #: 33
Processing image #: 34
Processing image #: 35
Processing image #: 36
Processing image #: 37
Processing image #: 38
Processing image #: 39
Processing image #: 40
Processing image #: 41
Processing image #: 42
Processing image #: 4



Processing image #: 159
Processing image #: 160
Processing image #: 161
Processing image #: 162
Processing image #: 163
Processing image #: 164
Processing image #: 165
Processing image #: 166
Processing image #: 167
Processing image #: 168
Processing image #: 169
Processing image #: 170
Processing image #: 171
Processing image #: 172
Processing image #: 173
Processing image #: 174
Processing image #: 175
Processing image #: 176
Processing image #: 177
Processing image #: 178
Processing image #: 179
Processing image #: 180
Processing image #: 181
Processing image #: 182
Processing image #: 183
Processing image #: 184
Processing image #: 185
Processing image #: 186


In [53]:

# Scale the test features with the already-fitted scalers. `transform` (not
# `fit_transform`) is required here: re-fitting would standardise the test
# data with its own statistics and overwrite the fitted scalers.

## scale daisy feature
scaled_test_daisy = daisy_scaler.transform(test_arrays[7])

## scale luminance feature
scaled_test_lum = lum_scaler.transform(test_arrays[3])

## scale saturation feature
scaled_test_sat = sat_scaler.transform(test_arrays[2])

## scale hue feature
scaled_test_hue = hue_scaler.transform(test_arrays[1])

In [54]:
# Project the test features with the already-fitted PCA models. `transform`
# (not `fit_transform`) is required here: re-fitting would produce different
# components, so test columns would no longer mean the same thing as the
# columns the model was trained on. Nothing is fitted in this cell, so no
# random seed is needed.

# pca daisy feature
X_test_daisy = daisy_pca.transform(scaled_test_daisy)

# pca luminance feature
X_test_lum = lum_pca.transform(scaled_test_lum)

# pca saturation feature
X_test_sat = sat_pca.transform(scaled_test_sat)

# pca hue feature
X_test_hue = hue_pca.transform(scaled_test_hue)


In [55]:
# Test matrix: the four PCA-reduced feature blocks (Daisy, Luminance,
# Saturation, Hue) placed side by side.
X_test = np.hstack((X_test_daisy, X_test_lum, X_test_sat, X_test_hue))

In [60]:
## combine test data with labels
# The label list is appended as one extra column to the right of the features.
X_test_ = np.column_stack((X_test, test_arrays[-1]))
X_test_df = pd.DataFrame(X_test_)

# Feature blocks in column order: Daisy, Luminance, Saturation, Hue
X_test_df.to_csv(pca_dataset_dir+'test_daisy_lum_sat_hue.csv')