<a href="https://colab.research.google.com/github/MichaelTay/w281-summer-2023-project/blob/main/VGG19_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from google.colab import drive

mountdir = '/content/drive/'
drive.mount(mountdir, force_remount=True)

localdir = mountdir + 'MyDrive'
w281_dir = '/Berkeley/w281/Fruit-and-Vegetable-Classification/'
inputdir = localdir + w281_dir
vggdir = inputdir + 'modeling/vgg_data/'

Mounted at /content/drive/


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import os.path
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix,
                             classification_report, auc, roc_curve,
                             RocCurveDisplay, accuracy_score)
from sklearn.model_selection import StratifiedKFold

from hyperopt import tpe, atpe, rand, hp, fmin, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from hyperopt.early_stop import no_progress_loss

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


In [3]:
import os
import cv2

# Create a list with the filepaths for training and testing
train_dir = Path(inputdir, './input/train')
train_filepaths = list(train_dir.glob(r'**/*.jpg'))

test_dir = Path(inputdir, './input/test')
test_filepaths = list(test_dir.glob(r'**/*.jpg'))

val_dir = Path(inputdir, './input/validation')
val_filepaths = list(test_dir.glob(r'**/*.jpg'))

def proc_img(filepath):
    """ Create a DataFrame with the filepath and the labels of the pictures
    """

    labels = [str(filepath[i]).split("/")[-2] \
              for i in range(len(filepath))]

    filepath = pd.Series(filepath, name='Filepath').astype(str)
    labels = pd.Series(labels, name='Label')

    # Concatenate filepaths and labels
    df = pd.concat([filepath, labels], axis=1)

    # Shuffle the DataFrame and reset index
    df = df.sample(frac=1).reset_index(drop = True)

    return df

train_df = proc_img(train_filepaths)
test_df = proc_img(test_filepaths)
val_df = proc_img(val_filepaths)

In [4]:
# Fruits - banana, apple, pear, grapes, orange, kiwi, watermelon, pomegranate, pineapple, mango.
# Vegetables - Bell Pepper, Cauliflower, Chilli Pepper, Peas, Corn, Spinach, Turnip, Garlic, Ginger, Cabbage
Fruits = ['banana', 'apple', 'pear', 'grapes', 'orange', 'kiwi', 'watermelon', 'pomegranate', 'pineapple', 'mango']
Vegetables = ['bell pepper', 'cauliflower', 'chilli pepper', 'peas', 'corn', 'spinach', 'turnip', 'garlic', 'ginger', 'cabbage']

train_df = train_df[train_df['Label'].isin(Fruits + Vegetables)]
test_df = test_df[test_df['Label'].isin(Fruits + Vegetables)]
val_df = val_df[val_df['Label'].isin(Fruits + Vegetables)]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print('-- Training set --\n')
print(f'Number of pictures: {train_df.shape[0]}\n')
print(f'Number of different labels: {len(train_df.Label.unique())}\n')
print(f'Labels: {train_df.Label.unique()}')

-- Training set --

Number of pictures: 1540

Number of different labels: 20

Labels: ['cabbage' 'grapes' 'mango' 'bell pepper' 'chilli pepper' 'orange'
 'cauliflower' 'apple' 'watermelon' 'banana' 'pineapple' 'kiwi' 'ginger'
 'turnip' 'peas' 'pomegranate' 'spinach' 'corn' 'garlic' 'pear']


In [5]:
train = pd.read_csv(vggdir + 'vgg_train.csv').drop('Unnamed: 0', axis=1)
test = pd.read_csv(vggdir + 'vgg_test.csv').drop('Unnamed: 0', axis=1)
validation = pd.read_csv(vggdir + 'vgg_validation.csv').drop('Unnamed: 0', axis=1)

train['label'] = train_df['Label']
train = train.drop('0.1', axis=1)
validation['label'] = val_df['Label']
validation = validation.drop('0.1', axis=1)
test['label'] = test_df['Label']
test = test.drop('0.1', axis=1)

In [9]:
X_train, y_train = train.iloc[:, 0:-1], train.iloc[:, -1]
X_val, y_val = validation.iloc[:, 0:-1], validation.iloc[:, -1]
X_test, y_test = test.iloc[:, 0:-1], test.iloc[:, -1]


## Modeling

In [6]:
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (roc_curve,
                             auc, RocCurveDisplay,
                             classification_report,
                             confusion_matrix,accuracy_score)
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score, f1_score, make_scorer, ConfusionMatrixDisplay
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials


def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

#### Bayesian Parameter Search

In [35]:
space = {   'tol' : hp.uniform('tol', 0.00001, 0.0001),
            # 'kernel' : hp.choice('kernel', ['poly', 'rbf', 'sigmoid']),
            'C' : hp.uniform('C', 0.00001, 1000),
            'max_iter' : hp.choice('max_iter', range(100,1000))
                }

In [38]:
def optimize_lr(params):

    skf = StratifiedKFold(n_splits=10)
    clf = svm.SVC(**params,kernel='poly', degree=3)

    f1_weighted = cross_val_score(clf, X_train, y_train,
                         scoring=make_scorer(accuracy_score),
                        cv=skf).mean()
    best_score = np.mean(f1_weighted)
    loss = 1 - best_score
    return {"loss":loss, "status":STATUS_OK}

In [39]:
RANDOM_SEED = 1234
trials = Trials()

best = fmin(
    fn=optimize_lr,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=np.random.default_rng(RANDOM_SEED)
)

print("Best: {}".format(best))

100%|██████████| 100/100 [28:12<00:00, 16.93s/trial, best loss: 0.9448051948051948]
Best: {'C': 13.669577387382544, 'max_iter': 461, 'tol': 3.673213165801924e-05}


#### Validation set inference

In [41]:
params = {
    'C' : 13.669577387382544,
    'kernel' : 'poly',
    'max_iter' : 461,
    'tol' : 3.673213165801924e-05,
    'class_weight' : 'balanced',
    'probability' : True
}

In [42]:
sv_clf0 = svm.SVC(**params)
sv_clf0.fit(X_train, y_train)

y_pred = sv_clf0.predict(X_val)
y_proba = sv_clf0.predict_proba(X_val)

print(classification_report(y_val, y_pred))
print(accuracy_score(y_val, y_pred))


               precision    recall  f1-score   support

        apple       0.00      0.00      0.00         9
       banana       0.00      0.00      0.00         9
  bell pepper       0.00      0.00      0.00         9
      cabbage       0.00      0.00      0.00        10
  cauliflower       0.10      0.11      0.11         9
chilli pepper       0.00      0.00      0.00         7
         corn       0.00      0.00      0.00        10
       garlic       0.12      0.20      0.15        10
       ginger       0.00      0.00      0.00        10
       grapes       0.10      0.12      0.11         8
         kiwi       0.10      0.10      0.10        10
        mango       0.00      0.00      0.00        10
       orange       0.00      0.00      0.00         7
         pear       0.00      0.00      0.00        10
         peas       0.00      0.00      0.00         9
    pineapple       0.00      0.00      0.00        10
  pomegranate       0.00      0.00      0.00        10
      spi

#### Test set inference

In [43]:

y_pred_test = sv_clf0.predict(X_test)
y_proba = sv_clf0.predict_proba(X_test)

print(classification_report(y_test, y_pred_test))
print(accuracy_score(y_test, y_pred_test))


               precision    recall  f1-score   support

        apple       0.00      0.00      0.00         9
       banana       0.00      0.00      0.00         9
  bell pepper       0.00      0.00      0.00         9
      cabbage       0.00      0.00      0.00        10
  cauliflower       0.00      0.00      0.00         9
chilli pepper       0.18      0.29      0.22         7
         corn       0.00      0.00      0.00        10
       garlic       0.06      0.10      0.07        10
       ginger       0.00      0.00      0.00        10
       grapes       0.00      0.00      0.00         8
         kiwi       0.20      0.20      0.20        10
        mango       0.29      0.20      0.24        10
       orange       0.09      0.14      0.11         7
         pear       0.00      0.00      0.00        10
         peas       0.50      0.11      0.18         9
    pineapple       0.00      0.00      0.00        10
  pomegranate       0.00      0.00      0.00        10
      spi