In [1]:
import numpy as np
import pandas as pd
import cv2
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Data Preparation

In [2]:
# Create dictionary for category classes
output_dict = {0 : 'aloevera',
                1 : 'banana',
                2 : 'bilimbi',
                3 : 'cantaloupe',
                4 : 'cassava',
                5 : 'coconut',
                6 : 'corn',
                7 : 'cucumber',
                8 : 'curcuma',
                9 : 'eggplant',
                10 : 'galangal',
                11 : 'ginger',
                12 : 'guava',
                13 : 'kale',
                14 : 'longbeans',
                15 : 'mango',
                16 : 'melon',
                17 : 'orange',
                18 : 'paddy',
                19 : 'papaya',
                20 : 'peperchili',
                21 : 'pineapple',
                22 : 'pomelo',
                23 : 'shallot',
                24 : 'soybeans',
                25 : 'spinach',
                26 : 'sweetpotatoes',
                27 : 'tobacco',
                28 : 'waterapple',
                29 : 'watermelon'}

In [3]:
def map_values(category_num):
    return output_dict.get(category_num)

In [4]:
# Load and preprocess images from file paths
def load_images(df):
    images = []
    for file_path in df['image:FILE']:
        # Read the image from file
        image = cv2.imread(file_path)

        # Resize the image
        image = cv2.resize(image, (32, 32))  # Adjust size as needed

        # Flatten the image to a 1D array
        flattened_image = image.flatten()

        # Append the flattened image to the list
        images.append(flattened_image)
    return np.array(images)

In [5]:
def prepare_data(file_path):
    # Read the csv file
    df = pd.read_csv(file_path)

    # Trim dataset for faster testing for now
    #df = df.sample(n=1000, random_state=42)

    # Add new column for name of class category
    df['label'] = df['category'].apply(map_values)

    # Append data/ in front of every image file path
    df['image:FILE'] = 'data/' + df['image:FILE']
    
    # Load images
    X = load_images(df)

    # Convert images to numpy arrays
    X = np.array(X)
    y = np.array(df['label'])

    return X, y

In [6]:
X_train, y_train = prepare_data('data/test.csv')
X_val, y_val = prepare_data('data/val.csv')
X_test, y_test = prepare_data('data/test.csv')

In [7]:
# Combine the train and val sets and use cross validation instead so we can have a larger training set
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

# Train Model

In [8]:
def finetune_parameters():
    model = SVC()

    param_grid = {'C': [1, 10, 100],
                  'kernel': ['linear', 'poly', 'rbf']}

    randomized_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=6, cv=10, random_state=42)
    randomized_search.fit(X_train, y_train)

    best_params = randomized_search.best_params_
    print("Best Parameters:", best_params)

    return best_params

In [9]:
def SVM_train():
    #best_params = finetune_parameters()

    best_model = SVC(kernel='rbf', C=10)
    best_model.fit(X_train, y_train)

    val_accuracy = cross_val_score(best_model, X_train, y_train, cv=10, scoring="accuracy")
    val_accuracy = np.mean(val_accuracy)
    print("Validation Accuracy:", val_accuracy)

    test_predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print("Testing Accuracy:", test_accuracy)

In [11]:
SVM_train()

Validation Accuracy: 0.6531111111111112
Testing Accuracy: 0.9673333333333334
