In [31]:
import numpy as np
import pandas as pd
import cv2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler 


In [32]:
# Create dictionary for category classes
output_dict = {0 : 'aloevera',
                1 : 'banana',
                2 : 'bilimbi',
                3 : 'cantaloupe',
                4 : 'cassava',
                5 : 'coconut',
                6 : 'corn',
                7 : 'cucumber',
                8 : 'curcuma',
                9 : 'eggplant',
                10 : 'galangal',
                11 : 'ginger',
                12 : 'guava',
                13 : 'kale',
                14 : 'longbeans',
                15 : 'mango',
                16 : 'melon',
                17 : 'orange',
                18 : 'paddy',
                19 : 'papaya',
                20 : 'peperchili',
                21 : 'pineapple',
                22 : 'pomelo',
                23 : 'shallot',
                24 : 'soybeans',
                25 : 'spinach',
                26 : 'sweetpotatoes',
                27 : 'tobacco',
                28 : 'waterapple',
                29 : 'watermelon'}

In [33]:
def map_values(category_num):
    return output_dict.get(category_num)

In [34]:
# Load and preprocess images from file paths
def load_images(df):
    images = []
    for file_path in df['image:FILE']:
        # Read the image from file
        image = cv2.imread(file_path)

        # Resize the image
        image = cv2.resize(image, (32, 32))  # Adjust size as needed

        # Flatten the image to a 1D array
        flattened_image = image.flatten()

        # Append the flattened image to the list
        images.append(flattened_image)
    return np.array(images)

In [35]:
def prepare_data(file_path):
    # Read the csv file
    df = pd.read_csv(file_path)

    # Trim dataset for faster testing for now
    #df = df.sample(n=1000, random_state=42)

    # Add new column for name of class category
    df['label'] = df['category'].apply(map_values)

    # Append data/ in front of every image file path
    df['image:FILE'] = 'data/' + df['image:FILE']
    
    # Load images
    X = load_images(df)

    # Convert images to numpy arrays
    X = np.array(X)
    y = np.array(df['label'])

    return X, y

In [36]:
X_train, y_train = prepare_data('data/test.csv')
X_val, y_val = prepare_data('data/val.csv')
X_test, y_test = prepare_data('data/test.csv')

# Train Model

In [37]:
def finetune_parameters():
    model = KNeighborsClassifier()

    param_grid = {'n_neighbors': np.arange(1, 10),
                  'weights': ['uniform', 'distance'],
                  'metric': ['minkowski', 'euclidean']}

    randomized_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=6, cv=10, random_state=42)
    randomized_search.fit(X_train, y_train)

    best_params = randomized_search.best_params_
    print("Best Parameters:", best_params)

    # Get the best MLP model from the grid search
    best_model = randomized_search.best_estimator_

    val_predictions = best_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print("Validation Accuracy:", val_accuracy)

    return best_params

In [38]:
def KNN_train(X_train, y_train, X_test, y_test):
    best_params = finetune_parameters()
    

    best_model = KNeighborsClassifier(**best_params)    
    best_model.fit(X_train, y_train)

    test_predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print("Testing Accuracy:", test_accuracy)

    val_accuracy = cross_val_score(best_model, X_train, y_train, cv=10, scoring="accuracy")
    val_accuracy = np.mean(val_accuracy)

    # Get test accuracy
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Get confusion matrix
    confusion_mtrx = confusion_matrix(y_test, y_pred, normalize="true")

    return val_accuracy, test_accuracy, confusion_mtrx

In [39]:
def print_clf_results(val_accuracy, test_accuracy, confusion_mtrx, dataset_name):
    print(f"Performance of Multilayer Perceptron Classification on {dataset_name}:")
    print("Cross Validation Accuracy = ", val_accuracy)
    print("Test Accuracy = ", test_accuracy)
    print("Confusion Matrix:")
    print(confusion_mtrx.round(decimals=3)) # Round to 3 decimal places

In [40]:
val_acc_final, test_acc_final, confusion_mtrx_final = KNN_train(X_train, y_train, X_test, y_test)
print_clf_results(val_acc_final, test_acc_final, confusion_mtrx_final, "Final Dataset")

Best Parameters: {'weights': 'distance', 'n_neighbors': 7, 'metric': 'minkowski'}
Validation Accuracy: 0.443
Testing Accuracy: 0.9696666666666667
Performance of Multilayer Perceptron Classification on Final Dataset:
Cross Validation Accuracy =  0.5123333333333333
Test Accuracy =  0.9696666666666667
Confusion Matrix:
[[1.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.   ]
 [0.    1.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.   ]
 [0.    0.    1.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.   ]
 [0.    0.    0.    1.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    