In [2]:
import os
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Data Preparation

In [3]:
# Dataset path
plant_diseases_path = "//Users/firmansyahsundana/Documents/research/computer_science/plant-diseases-classification/glcm_plant_diseases_dataset.csv"
plant_diseases_test_path = "//Users/firmansyahsundana/Documents/research/computer_science/plant-diseases-classification/glcm_plant_diseases_test_dataset.csv"

## Load Dataset

In [4]:
df_train = pd.read_csv(plant_diseases_path) # Load train data
df_train.head()

Unnamed: 0.1,Unnamed: 0,dissimilarity_0,dissimilarity_45,dissimilarity_90,dissimilarity_135,correlation_0,correlation_45,correlation_90,correlation_135,homogeneity_0,...,contrast_135,ASM_0,ASM_45,ASM_90,ASM_135,energy_0,energy_45,energy_90,energy_135,label
0,0,26.078888,29.719781,30.291101,29.058736,0.732233,0.648406,0.632057,0.660605,0.052624,...,1773.189783,8.5e-05,7.7e-05,7.7e-05,7.8e-05,0.009218,0.008773,0.008762,0.008806,Strawberry___healthy
1,1,27.608612,29.424619,28.767057,29.325003,0.75659,0.73081,0.742741,0.72997,0.162452,...,2204.405596,0.011748,0.010511,0.011895,0.010564,0.108389,0.102523,0.109066,0.10278,Strawberry___healthy
2,2,26.030441,26.325476,25.095353,25.727686,0.600527,0.595977,0.632399,0.615211,0.049289,...,1291.368119,8.8e-05,8.6e-05,8.8e-05,8.6e-05,0.00937,0.009265,0.009379,0.009301,Strawberry___healthy
3,3,22.615802,24.507716,23.441826,23.408132,0.783122,0.754102,0.773607,0.769547,0.167986,...,1370.73271,0.012138,0.010932,0.012201,0.011084,0.110171,0.104555,0.110457,0.105282,Strawberry___healthy
4,4,28.360153,29.309177,28.974057,29.479907,0.738818,0.726535,0.728838,0.716637,0.165986,...,2274.678005,0.012224,0.01102,0.012521,0.01114,0.110563,0.104974,0.111899,0.105548,Strawberry___healthy


In [5]:
df_test = pd.read_csv(plant_diseases_test_path) # Load test data
df_test.head()

Unnamed: 0.1,Unnamed: 0,dissimilarity_0,dissimilarity_45,dissimilarity_90,dissimilarity_135,correlation_0,correlation_45,correlation_90,correlation_135,homogeneity_0,...,contrast_135,ASM_0,ASM_45,ASM_90,ASM_135,energy_0,energy_45,energy_90,energy_135,label
0,0,25.277359,26.979655,25.85066,25.510551,0.597469,0.548691,0.578126,0.594143,0.051053,...,1274.61215,9.4e-05,9.2e-05,9.6e-05,9.3e-05,0.009716,0.00959,0.009808,0.009662,Strawberry___healthy
1,1,21.082716,23.057634,23.533024,23.777778,0.78864,0.731416,0.717956,0.716353,0.080415,...,1262.496756,0.000129,0.000117,0.000118,0.000116,0.011339,0.010835,0.010852,0.010769,Strawberry___healthy
2,2,22.908258,22.557555,21.617234,23.444602,0.721122,0.728292,0.744362,0.704842,0.068138,...,1221.181784,0.000121,0.000117,0.000121,0.000116,0.01099,0.010821,0.011019,0.010793,Strawberry___healthy
3,3,25.961762,25.814831,23.771399,25.607159,0.732911,0.734962,0.763904,0.731118,0.159589,...,1679.594687,0.011846,0.010817,0.011796,0.010644,0.108839,0.104006,0.108611,0.103171,Strawberry___healthy
4,4,24.871903,24.400857,24.69248,25.713924,0.605828,0.621823,0.607185,0.575886,0.06065,...,1351.438823,0.000115,0.000111,0.000112,0.00011,0.010722,0.010531,0.010572,0.010499,Strawberry___healthy


In [6]:
X_train = df_train.iloc[:,1:-1]
X_test = df_test.iloc[:,1:-1]

In [7]:
X_train.columns

Index(['dissimilarity_0', 'dissimilarity_45', 'dissimilarity_90',
       'dissimilarity_135', 'correlation_0', 'correlation_45',
       'correlation_90', 'correlation_135', 'homogeneity_0', 'homogeneity_45',
       'homogeneity_90', 'homogeneity_135', 'contrast_0', 'contrast_45',
       'contrast_90', 'contrast_135', 'ASM_0', 'ASM_45', 'ASM_90', 'ASM_135',
       'energy_0', 'energy_45', 'energy_90', 'energy_135'],
      dtype='object')

In [8]:
X_test.columns

Index(['dissimilarity_0', 'dissimilarity_45', 'dissimilarity_90',
       'dissimilarity_135', 'correlation_0', 'correlation_45',
       'correlation_90', 'correlation_135', 'homogeneity_0', 'homogeneity_45',
       'homogeneity_90', 'homogeneity_135', 'contrast_0', 'contrast_45',
       'contrast_90', 'contrast_135', 'ASM_0', 'ASM_45', 'ASM_90', 'ASM_135',
       'energy_0', 'energy_45', 'energy_90', 'energy_135'],
      dtype='object')

In [9]:
y_train = df_train.label
y_test = df_test.label

In [10]:
y_train

0        Strawberry___healthy
1        Strawberry___healthy
2        Strawberry___healthy
3        Strawberry___healthy
4        Strawberry___healthy
                 ...         
70290       Soybean___healthy
70291       Soybean___healthy
70292       Soybean___healthy
70293       Soybean___healthy
70294       Soybean___healthy
Name: label, Length: 70295, dtype: object

In [11]:
y_test

0        Strawberry___healthy
1        Strawberry___healthy
2        Strawberry___healthy
3        Strawberry___healthy
4        Strawberry___healthy
                 ...         
17567       Soybean___healthy
17568       Soybean___healthy
17569       Soybean___healthy
17570       Soybean___healthy
17571       Soybean___healthy
Name: label, Length: 17572, dtype: object

In [12]:
X_train.head()

Unnamed: 0,dissimilarity_0,dissimilarity_45,dissimilarity_90,dissimilarity_135,correlation_0,correlation_45,correlation_90,correlation_135,homogeneity_0,homogeneity_45,...,contrast_90,contrast_135,ASM_0,ASM_45,ASM_90,ASM_135,energy_0,energy_45,energy_90,energy_135
0,26.078888,29.719781,30.291101,29.058736,0.732233,0.648406,0.632057,0.660605,0.052624,0.046782,...,1911.066002,1773.189783,8.5e-05,7.7e-05,7.7e-05,7.8e-05,0.009218,0.008773,0.008762,0.008806
1,27.608612,29.424619,28.767057,29.325003,0.75659,0.73081,0.742741,0.72997,0.162452,0.15247,...,2129.446838,2204.405596,0.011748,0.010511,0.011895,0.010564,0.108389,0.102523,0.109066,0.10278
2,26.030441,26.325476,25.095353,25.727686,0.600527,0.595977,0.632399,0.615211,0.049289,0.049763,...,1231.657013,1291.368119,8.8e-05,8.6e-05,8.8e-05,8.6e-05,0.00937,0.009265,0.009379,0.009301
3,22.615802,24.507716,23.441826,23.408132,0.783122,0.754102,0.773607,0.769547,0.167986,0.157902,...,1370.528698,1370.73271,0.012138,0.010932,0.012201,0.011084,0.110171,0.104555,0.110457,0.105282
4,28.360153,29.309177,28.974057,29.479907,0.738818,0.726535,0.728838,0.716637,0.165986,0.158304,...,2205.851952,2274.678005,0.012224,0.01102,0.012521,0.01114,0.110563,0.104974,0.111899,0.105548


In [13]:
# Normalize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [14]:
X_train

array([[ 0.97858459,  1.37493799,  1.58920925, ..., -0.35408714,
        -0.35947886, -0.3538509 ],
       [ 1.20393585,  1.33259268,  1.36877843, ...,  0.87799134,
         0.91912978,  0.88059317],
       [ 0.97144765,  0.88797598,  0.83771998, ..., -0.34761649,
        -0.35162247, -0.34735829],
       ...,
       [ 0.79401225,  0.94893886,  0.76912755, ..., -0.33477364,
        -0.33797217, -0.33293613],
       [ 0.00783913, -0.14877017, -0.05070656, ..., -0.24766527,
        -0.25233819, -0.24439758],
       [ 0.49652356,  0.47710857,  0.53154693, ..., -0.32529772,
        -0.33107657, -0.32575627]])

# Feature Selection

## Principal Component Analysis

In [15]:
from sklearn.decomposition import PCA

for i in range(0, 24):
    pca = PCA(n_components=i)
    pca.fit_transform(X_train)

    print(f"Sum of Variance Ratio in {str(i)} Components: {sum(pca.explained_variance_ratio_)}")

Sum of Variance Ratio in 0 Components: 0
Sum of Variance Ratio in 1 Components: 0.5021303085854295
Sum of Variance Ratio in 2 Components: 0.8640337606474621
Sum of Variance Ratio in 3 Components: 0.9548978063517575
Sum of Variance Ratio in 4 Components: 0.9788397040991675
Sum of Variance Ratio in 5 Components: 0.985865028993289
Sum of Variance Ratio in 6 Components: 0.9920279824202791
Sum of Variance Ratio in 7 Components: 0.9969504484087754
Sum of Variance Ratio in 8 Components: 0.9980221307910482
Sum of Variance Ratio in 9 Components: 0.9986743710376036
Sum of Variance Ratio in 10 Components: 0.9990907012789609
Sum of Variance Ratio in 11 Components: 0.9994623664925856
Sum of Variance Ratio in 12 Components: 0.9996689443376908
Sum of Variance Ratio in 13 Components: 0.9997938034765984
Sum of Variance Ratio in 14 Components: 0.9998469936322447
Sum of Variance Ratio in 15 Components: 0.9998983427288125
Sum of Variance Ratio in 16 Components: 0.9999364427083565
Sum of Variance Ratio in 

In [16]:
pca = PCA(n_components=3)
X_train = pca.fit_transform(X_train)

print(sum(pca.explained_variance_ratio_))

0.9548978063517586


In [21]:
X_test = pca.fit_transform(X_test)

print(sum(pca.explained_variance_ratio_))

0.9565327016967404


# Bulid Model

## SVM

In [22]:
# Build model
svm = SVC(kernel='linear', decision_function_shape='ovr', random_state=42)

In [23]:
# Train data
svm.fit(X_train, y_train)

In [24]:
# Make prediction
y_pred = svm.predict(X_test)


In [25]:
print(classification_report(y_test, y_pred))

                                                    precision    recall  f1-score   support

                                Apple___Apple_scab       0.08      0.01      0.02       504
                                 Apple___Black_rot       0.18      0.16      0.17       497
                          Apple___Cedar_apple_rust       0.37      0.31      0.34       440
                                   Apple___healthy       0.18      0.24      0.20       502
                               Blueberry___healthy       0.08      0.20      0.12       454
          Cherry_(including_sour)___Powdery_mildew       0.14      0.10      0.12       421
                 Cherry_(including_sour)___healthy       0.19      0.38      0.26       456
Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot       0.00      0.00      0.00       410
                       Corn_(maize)___Common_rust_       0.78      0.74      0.76       477
               Corn_(maize)___Northern_Leaf_Blight       0.18      0.30      0.

## Naive Bayes

In [28]:
# Scaling Data Min Max 0 - 1
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train = min_max_scaler.transform(X_train)

In [29]:
# Build naive bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [30]:
# Make prediction
y_nb_pred = nb.predict(X_test)

In [34]:
print(classification_report(y_test, y_nb_pred))

                                                    precision    recall  f1-score   support

                                Apple___Apple_scab       0.00      0.00      0.00       504
                                 Apple___Black_rot       0.00      0.00      0.00       497
                          Apple___Cedar_apple_rust       0.50      0.00      0.00       440
                                   Apple___healthy       0.00      0.00      0.00       502
                               Blueberry___healthy       0.00      0.00      0.00       454
          Cherry_(including_sour)___Powdery_mildew       0.00      0.00      0.00       421
                 Cherry_(including_sour)___healthy       0.00      0.00      0.00       456
Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot       0.00      0.00      0.00       410
                       Corn_(maize)___Common_rust_       0.22      0.91      0.36       477
               Corn_(maize)___Northern_Leaf_Blight       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Evaluation

In [71]:
# Evaluate model
svm_acc = accuracy_score(y_test, y_pred)
svm_mae = mean_absolute_error(y_test, y_pred)
print("SVM Accuracy: ", svm_acc)
print("MAE SVM: ", svm_mae)
print("F1 Score SVM: ", f1_score(y_test, y_pred, average="weighted"))

nb_acc = accuracy_score(y_test, y_nb_pred)
nb_mae = mean_absolute_error(y_test, y_nb_pred)
print("Naive Bayes Accuracy: ", nb_acc)
print("MAE NB: ", nb_mae)
print("F1 Score NB: ", f1_score(y_test, y_nb_pred, average="weighted"))

SVM Accuracy:  0.3894832688367858
MAE SVM:  7.112508536307763
F1 Score SVM:  0.37775633582351215


ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.