# Recreating cassava disease detection using an SVM

In [1]:
# --- Cell 1: Imports and File Paths ---
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Set file paths for the E8 Trial (Trial 3)
UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/Uninfected/E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat'
UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'

print("Libraries imported and paths defined.")

Libraries imported and paths defined.


In [2]:
# --- Simple, Direct Test ---
import scipy.io
import numpy as np

# We will use the UCBSV file for this test as we have seen its structure in MATLAB
UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'

print(f"Loading file: {UCBSV_PATH}")

# Load the data. squeeze_me=True simplifies the nested structure.
mat_data = scipy.io.loadmat(UCBSV_PATH, squeeze_me=True)

# The 'Patch' key contains an array of struct-like objects.
# In this file, it's an array of 18 structs.
all_scans = mat_data['Patch']

# Access the first leaf scan's struct from the array
first_scan_struct = all_scans[0]

# From that single struct, access the 'mean_values' field.
# This is the pre-calculated 9x14 matrix of features seen in MATLAB.
mean_values_for_first_leaf = first_scan_struct['mean_values']

print("\n--- Verification Successful ---")
print(f"Type of extracted data: {type(mean_values_for_first_leaf)}")
print(f"Shape of extracted data: {mean_values_for_first_leaf.shape}")
print("\nMean values for the 9 patches of the first leaf:")
print(mean_values_for_first_leaf)

Loading file: data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat

--- Verification Successful ---
Type of extracted data: <class 'numpy.ndarray'>
Shape of extracted data: ()

Mean values for the 9 patches of the first leaf:
[[ 18.26475694  17.18098958  17.80555556  25.25694444  24.59548611
   24.61458333  25.88975694  23.25651042  23.6171875   21.74262153
   19.94270833  24.29123264  24.03776042 126.12022569]
 [ 19.30425347  18.21831597  18.87239583  25.87934028  25.06987847
   25.06336806  26.22135417  23.58810764  24.19704861  22.32855903
   20.63671875  24.69661458  24.30555556 123.49262153]
 [ 18.93142361  17.78862847  18.63888889  26.63758681  26.12847222
   25.92491319  27.57161458  24.67751736  25.06857639  23.02039931
   21.27647569  25.41840278  25.09635417 128.41710069]
 [ 16.72395833  16.09722222  15.64453125  20.52300347  19.93663194
   20.49869792  21.06814236  18.90625     19.37717014  17.

In [3]:
# --- Cell 2: Process the INFECTED (UCBSV) File ---
print(f"Processing file: {UCBSV_PATH}")

# Load the .mat file for the infected class. 
# 'squeeze_me=True' helps simplify the nested data structure loaded by scipy.
mat_data_ucbsv = scipy.io.loadmat(UCBSV_PATH, squeeze_me=True)

# Access the 'Patch' field, which contains an array of structs, one for each leaf scan.
all_scans_ucbsv = mat_data_ucbsv['Patch']

# Initialize a list to store the feature vectors for this class.
ucbsv_features = []

# Iterate through each leaf scan's struct.
for scan_struct in all_scans_ucbsv:
    # Extract the pre-calculated 'mean_values' from the struct.
    # .item() is used to retrieve the numpy array from its 0-d container.
    mean_vals_matrix = scan_struct['mean_values'].item()
    
    # The matrix is 9x14, where each row is a feature vector for one patch.
    # Add all 9 feature vectors to the list for this class.
    ucbsv_features.extend(mean_vals_matrix)

# Print a confirmation with the total number of patches processed.
print(f"Successfully processed {len(ucbsv_features)} patches from the UCBSV file.")

Processing file: data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat
Successfully processed 162 patches from the UCBSV file.


In [4]:
# --- Cell 3: Process the UNINFECTED File ---
print(f"Processing file: {UNINFECTED_PATH}")

# Load the .mat file for the uninfected class.
mat_data_uninfected = scipy.io.loadmat(UNINFECTED_PATH, squeeze_me=True)

# Access the 'Patch' field, containing an array of structs for each leaf scan.
all_scans_uninfected = mat_data_uninfected['Patch']

# Initialize a list to store the feature vectors for this class.
uninfected_features = []

# Iterate through each leaf scan's struct.
for scan_struct in all_scans_uninfected:
    # Extract the pre-calculated 'mean_values' from the struct.
    # .item() is used to retrieve the numpy array from its 0-d container.
    mean_vals_matrix = scan_struct['mean_values'].item()
    
    # The matrix is 9x14, where each row is a feature vector for one patch.
    # Add all 9 feature vectors to the list for this class.
    uninfected_features.extend(mean_vals_matrix)

# Print a confirmation with the total number of patches processed.
print(f"Successfully processed {len(uninfected_features)} patches from the Uninfected file.")

Processing file: data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/Uninfected/E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat
Successfully processed 162 patches from the Uninfected file.


In [5]:
# --- Cell 4: Combine Data and Finalize ---

# Create numerical labels for each class (0 for uninfected, 1 for infected).
uninfected_labels = [0] * len(uninfected_features)
ucbsv_labels = [1] * len(ucbsv_features)

# Combine the features and labels from both files into final numpy arrays.
X = np.array(uninfected_features + ucbsv_features)
y = np.array(uninfected_labels + ucbsv_labels)

# Print the final shapes to confirm the dataset is correctly assembled.
print(f"\nProcessing complete.")
print(f"Final feature matrix X shape: {X.shape}")
print(f"Final label vector y shape: {y.shape}")

# For easier inspection, create a pandas DataFrame.
# Define column names based on the wavelengths from the research paper.
wavelengths = [395, 415, 470, 528, 532, 550, 570, 585, 590, 610, 625, 640, 660, 880]
columns = [f'wl_{wl}' for wl in wavelengths]
df = pd.DataFrame(X, columns=columns)
df['label'] = y

# Display the top and bottom of the final DataFrame to verify its contents.
print("\n--- Final Dataset Head ---")
display(df.head())
print("\n--- Final Dataset Tail ---")
display(df.tail())


Processing complete.
Final feature matrix X shape: (324, 14)
Final label vector y shape: (324,)

--- Final Dataset Head ---


Unnamed: 0,wl_395,wl_415,wl_470,wl_528,wl_532,wl_550,wl_570,wl_585,wl_590,wl_610,wl_625,wl_640,wl_660,wl_880,label
0,19.703559,18.059462,16.894531,27.229601,26.131076,24.549045,28.139323,24.205295,25.448785,22.825955,20.521267,22.292969,21.75217,107.310764,0
1,17.602865,16.409722,15.253038,25.079427,24.518229,22.747396,26.451389,23.542969,24.570312,22.351997,19.956597,20.965278,20.43316,92.05816,0
2,18.036892,16.857639,15.898438,25.563368,24.887587,23.215712,26.603299,23.303819,24.248264,21.84592,19.69401,21.176649,20.75217,96.516059,0
3,19.746962,17.976128,17.099826,27.790365,26.640191,24.930122,28.705295,24.440104,25.649306,22.872396,20.459201,22.441406,22.03342,110.956163,0
4,18.998264,17.991319,16.598958,26.419271,25.56901,24.216146,27.644097,25.355903,25.124566,22.956597,20.721354,22.669705,22.957899,110.958333,0



--- Final Dataset Tail ---


Unnamed: 0,wl_395,wl_415,wl_470,wl_528,wl_532,wl_550,wl_570,wl_585,wl_590,wl_610,wl_625,wl_640,wl_660,wl_880,label
319,18.399414,17.708984,17.836914,25.652344,25.061523,24.613281,26.552734,24.635742,24.850586,23.331055,21.224609,23.93457,23.666016,110.960938,1
320,16.695312,16.180664,16.449219,23.918945,23.214844,22.682617,24.47168,21.979492,22.477539,20.779297,19.007812,21.821289,21.415039,101.84082,1
321,17.804688,16.208008,17.116211,23.941406,23.110352,23.979492,24.686523,24.301758,22.676758,21.613281,19.517578,24.28418,25.083984,136.611328,1
322,18.318359,16.760742,17.037109,23.227539,22.474609,23.113281,23.661133,22.624023,21.751953,20.505859,18.880859,22.994141,23.229492,118.53418,1
323,18.838867,16.842773,16.708008,23.207031,22.743164,22.996094,23.920898,21.993164,22.583984,20.851562,19.057617,22.569336,22.363281,110.27832,1


In [6]:
# --- Cell 5: Prepare Data for SVM (Scaling & Grouping) ---
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV
from sklearn.svm import SVC

# 1. Scale the features
# StandardScaler transforms the data to have a mean of 0 and standard deviation of 1.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. Create the 'groups' array for Leave-One-Leaf-Out Cross-Validation
# We have 18 uninfected scans and 18 infected scans, each with 9 patches.
num_leaves_per_class = 18
num_patches_per_leaf = 9
num_total_leaves = num_leaves_per_class * 2 # 36 total leaves

# Create an array where each patch is assigned a group ID (0 to 35) corresponding to its leaf.
# e.g., [0,0,0,0,0,0,0,0,0, 1,1,1,..., 35,35,35]
groups = np.repeat(np.arange(num_total_leaves), num_patches_per_leaf)

print("Features scaled successfully.")
print(f"Groups array created with shape: {groups.shape}. Unique groups: {len(np.unique(groups))}")

Features scaled successfully.
Groups array created with shape: (324,). Unique groups: 36


In [7]:
# --- Cell 6: Setup and Run SVM with Grid Search Cross-Validation ---

# 1. Define the parameter grid for the SVM
# These are the hyperparameters we want to test.
param_grid = {
    'C': [1, 10, 100], 
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf'] # Using the Radial Basis Function kernel
}

# 2. Define the cross-validation strategy
# LeaveOneGroupOut() will perform the "leave-one-leaf-out" evaluation from the paper.
logo = LeaveOneGroupOut()

# 3. Create the GridSearchCV object
# This will test all hyperparameter combinations using our specific cross-validation strategy.
# 'n_jobs=-1' uses all available CPU cores to speed up the process.
svm_grid_search = GridSearchCV(
    estimator=SVC(), 
    param_grid=param_grid, 
    cv=logo, 
    scoring='accuracy',
    n_jobs=-1
)

# 4. Run the Grid Search
# training 36 different models for each hyperparameter combination.
print("Starting Grid Search with Leave-One-Group-Out Cross-Validation...")
svm_grid_search.fit(X_scaled, y, groups=groups)
print("Grid Search complete.")

# 5. Display the results
print("\n--- Results ---")
print(f"Best Parameters Found: {svm_grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {svm_grid_search.best_score_ * 100:.2f}%")

print(f"\nTarget accuracy from paper: 90.8 ± 11.3%")

Starting Grid Search with Leave-One-Group-Out Cross-Validation...
Grid Search complete.

--- Results ---
Best Parameters Found: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best Cross-Validation Accuracy: 87.35%

Target accuracy from paper: 90.8 ± 11.3%


In [8]:
# --- Cell 7: Analyze the Worst Performing Model ---

# The results for all runs are stored in the .cv_results_ attribute
cv_results = svm_grid_search.cv_results_

# Find the index (position) of the combination with the lowest average score
worst_score_index = np.argmin(cv_results['mean_test_score'])

# Use that index to get the worst score and the parameters that caused it
worst_accuracy = cv_results['mean_test_score'][worst_score_index]
worst_parameters = cv_results['params'][worst_score_index]

print("--- Worst Performer Analysis ---")
print(f"Worst Cross-Validation Accuracy: {worst_accuracy * 100:.2f}%")
print(f"Parameters for Worst Performer: {worst_parameters}")

--- Worst Performer Analysis ---
Worst Cross-Validation Accuracy: 57.10%
Parameters for Worst Performer: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}


# Further Experiments: Performing cross trials of data

In [1]:
import scipy.io
import numpy as np
import os

def load_and_prepare_data(base_path, trial_name, class_name, date_folder, file_id):
    """
    Loads and prepares spectral data from a single .mat file.

    Args:
        base_path (str): The root path to the dataset (e.g., 'data/').
        trial_name (str): The trial identifier (e.g., 'E8-TME204').
        class_name (str): The class folder ('Uninfected' or 'UCBSV').
        date_folder (str): The specific date folder for the trial.
        file_id (str): The unique identifier part of the .mat file name.

    Returns:
        tuple: A tuple containing:
            - np.array: A NumPy array of the feature vectors.
            - int: The number of leaves (scans) found in the file.
    """
    # --- THIS IS THE CORRECTED PART ---
    # Construct the filename and then the full path
    file_name = f"{trial_name.split('-')[0]}_TME204_28dpi_{date_folder.split('_')[-1]}_{file_id}.mat"
    full_path = os.path.join(base_path, trial_name, date_folder, class_name, file_name)
    
    # Load the .mat file
    mat_data = scipy.io.loadmat(full_path, squeeze_me=True)
    
    # Extract the 'Patch' data
    all_scans = mat_data['Patch']
    
    # Handle case where there's only one scan
    if all_scans.ndim == 0:
        all_scans = [all_scans]
        
    num_leaves = len(all_scans)
    
    # Extract mean values from each scan
    features = []
    for scan_struct in all_scans:
        mean_vals_matrix = scan_struct['mean_values'].item()
        features.extend(mean_vals_matrix)
        
    return np.array(features), num_leaves

In [4]:
# --- Define Dataset Paths ---

BASE_DATA_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/'

# E4 Trial Details
E4_TRIAL = 'E4-TME204'
E4_DATE = 'E4_TME204_28dpi_Mar_5_2020'
E4_UNINFECTED_ID = 'Uninfected_11_06_23_18'
E4_UCBSV_ID = 'UCBSV_11_06_23_21'

# E6 Trial Details
E6_TRIAL = 'E6-TME204'
E6_DATE = 'E6_TME204_28dpi_Jul_29_2020'
E6_UNINFECTED_ID = 'Uninfected_11_06_23_29'
E6_UCBSV_ID = 'UCBSV_11_06_23_24'

# E8 Trial Details
E8_TRIAL = 'E8-TME204'
E8_DATE = 'E8_TME204_28dpi_Dec_16_2020'
E8_UNINFECTED_ID = 'Uninfected_11_06_23_33'
E8_UCBSV_ID = 'UCBSV_11_06_23_36'

# Experiment 1: Train on E8, test on E6

In [None]:
# --- Load E8 Training Data ---
e8_uninfected_features, _ = load_and_prepare_data(BASE_DATA_PATH, E8_TRIAL, 'Uninfected', E8_DATE, E8_UNINFECTED_ID)
e8_ucbsv_features, _ = load_and_prepare_data(BASE_DATA_PATH, E8_TRIAL, 'UCBSV', E8_DATE, E8_UCBSV_ID)

# Create labels (0 for uninfected, 1 for infected)
y_train_e8 = np.concatenate([np.zeros(len(e8_uninfected_features)), np.ones(len(e8_ucbsv_features))])
X_train_e8 = np.concatenate([e8_uninfected_features, e8_ucbsv_features])

print(f"Training data (E8) loaded. X shape: {X_train_e8.shape}, y shape: {y_train_e8.shape}")

In [None]:
# --- Load E6 Testing Data ---
e6_uninfected_features, num_uninfected_leaves_e6 = load_and_prepare_data(BASE_DATA_PATH, E6_TRIAL, 'Uninfected', E6_DATE, E6_UNINFECTED_ID)
e6_ucbsv_features, num_ucbsv_leaves_e6 = load_and_prepare_data(BASE_DATA_PATH, E6_TRIAL, 'UCBSV', E6_DATE, E6_UCBSV_ID)

# Create labels
y_test_e6 = np.concatenate([np.zeros(len(e6_uninfected_features)), np.ones(len(e6_ucbsv_features))])
X_test_e6 = np.concatenate([e6_uninfected_features, e6_ucbsv_features])

print(f"Testing data (E6) loaded. X shape: {X_test_e6.shape}, y shape: {y_test_e6.shape}")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# --- Train and Evaluate ---

# 1. Scale the data (fit on training data, transform both)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_e8)
X_test_scaled = scaler.transform(X_test_e6)

# 2. Train the SVM using the best parameters from your previous work
svm = SVC(C=100, gamma=0.01, kernel='rbf')
svm.fit(X_train_scaled, y_train_e8)

# 3. Predict on the patch level
y_pred_patch = svm.predict(X_test_scaled)
patch_accuracy = accuracy_score(y_test_e6, y_pred_patch)
print(f"--- Results: Train E8 / Test E6 ---")
print(f"Overall Patch-Based Accuracy: {patch_accuracy * 100:.2f}%")

# 4. Implement and evaluate using Majority Voting per leaf
num_patches_per_leaf = 9
y_pred_leaf = []
y_true_leaf = []

# Process Uninfected Leaves
for i in range(num_uninfected_leaves_e6):
    start = i * num_patches_per_leaf
    end = start + num_patches_per_leaf
    patch_predictions = y_pred_patch[start:end]
    # Predict for the leaf based on the mode of its patch predictions
    y_pred_leaf.append(np.bincount(patch_predictions.astype(int)).argmax())
    y_true_leaf.append(0) # True label is Uninfected

# Process Infected Leaves
offset = len(e6_uninfected_features)
for i in range(num_ucbsv_leaves_e6):
    start = offset + (i * num_patches_per_leaf)
    end = start + num_patches_per_leaf
    patch_predictions = y_pred_patch[start:end]
    y_pred_leaf.append(np.bincount(patch_predictions.astype(int)).argmax())
    y_true_leaf.append(1) # True label is Infected
    
leaf_accuracy = accuracy_score(y_true_leaf, y_pred_leaf)
print(f"Leaf-Based Accuracy (Majority Vote): {leaf_accuracy * 100:.2f}%")