# Training on E8, testing on E11

In [1]:
import scipy.io
import numpy as np
import os
import random
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings

# Suppress warnings for cleaner output
# NOTE(review): called with only the action argument, this filter matches every
# warning category from every module, so any diagnostics emitted by the
# libraries imported above are hidden for the rest of the session. Consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Define Dataset Paths ---
# Each path is written as adjacent string literals (dataset root / session
# folder + class folder / file name); Python joins them into one string, so the
# resulting values are exactly the single-line paths.
E8_UNINFECTED_PATH = (
    'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/'
    'E8_TME204_28dpi_Dec_16_2020/Uninfected/'
    'E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat'
)
E8_UCBSV_PATH = (
    'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/'
    'E8_TME204_28dpi_Dec_16_2020/UCBSV/'
    'E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'
)
E11_UNINFECTED_PATH = (
    'data/TME204-Patch_E11_E12_E16_E18_28DPI_Dataset/E11-TME204/'
    'E11_TME204_28dpi_Apr_22_2022/Uninfected/'
    'E11_TME204_28dpi_Apr_22_2022_Uninfected_18_06_19_35.mat'
)
E11_UCBSV_PATH = (
    'data/TME204-Patch_E11_E12_E16_E18_28DPI_Dataset/E11-TME204/'
    'E11_TME204_28dpi_Apr_22_2022/UCBSV/'
    'E11_TME204_28dpi_Apr_22_2022_UCBSV_18_06_20_21.mat'
)
print("File paths defined.")

def load_scans_from_mat(file_path):
    """Read the per-scan ``mean_values`` entries from a MATLAB ``.mat`` file.

    The file is loaded with ``squeeze_me=True`` and its ``'Patch'`` variable is
    treated as a sequence of records, one per scan.

    Args:
        file_path: Path of the ``.mat`` file to read.

    Returns:
        A list with one item per record in ``'Patch'``: the record's
        ``'mean_values'`` field, unwrapped with ``.item()`` when it is
        0-dimensional and returned as-is otherwise.
        (Presumably each item is a 2-D array with one row per patch -- confirm
        against the data files.)
    """
    # squeeze_me collapses a single record to a 0-d array; bring it back to
    # one dimension so that it can be iterated like the multi-record case.
    patch_records = np.atleast_1d(scipy.io.loadmat(file_path, squeeze_me=True)['Patch'])
    per_scan_values = (record['mean_values'] for record in patch_records)
    return [values.item() if values.ndim == 0 else values for values in per_scan_values]

print("Data loading function defined.")

File paths defined.
Data loading function defined.


In [3]:
# --- Configuration ---
N_REPEATS = 100
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
CV_FOLDS = 20  # 34 scans for training/validation, 20-fold CV

# --- Data Loading (once at the beginning) ---
e8_uninfected_scans = load_scans_from_mat(E8_UNINFECTED_PATH)
e8_ucbsv_scans = load_scans_from_mat(E8_UCBSV_PATH)
e11_uninfected_scans = load_scans_from_mat(E11_UNINFECTED_PATH)
e11_ucbsv_scans = load_scans_from_mat(E11_UCBSV_PATH)
print(f'Loaded E8 data: {len(e8_uninfected_scans)} uninfected, {len(e8_ucbsv_scans)} infected.')
print(f'Loaded E11 data: {len(e11_uninfected_scans)} uninfected, {len(e11_ucbsv_scans)} infected.')

# Prepare the full E11 test set (all scans, all patches).
# The label vector is sized from the number of rows each scan actually
# contributes to the concatenated feature matrix, rather than assuming a fixed
# number of rows per scan, so features and labels cannot drift out of step.
X_test_e11 = np.concatenate(e11_uninfected_scans + e11_ucbsv_scans)
n_rows_e11_uninfected = sum(len(scan) for scan in e11_uninfected_scans)
n_rows_e11_ucbsv = sum(len(scan) for scan in e11_ucbsv_scans)
y_test_e11 = np.concatenate([np.full(n_rows_e11_uninfected, 0), np.full(n_rows_e11_ucbsv, 1)])
assert len(X_test_e11) == len(y_test_e11), 'E11 test features and labels are misaligned'

# --- Result Storage ---
e8_holdout_accuracies = []
e11_external_accuracies = []
best_params_list = []

print(f'Starting {N_REPEATS} iterations of the experiment...')

Loaded E8 data: 18 uninfected, 18 infected.
Loaded E11 data: 6 uninfected, 6 infected.
Starting 100 iterations of the experiment...


In [4]:
# --- Main Loop ---
# Reset the accumulators here so this cell can be re-run on its own without
# appending to (or failing on) results left over from an earlier run.
e8_holdout_accuracies = []
e11_external_accuracies = []
best_params_list = []

# Seeded, dedicated generator: the hold-out draws are the same on every run
# and do not depend on the state of the global `random` module.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

for i in range(N_REPEATS):
    print(f'--- Iteration {i + 1}/{N_REPEATS} ---')
    # 1. Split E8 data: isolate 1 scan of each class for hold-out testing.
    #    The held-out scan is chosen by index, so the loaded lists are never
    #    reordered and the cell gives the same splits whenever it is re-run.
    held_uninfected = split_rng.randrange(len(e8_uninfected_scans))
    held_ucbsv = split_rng.randrange(len(e8_ucbsv_scans))
    train_uninfected = [scan for k, scan in enumerate(e8_uninfected_scans) if k != held_uninfected]
    train_ucbsv = [scan for k, scan in enumerate(e8_ucbsv_scans) if k != held_ucbsv]
    e8_test_scans = [e8_uninfected_scans[held_uninfected], e8_ucbsv_scans[held_ucbsv]]
    e8_train_val_scans = train_uninfected + train_ucbsv
    # 2. Prepare training/validation and E8-holdout sets.
    #    Labels are sized from the rows each scan contributes, not from an
    #    assumed fixed row count per scan.
    X_train_val = np.concatenate(e8_train_val_scans)
    y_train_val = np.concatenate([
        np.full(sum(len(scan) for scan in train_uninfected), 0),
        np.full(sum(len(scan) for scan in train_ucbsv), 1),
    ])
    # Hold-out arrays get their own names so they cannot be mistaken for (or
    # overwrite) a full-dataset X_test_e8 / y_test_e8 defined in another cell.
    X_holdout_e8 = np.concatenate(e8_test_scans)
    y_holdout_e8 = np.concatenate([
        np.full(len(e8_test_scans[0]), 0),
        np.full(len(e8_test_scans[1]), 1),
    ])
    # 3. Scale data: Fit ONLY on the training/validation data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_holdout_e8_scaled = scaler.transform(X_holdout_e8)
    X_test_e11_scaled = scaler.transform(X_test_e11)
    # 4. Hyperparameter tuning using GridSearchCV on the 34-scan set
    # NOTE(review): StratifiedKFold splits the individual rows of
    # X_train_val_scaled, so rows that come from the same scan can land in both
    # the fitting and the validation part of a fold. If rows of one scan are
    # correlated this favours over-fitted settings; a group-aware splitter
    # keyed on the scan (e.g. StratifiedGroupKFold) would avoid that -- confirm
    # before changing, since it alters the tuning protocol.
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    svm = SVC()
    grid_search = GridSearchCV(svm, PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)
    best_params = grid_search.best_params_
    best_params_list.append(best_params)
    # 5. Final model: GridSearchCV (refit=True by default) has already refitted
    #    an SVC with the best parameters on the entire training/validation set,
    #    so reuse it instead of fitting an identical model a second time.
    final_model = grid_search.best_estimator_
    # 6. Test and record accuracies
    # Test on E8 hold-out set
    y_pred_e8 = final_model.predict(X_holdout_e8_scaled)
    e8_holdout_accuracies.append(accuracy_score(y_holdout_e8, y_pred_e8))
    # Test on external E11 set
    y_pred_e11 = final_model.predict(X_test_e11_scaled)
    e11_external_accuracies.append(accuracy_score(y_test_e11, y_pred_e11))

--- Iteration 1/100 ---
--- Iteration 2/100 ---
--- Iteration 3/100 ---
--- Iteration 4/100 ---
--- Iteration 5/100 ---
--- Iteration 6/100 ---
--- Iteration 7/100 ---
--- Iteration 8/100 ---
--- Iteration 9/100 ---
--- Iteration 10/100 ---
--- Iteration 11/100 ---
--- Iteration 12/100 ---
--- Iteration 13/100 ---
--- Iteration 14/100 ---
--- Iteration 15/100 ---
--- Iteration 16/100 ---
--- Iteration 17/100 ---
--- Iteration 18/100 ---
--- Iteration 19/100 ---
--- Iteration 20/100 ---
--- Iteration 21/100 ---
--- Iteration 22/100 ---
--- Iteration 23/100 ---
--- Iteration 24/100 ---
--- Iteration 25/100 ---
--- Iteration 26/100 ---
--- Iteration 27/100 ---
--- Iteration 28/100 ---
--- Iteration 29/100 ---
--- Iteration 30/100 ---
--- Iteration 31/100 ---
--- Iteration 32/100 ---
--- Iteration 33/100 ---
--- Iteration 34/100 ---
--- Iteration 35/100 ---
--- Iteration 36/100 ---
--- Iteration 37/100 ---
--- Iteration 38/100 ---
--- Iteration 39/100 ---
--- Iteration 40/100 ---
--- Itera

In [5]:
# --- Analyze and Report Final Results ---
# Summarise from arrays held under their own names. The accumulator lists are
# deliberately left as lists: rebinding them to ndarrays would make the main
# loop's `.append(...)` calls fail if that cell is run again afterwards.
e8_holdout_arr = np.asarray(e8_holdout_accuracies, dtype=float)
e11_external_arr = np.asarray(e11_external_accuracies, dtype=float)

# E8 Hold-out Results
mean_e8 = np.mean(e8_holdout_arr)
std_e8 = np.std(e8_holdout_arr)
print('--- Test on E8 Hold-out Set (2 leaves) ---')
print(f'Method: In each of {N_REPEATS} iterations, 1 uninfected and 1 infected leaf from E8 were held out for testing.')
print('The model was trained on the remaining 34 leaves from E8.')
print(f'Mean Accuracy: {mean_e8 * 100:.2f}%')
print(f'Standard Deviation: {std_e8 * 100:.2f}%')
print('-' * 40)

# E11 External Test Results
mean_e11 = np.mean(e11_external_arr)
std_e11 = np.std(e11_external_arr)
print('\n--- Test on External E11 Dataset (12 leaves) ---')
print(f'Method: The same {N_REPEATS} models trained on E8 were tested against the full, unseen E11 dataset.')
print(f'Mean Accuracy: {mean_e11 * 100:.2f}%')
print(f'Standard Deviation: {std_e11 * 100:.2f}%')
print('-' * 40)

# Hyperparameter Analysis
# One row per iteration; value_counts() tallies how often each
# (C, gamma, kernel) combination was selected.
print('\n--- Hyperparameter Selection ---')
params_df = pd.DataFrame(best_params_list)
print('Frequency of best parameters selected by GridSearchCV across all iterations:')
print(params_df.value_counts())

--- Test on E8 Hold-out Set (2 leaves) ---
Method: In each of 100 iterations, 1 uninfected and 1 infected leaf from E8 were held out for testing.
The model was trained on the remaining 34 leaves from E8.
Mean Accuracy: 87.28%
Standard Deviation: 16.58%
----------------------------------------

--- Test on External E11 Dataset (12 leaves) ---
Method: The same 100 models trained on E8 were tested against the full, unseen E11 dataset.
Mean Accuracy: 49.94%
Standard Deviation: 2.80%
----------------------------------------

--- Hyperparameter Selection ---
Frequency of best parameters selected by GridSearchCV across all iterations:
C    gamma  kernel
100  0.01   rbf       97
     0.10   rbf        2
10   0.10   rbf        1
Name: count, dtype: int64


# Training on E11, testing on E8

In [6]:
# --- Define Dataset Paths ---
# Same four dataset locations as in the first experiment's setup cell, written
# as adjacent string literals (dataset root / session folder + class folder /
# file name); Python joins them, so the values are the single-line paths.
E8_UNINFECTED_PATH = (
    'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/'
    'E8_TME204_28dpi_Dec_16_2020/Uninfected/'
    'E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat'
)
E8_UCBSV_PATH = (
    'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/'
    'E8_TME204_28dpi_Dec_16_2020/UCBSV/'
    'E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'
)
E11_UNINFECTED_PATH = (
    'data/TME204-Patch_E11_E12_E16_E18_28DPI_Dataset/E11-TME204/'
    'E11_TME204_28dpi_Apr_22_2022/Uninfected/'
    'E11_TME204_28dpi_Apr_22_2022_Uninfected_18_06_19_35.mat'
)
E11_UCBSV_PATH = (
    'data/TME204-Patch_E11_E12_E16_E18_28DPI_Dataset/E11-TME204/'
    'E11_TME204_28dpi_Apr_22_2022/UCBSV/'
    'E11_TME204_28dpi_Apr_22_2022_UCBSV_18_06_20_21.mat'
)
print("File paths defined.")

def load_scans_from_mat(file_path):
    """Read the per-scan ``mean_values`` entries from a MATLAB ``.mat`` file.

    This repeats the loader defined in the first experiment's setup cell. The
    file is loaded with ``squeeze_me=True`` and its ``'Patch'`` variable is
    treated as a sequence of records, one per scan.

    Args:
        file_path: Path of the ``.mat`` file to read.

    Returns:
        A list with one item per record in ``'Patch'``: the record's
        ``'mean_values'`` field, unwrapped with ``.item()`` when it is
        0-dimensional and returned as-is otherwise.
        (Presumably each item is a 2-D array with one row per patch -- confirm
        against the data files.)
    """
    # squeeze_me collapses a single record to a 0-d array; bring it back to
    # one dimension so that it can be iterated like the multi-record case.
    patch_records = np.atleast_1d(scipy.io.loadmat(file_path, squeeze_me=True)['Patch'])
    per_scan_values = (record['mean_values'] for record in patch_records)
    return [values.item() if values.ndim == 0 else values for values in per_scan_values]

print("Data loading function defined.")

File paths defined.
Data loading function defined.


In [7]:
# --- Configuration ---
N_REPEATS = 100
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
CV_FOLDS = 6  # 10 scans for training/validation (12 loaded, 2 held out per iteration), 6-fold CV

# --- Data Loading (once at the beginning) ---
e11_uninfected_scans = load_scans_from_mat(E11_UNINFECTED_PATH)
e11_ucbsv_scans = load_scans_from_mat(E11_UCBSV_PATH)
e8_uninfected_scans = load_scans_from_mat(E8_UNINFECTED_PATH)
e8_ucbsv_scans = load_scans_from_mat(E8_UCBSV_PATH)
print(f'Loaded E11 data: {len(e11_uninfected_scans)} uninfected, {len(e11_ucbsv_scans)} infected.')
print(f'Loaded E8 data: {len(e8_uninfected_scans)} uninfected, {len(e8_ucbsv_scans)} infected.')

# Prepare the full E8 test set (all scans, all patches).
# The label vector is sized from the number of rows each scan actually
# contributes to the concatenated feature matrix, rather than assuming a fixed
# number of rows per scan, so features and labels cannot drift out of step.
X_test_e8 = np.concatenate(e8_uninfected_scans + e8_ucbsv_scans)
n_rows_e8_uninfected = sum(len(scan) for scan in e8_uninfected_scans)
n_rows_e8_ucbsv = sum(len(scan) for scan in e8_ucbsv_scans)
y_test_e8 = np.concatenate([np.full(n_rows_e8_uninfected, 0), np.full(n_rows_e8_ucbsv, 1)])
assert len(X_test_e8) == len(y_test_e8), 'E8 test features and labels are misaligned'

# --- Result Storage ---
e11_holdout_accuracies = []
e8_external_accuracies = []
best_params_list = []

print(f'Starting {N_REPEATS} iterations of the experiment...')

Loaded E11 data: 6 uninfected, 6 infected.
Loaded E8 data: 18 uninfected, 18 infected.
Starting 100 iterations of the experiment...


In [8]:
# --- Main Loop ---
# Reset the accumulators here so this cell can be re-run on its own without
# appending to (or failing on) results left over from an earlier run.
e11_holdout_accuracies = []
e8_external_accuracies = []
best_params_list = []

# Seeded, dedicated generator: the hold-out draws are the same on every run
# and do not depend on the state of the global `random` module.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

for i in range(N_REPEATS):
    print(f'--- Iteration {i + 1}/{N_REPEATS} ---')
    # 1. Split E11 data: isolate 1 scan of each class for hold-out testing.
    #    The held-out scan is chosen by index, so the loaded lists are never
    #    reordered and the cell gives the same splits whenever it is re-run.
    held_uninfected = split_rng.randrange(len(e11_uninfected_scans))
    held_ucbsv = split_rng.randrange(len(e11_ucbsv_scans))
    train_uninfected = [scan for k, scan in enumerate(e11_uninfected_scans) if k != held_uninfected]
    train_ucbsv = [scan for k, scan in enumerate(e11_ucbsv_scans) if k != held_ucbsv]
    e11_test_scans = [e11_uninfected_scans[held_uninfected], e11_ucbsv_scans[held_ucbsv]]
    e11_train_val_scans = train_uninfected + train_ucbsv
    # 2. Prepare training/validation and E11-holdout sets.
    #    Labels are sized from the rows each scan contributes, not from an
    #    assumed fixed row count per scan.
    X_train_val = np.concatenate(e11_train_val_scans)
    y_train_val = np.concatenate([
        np.full(sum(len(scan) for scan in train_uninfected), 0),
        np.full(sum(len(scan) for scan in train_ucbsv), 1),
    ])
    # Hold-out arrays get their own names so they do not overwrite the
    # full-dataset X_test_e11 / y_test_e11 built for the first experiment.
    X_holdout_e11 = np.concatenate(e11_test_scans)
    y_holdout_e11 = np.concatenate([
        np.full(len(e11_test_scans[0]), 0),
        np.full(len(e11_test_scans[1]), 1),
    ])
    # 3. Scale data: Fit ONLY on the training/validation data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_holdout_e11_scaled = scaler.transform(X_holdout_e11)
    X_test_e8_scaled = scaler.transform(X_test_e8)
    # 4. Hyperparameter tuning using GridSearchCV on the 10-scan set
    # NOTE(review): StratifiedKFold splits the individual rows of
    # X_train_val_scaled, so rows that come from the same scan can land in both
    # the fitting and the validation part of a fold. If rows of one scan are
    # correlated this favours over-fitted settings; a group-aware splitter
    # keyed on the scan (e.g. StratifiedGroupKFold) would avoid that -- confirm
    # before changing, since it alters the tuning protocol.
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    svm = SVC()
    grid_search = GridSearchCV(svm, PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)
    best_params = grid_search.best_params_
    best_params_list.append(best_params)
    # 5. Final model: GridSearchCV (refit=True by default) has already refitted
    #    an SVC with the best parameters on the entire training/validation set,
    #    so reuse it instead of fitting an identical model a second time.
    final_model = grid_search.best_estimator_
    # 6. Test and record accuracies
    # Test on E11 hold-out set
    y_pred_e11 = final_model.predict(X_holdout_e11_scaled)
    e11_holdout_accuracies.append(accuracy_score(y_holdout_e11, y_pred_e11))
    # Test on external E8 set
    y_pred_e8 = final_model.predict(X_test_e8_scaled)
    e8_external_accuracies.append(accuracy_score(y_test_e8, y_pred_e8))

--- Iteration 1/100 ---
--- Iteration 2/100 ---
--- Iteration 3/100 ---
--- Iteration 4/100 ---
--- Iteration 5/100 ---
--- Iteration 6/100 ---
--- Iteration 7/100 ---
--- Iteration 8/100 ---
--- Iteration 9/100 ---
--- Iteration 10/100 ---
--- Iteration 11/100 ---
--- Iteration 12/100 ---
--- Iteration 13/100 ---
--- Iteration 14/100 ---
--- Iteration 15/100 ---
--- Iteration 16/100 ---
--- Iteration 17/100 ---
--- Iteration 18/100 ---
--- Iteration 19/100 ---
--- Iteration 20/100 ---
--- Iteration 21/100 ---
--- Iteration 22/100 ---
--- Iteration 23/100 ---
--- Iteration 24/100 ---
--- Iteration 25/100 ---
--- Iteration 26/100 ---
--- Iteration 27/100 ---
--- Iteration 28/100 ---
--- Iteration 29/100 ---
--- Iteration 30/100 ---
--- Iteration 31/100 ---
--- Iteration 32/100 ---
--- Iteration 33/100 ---
--- Iteration 34/100 ---
--- Iteration 35/100 ---
--- Iteration 36/100 ---
--- Iteration 37/100 ---
--- Iteration 38/100 ---
--- Iteration 39/100 ---
--- Iteration 40/100 ---
--- Itera

In [9]:
# --- Analyze and Report Final Results ---
# Summarise from arrays held under their own names. The accumulator lists are
# deliberately left as lists: rebinding them to ndarrays would make the main
# loop's `.append(...)` calls fail if that cell is run again afterwards.
e11_holdout_arr = np.asarray(e11_holdout_accuracies, dtype=float)
e8_external_arr = np.asarray(e8_external_accuracies, dtype=float)

# E11 Hold-out Results
mean_e11 = np.mean(e11_holdout_arr)
std_e11 = np.std(e11_holdout_arr)
print('--- Test on E11 Hold-out Set (2 leaves) ---')
print(f'Method: In each of {N_REPEATS} iterations, 1 uninfected and 1 infected leaf from E11 were held out for testing.')
print('The model was trained on the remaining 10 leaves from E11.')
print(f'Mean Accuracy: {mean_e11 * 100:.2f}%')
print(f'Standard Deviation: {std_e11 * 100:.2f}%')
print('-' * 40)

# E8 External Test Results
mean_e8 = np.mean(e8_external_arr)
std_e8 = np.std(e8_external_arr)
print('\n--- Test on External E8 Dataset (36 leaves) ---')
print(f'Method: The same {N_REPEATS} models trained on E11 were tested against the full, unseen E8 dataset.')
print(f'Mean Accuracy: {mean_e8 * 100:.2f}%')
print(f'Standard Deviation: {std_e8 * 100:.2f}%')
print('-' * 40)

# Hyperparameter Analysis
# One row per iteration; value_counts() tallies how often each
# (C, gamma, kernel) combination was selected.
print('\n--- Hyperparameter Selection ---')
params_df = pd.DataFrame(best_params_list)
print('Frequency of best parameters selected by GridSearchCV across all iterations:')
print(params_df.value_counts())

--- Test on E11 Hold-out Set (2 leaves) ---
Method: In each of 100 iterations, 1 uninfected and 1 infected leaf from E11 were held out for testing.
The model was trained on the remaining 10 leaves from E11.
Mean Accuracy: 91.11%
Standard Deviation: 9.88%
----------------------------------------

--- Test on External E8 Dataset (36 leaves) ---
Method: The same 100 models trained on E11 were tested against the full, unseen E8 dataset.
Mean Accuracy: 47.63%
Standard Deviation: 2.70%
----------------------------------------

--- Hyperparameter Selection ---
Frequency of best parameters selected by GridSearchCV across all iterations:
C    gamma  kernel
100  0.01   rbf       49
10   0.10   rbf       23
1    1.00   rbf       12
10   0.01   rbf        7
     1.00   rbf        4
100  0.10   rbf        4
1    0.10   rbf        1
Name: count, dtype: int64
