In [1]:
import scipy.io
import numpy as np
import os
import random
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
from collections import Counter

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Define Full, Explicit Dataset Paths ---
# E6 Paths (for Training)
E6_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/Uninfected/E6_TME204_28dpi_Jul_29_2020_Uninfected_11_06_23_29.mat'
E6_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/UCBSV/E6_TME204_28dpi_Jul_29_2020_UCBSV_11_06_23_24.mat'

# E4 Paths (for External Testing)
E4_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E4-TME204/E4_TME204_28dpi_Mar_5_2020/Uninfected/E4_TME204_28dpi_Mar_5_2020_Uninfected_11_06_23_18.mat'
E4_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E4-TME204/E4_TME204_28dpi_Mar_5_2020/UCBSV/E4_TME204_28dpi_Mar_5_2020_UCBSV_11_06_23_21.mat'

print("File paths defined.")

def load_scans_from_mat(file_path):
    """
    Loads all scans from a .mat file, returning a list of patch arrays.
    """
    mat_data = scipy.io.loadmat(file_path, squeeze_me=True)
    all_scans_struct = mat_data['Patch']
    
    if all_scans_struct.ndim == 0:
        all_scans_struct = np.array([all_scans_struct])
        
    scans_data = []
    for scan_struct in all_scans_struct:
        mean_vals = scan_struct['mean_values']
        if mean_vals.ndim == 0:
            scans_data.append(mean_vals.item())
        else:
            scans_data.append(mean_vals)
            
    return scans_data

print("Data loading function defined.")

File paths defined.
Data loading function defined.


In [3]:
# --- Configuration -- -
N_REPEATS = 100
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
# With a balanced pool of 34, we hold out 2, leaving 32 (16 of each class) for training.
# 16 is a good number of folds for StratifiedKFold.
CV_FOLDS = 16 

# --- Data Loading (once at the beginning) ---
# Training Data Pool (E6)
e6_uninfected_scans_full = load_scans_from_mat(E6_UNINFECTED_PATH)
e6_ucbsv_scans_full = load_scans_from_mat(E6_UCBSV_PATH)
print(f"Loaded E6 data: {len(e6_uninfected_scans_full)} uninfected scans, {len(e6_ucbsv_scans_full)} infected scans.")

# External Test Data (E4)
e4_uninfected_scans = load_scans_from_mat(E4_UNINFECTED_PATH)
e4_ucbsv_scans = load_scans_from_mat(E4_UCBSV_PATH)
print(f"Loaded E4 data: {len(e4_uninfected_scans)} uninfected scans, {len(e4_ucbsv_scans)} infected scans.")


# --- Prepare the full E4 test set (patches and true scan-level labels) ---
X_test_e4_patches = np.concatenate(e4_uninfected_scans + e4_ucbsv_scans)
# True labels at the SCAN level for E4
y_true_e4_scans = np.concatenate([
    np.full(len(e4_uninfected_scans), 0), # 0 for uninfected
    np.full(len(e4_ucbsv_scans), 1)       # 1 for infected
])


# --- Result Storage ---
e6_holdout_accuracies = []
e4_majority_vote_accuracies = []
best_params_list = []

print(f"\nStarting {N_REPEATS} iterations of the experiment (Train on E6, Test on E4)...")

# --- Main Loop ---
for i in range(N_REPEATS):
    print(f"--- Iteration {i + 1}/{N_REPEATS} ---")

    # 1. Create a balanced E6 training pool for this iteration (17 of each class)
    random.shuffle(e6_ucbsv_scans_full)
    e6_ucbsv_selected = e6_ucbsv_scans_full[:17]
    
    e6_training_pool_uninfected = list(e6_uninfected_scans_full)
    e6_training_pool_infected = list(e6_ucbsv_selected)
    
    # 2. Split the balanced pool: 2 for hold-out, 32 for train/validation
    random.shuffle(e6_training_pool_uninfected)
    random.shuffle(e6_training_pool_infected)
    
    e6_test_scans = [e6_training_pool_uninfected[0], e6_training_pool_infected[0]]
    e6_train_val_scans = e6_training_pool_uninfected[1:] + e6_training_pool_infected[1:] # 32 scans total

    # 3. Prepare patch-level data for training and E6 hold-out
    X_train_val = np.concatenate(e6_train_val_scans)
    y_train_val = np.concatenate([np.full(16 * 9, 0), np.full(16 * 9, 1)]) # 16 scans of each class
    
    X_test_e6_patches = np.concatenate(e6_test_scans)
    y_test_e6_patches = np.concatenate([np.full(9, 0), np.full(9, 1)])

    # 4. Scale data: Fit ONLY on the E6 training/validation data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_test_e6_scaled = scaler.transform(X_test_e6_patches)
    X_test_e4_scaled = scaler.transform(X_test_e4_patches)
    
    # 5. Hyperparameter tuning using GridSearchCV
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    grid_search = GridSearchCV(SVC(), PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)
    
    best_params = grid_search.best_params_
    best_params_list.append(best_params)
    
    # 6. Train the final model on the entire 32-scan set with best parameters
    final_model = SVC(**best_params).fit(X_train_val_scaled, y_train_val)
    
    # 7. Test and record accuracies
    # a) Test on E6 hold-out set (patch-level accuracy)
    y_pred_e6 = final_model.predict(X_test_e6_scaled)
    e6_holdout_accuracies.append(accuracy_score(y_test_e6_patches, y_pred_e6))
    
    # b) Test on external E4 set using MAJORITY VOTING
    y_pred_e4_patches = final_model.predict(X_test_e4_scaled)
    y_pred_e4_scans = []
    current_patch_index = 0
    for scan_group in (e4_uninfected_scans, e4_ucbsv_scans):
        for scan in scan_group:
            num_patches = len(scan)
            scan_patch_preds = y_pred_e4_patches[current_patch_index : current_patch_index + num_patches]
            current_patch_index += num_patches
            
            vote_counts = Counter(scan_patch_preds)
            majority_vote = vote_counts.most_common(1)[0][0]
            y_pred_e4_scans.append(majority_vote)
            
    e4_majority_vote_accuracies.append(accuracy_score(y_true_e4_scans, y_pred_e4_scans))


print("\n--- Experiment Complete ---")

Loaded E6 data: 17 uninfected scans, 18 infected scans.
Loaded E4 data: 24 uninfected scans, 12 infected scans.

Starting 100 iterations of the experiment (Train on E6, Test on E4)...
--- Iteration 1/100 ---
--- Iteration 2/100 ---
--- Iteration 3/100 ---
--- Iteration 4/100 ---
--- Iteration 5/100 ---
--- Iteration 6/100 ---
--- Iteration 7/100 ---
--- Iteration 8/100 ---
--- Iteration 9/100 ---
--- Iteration 10/100 ---
--- Iteration 11/100 ---
--- Iteration 12/100 ---
--- Iteration 13/100 ---
--- Iteration 14/100 ---
--- Iteration 15/100 ---
--- Iteration 16/100 ---
--- Iteration 17/100 ---
--- Iteration 18/100 ---
--- Iteration 19/100 ---
--- Iteration 20/100 ---
--- Iteration 21/100 ---
--- Iteration 22/100 ---
--- Iteration 23/100 ---
--- Iteration 24/100 ---
--- Iteration 25/100 ---
--- Iteration 26/100 ---
--- Iteration 27/100 ---
--- Iteration 28/100 ---
--- Iteration 29/100 ---
--- Iteration 30/100 ---
--- Iteration 31/100 ---
--- Iteration 32/100 ---
--- Iteration 33/100 ---


In [5]:
# --- Analyze and Report Final Results ---
e6_holdout_accuracies_np = np.array(e6_holdout_accuracies)
e4_majority_vote_accuracies_np = np.array(e4_majority_vote_accuracies)

# E6 Hold-out Test Results
mean_e6 = np.mean(e6_holdout_accuracies_np)
std_e6 = np.std(e6_holdout_accuracies_np)
print("\n--- Test on E6 Hold-out Set (2 leaves, Patch-Level Accuracy) ---")
print(f"Method: In each of {N_REPEATS} iterations, 1 uninfected and 1 infected leaf from a balanced E6 pool were held out.")
print(f"The model was trained on the remaining 32 leaves from E6.")
print(f"Mean Accuracy: {mean_e6 * 100:.2f}%")
print(f"Standard Deviation: {std_e6 * 100:.2f}%")
print("-" * 60)

# E4 External Test Results (Majority Vote)
mean_e4 = np.mean(e4_majority_vote_accuracies_np)
std_e4 = np.std(e4_majority_vote_accuracies_np)
print("\n--- Test on External E4 Dataset (Scan-Level, Majority-Vote Accuracy) ---")
print(f"Method: The same {N_REPEATS} models trained on E6 were tested against the full, unseen E4 dataset.")
print("For each scan in E4, the class was predicted based on the majority vote of its 9 patches.")
print(f"Mean Scan-Level Accuracy: {mean_e4 * 100:.2f}%")
print(f"Standard Deviation: {std_e4 * 100:.2f}%")
print("-" * 60)

# Hyperparameter Analysis
print("\n--- Hyperparameter Selection Analysis ---")
params_df = pd.DataFrame(best_params_list)
param_counts = params_df.groupby(['C', 'gamma', 'kernel']).size().reset_index(name='counts')
print(f"Frequency of best parameters selected by GridSearchCV across {N_REPEATS} iterations:")
print(param_counts.sort_values(by='counts', ascending=False))


--- Test on E6 Hold-out Set (2 leaves, Patch-Level Accuracy) ---
Method: In each of 100 iterations, 1 uninfected and 1 infected leaf from a balanced E6 pool were held out.
The model was trained on the remaining 32 leaves from E6.
Mean Accuracy: 92.67%
Standard Deviation: 11.38%
------------------------------------------------------------

--- Test on External E4 Dataset (Scan-Level, Majority-Vote Accuracy) ---
Method: The same 100 models trained on E6 were tested against the full, unseen E4 dataset.
For each scan in E4, the class was predicted based on the majority vote of its 9 patches.
Mean Scan-Level Accuracy: 37.19%
Standard Deviation: 10.80%
------------------------------------------------------------

--- Hyperparameter Selection Analysis ---
Frequency of best parameters selected by GridSearchCV across 100 iterations:
     C  gamma kernel  counts
1  100   0.01    rbf      84
0   10   0.10    rbf       8
2  100   0.10    rbf       8


In [6]:
# --- Define Full, Explicit Dataset Paths ---
# E4 Paths (for Training)
E4_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E4-TME204/E4_TME204_28dpi_Mar_5_2020/Uninfected/E4_TME204_28dpi_Mar_5_2020_Uninfected_11_06_23_18.mat'
E4_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E4-TME204/E4_TME204_28dpi_Mar_5_2020/UCBSV/E4_TME204_28dpi_Mar_5_2020_UCBSV_11_06_23_21.mat'

# E6 Paths (for External Testing)
E6_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/Uninfected/E6_TME204_28dpi_Jul_29_2020_Uninfected_11_06_23_29.mat'
E6_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/UCBSV/E6_TME204_28dpi_Jul_29_2020_UCBSV_11_06_23_24.mat'


print("File paths defined.")

def load_scans_from_mat(file_path):
    """
    Loads all scans from a .mat file, returning a list of patch arrays.
    """
    mat_data = scipy.io.loadmat(file_path, squeeze_me=True)
    all_scans_struct = mat_data['Patch']
    
    if all_scans_struct.ndim == 0:
        all_scans_struct = np.array([all_scans_struct])
        
    scans_data = []
    for scan_struct in all_scans_struct:
        mean_vals = scan_struct['mean_values']
        if mean_vals.ndim == 0:
            scans_data.append(mean_vals.item())
        else:
            scans_data.append(mean_vals)
            
    return scans_data

print("Data loading function defined.")

File paths defined.
Data loading function defined.


In [7]:
# --- Configuration ---
N_REPEATS = 100
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
# With a balanced pool of 24, we hold out 2, leaving 22 (11 of each class) for training.
# 11 is a good number of folds for StratifiedKFold.
CV_FOLDS = 11

# --- Data Loading (once at the beginning) ---
# Training Data Pool (E4)
e4_uninfected_scans_full = load_scans_from_mat(E4_UNINFECTED_PATH) # 24 scans
e4_ucbsv_scans_full = load_scans_from_mat(E4_UCBSV_PATH)       # 12 scans
print(f"Loaded E4 data: {len(e4_uninfected_scans_full)} uninfected scans, {len(e4_ucbsv_scans_full)} infected scans.")

# External Test Data (E6)
e6_uninfected_scans = load_scans_from_mat(E6_UNINFECTED_PATH)
e6_ucbsv_scans = load_scans_from_mat(E6_UCBSV_PATH)
print(f"Loaded E6 data: {len(e6_uninfected_scans)} uninfected scans, {len(e6_ucbsv_scans)} infected scans.")


# --- Prepare the full E6 test set (patches and true scan-level labels) ---
X_test_e6_patches = np.concatenate(e6_uninfected_scans + e6_ucbsv_scans)
# True labels at the SCAN level for E6
y_true_e6_scans = np.concatenate([
    np.full(len(e6_uninfected_scans), 0), # 0 for uninfected
    np.full(len(e6_ucbsv_scans), 1)       # 1 for infected
])


# --- Result Storage ---
e4_holdout_accuracies = []
e6_majority_vote_accuracies = []
best_params_list = []

print(f"\nStarting {N_REPEATS} iterations of the experiment (Train on E4, Test on E6)...")

# --- Main Loop ---
for i in range(N_REPEATS):
    print(f"--- Iteration {i + 1}/{N_REPEATS} ---")

    # 1. Create a balanced E4 training pool for this iteration (12 of each class)
    random.shuffle(e4_uninfected_scans_full)
    e4_uninfected_selected = e4_uninfected_scans_full[:12]
    
    e4_training_pool_uninfected = list(e4_uninfected_selected)
    e4_training_pool_infected = list(e4_ucbsv_scans_full)
    
    # 2. Split the balanced pool: 2 for hold-out, 22 for train/validation
    random.shuffle(e4_training_pool_uninfected)
    random.shuffle(e4_training_pool_infected)
    
    e4_test_scans = [e4_training_pool_uninfected[0], e4_training_pool_infected[0]]
    e4_train_val_scans = e4_training_pool_uninfected[1:] + e4_training_pool_infected[1:] # 22 scans total

    # 3. Prepare patch-level data for training and E4 hold-out
    X_train_val = np.concatenate(e4_train_val_scans)
    y_train_val = np.concatenate([np.full(11 * 9, 0), np.full(11 * 9, 1)]) # 11 scans of each class
    
    X_test_e4_patches = np.concatenate(e4_test_scans)
    y_test_e4_patches = np.concatenate([np.full(9, 0), np.full(9, 1)])

    # 4. Scale data: Fit ONLY on the E4 training/validation data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_test_e4_scaled = scaler.transform(X_test_e4_patches)
    X_test_e6_scaled = scaler.transform(X_test_e6_patches)
    
    # 5. Hyperparameter tuning using GridSearchCV
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    grid_search = GridSearchCV(SVC(), PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)
    
    best_params = grid_search.best_params_
    best_params_list.append(best_params)
    
    # 6. Train the final model on the entire 22-scan set with best parameters
    final_model = SVC(**best_params).fit(X_train_val_scaled, y_train_val)
    
    # 7. Test and record accuracies
    # a) Test on E4 hold-out set (patch-level accuracy)
    y_pred_e4 = final_model.predict(X_test_e4_scaled)
    e4_holdout_accuracies.append(accuracy_score(y_test_e4_patches, y_pred_e4))
    
    # b) Test on external E6 set using MAJORITY VOTING
    y_pred_e6_patches = final_model.predict(X_test_e6_scaled)
    y_pred_e6_scans = []
    current_patch_index = 0
    for scan_group in (e6_uninfected_scans, e6_ucbsv_scans):
        for scan in scan_group:
            num_patches = len(scan)
            scan_patch_preds = y_pred_e6_patches[current_patch_index : current_patch_index + num_patches]
            current_patch_index += num_patches
            
            vote_counts = Counter(scan_patch_preds)
            majority_vote = vote_counts.most_common(1)[0][0]
            y_pred_e6_scans.append(majority_vote)
            
    e6_majority_vote_accuracies.append(accuracy_score(y_true_e6_scans, y_pred_e6_scans))


print("\n--- Experiment Complete ---")

Loaded E4 data: 24 uninfected scans, 12 infected scans.
Loaded E6 data: 17 uninfected scans, 18 infected scans.

Starting 100 iterations of the experiment (Train on E4, Test on E6)...
--- Iteration 1/100 ---
--- Iteration 2/100 ---
--- Iteration 3/100 ---
--- Iteration 4/100 ---
--- Iteration 5/100 ---
--- Iteration 6/100 ---
--- Iteration 7/100 ---
--- Iteration 8/100 ---
--- Iteration 9/100 ---
--- Iteration 10/100 ---
--- Iteration 11/100 ---
--- Iteration 12/100 ---
--- Iteration 13/100 ---
--- Iteration 14/100 ---
--- Iteration 15/100 ---
--- Iteration 16/100 ---
--- Iteration 17/100 ---
--- Iteration 18/100 ---
--- Iteration 19/100 ---
--- Iteration 20/100 ---
--- Iteration 21/100 ---
--- Iteration 22/100 ---
--- Iteration 23/100 ---
--- Iteration 24/100 ---
--- Iteration 25/100 ---
--- Iteration 26/100 ---
--- Iteration 27/100 ---
--- Iteration 28/100 ---
--- Iteration 29/100 ---
--- Iteration 30/100 ---
--- Iteration 31/100 ---
--- Iteration 32/100 ---
--- Iteration 33/100 ---


In [8]:
# --- Analyze and Report Final Results ---
e4_holdout_accuracies_np = np.array(e4_holdout_accuracies)
e6_majority_vote_accuracies_np = np.array(e6_majority_vote_accuracies)

# E4 Hold-out Test Results
mean_e4 = np.mean(e4_holdout_accuracies_np)
std_e4 = np.std(e4_holdout_accuracies_np)
print("\n--- Test on E4 Hold-out Set (2 leaves, Patch-Level Accuracy) ---")
print(f"Method: In each of {N_REPEATS} iterations, 1 uninfected and 1 infected leaf from a balanced E4 pool were held out.")
print(f"The model was trained on the remaining 22 leaves from E4.")
print(f"Mean Accuracy: {mean_e4 * 100:.2f}%")
print(f"Standard Deviation: {std_e4 * 100:.2f}%")
print("-" * 60)

# E6 External Test Results (Majority Vote)
mean_e6 = np.mean(e6_majority_vote_accuracies_np)
std_e6 = np.std(e6_majority_vote_accuracies_np)
print("\n--- Test on External E6 Dataset (Scan-Level, Majority-Vote Accuracy) ---")
print(f"Method: The same {N_REPEATS} models trained on E4 were tested against the full, unseen E6 dataset.")
print("For each scan in E6, the class was predicted based on the majority vote of its 9 patches.")
print(f"Mean Scan-Level Accuracy: {mean_e6 * 100:.2f}%")
print(f"Standard Deviation: {std_e6 * 100:.2f}%")
print("-" * 60)

# Hyperparameter Analysis
print("\n--- Hyperparameter Selection Analysis ---")
params_df = pd.DataFrame(best_params_list)
param_counts = params_df.groupby(['C', 'gamma', 'kernel']).size().reset_index(name='counts')
print(f"Frequency of best parameters selected by GridSearchCV across {N_REPEATS} iterations:")
print(param_counts.sort_values(by='counts', ascending=False))


--- Test on E4 Hold-out Set (2 leaves, Patch-Level Accuracy) ---
Method: In each of 100 iterations, 1 uninfected and 1 infected leaf from a balanced E4 pool were held out.
The model was trained on the remaining 22 leaves from E4.
Mean Accuracy: 64.44%
Standard Deviation: 20.85%
------------------------------------------------------------

--- Test on External E6 Dataset (Scan-Level, Majority-Vote Accuracy) ---
Method: The same 100 models trained on E4 were tested against the full, unseen E6 dataset.
For each scan in E6, the class was predicted based on the majority vote of its 9 patches.
Mean Scan-Level Accuracy: 54.43%
Standard Deviation: 5.47%
------------------------------------------------------------

--- Hyperparameter Selection Analysis ---
Frequency of best parameters selected by GridSearchCV across 100 iterations:
     C  gamma kernel  counts
0  100   0.01    rbf      65
1  100   0.10    rbf      35
