In [1]:
import scipy.io
import numpy as np
import os
import random
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
from collections import Counter

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Define Full, Explicit Dataset Paths ---
# E6 Paths (for Training)
E6_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/Uninfected/E6_TME204_28dpi_Jul_29_2020_Uninfected_11_06_23_29.mat'
E6_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/UCBSV/E6_TME204_28dpi_Jul_29_2020_UCBSV_11_06_23_24.mat'

# E8 Paths (for External Testing)
E8_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/Uninfected/E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat'
E8_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'

print("File paths defined.")

def load_scans_from_mat(file_path):
    """
    Loads all scans from a .mat file, returning a list of patch arrays.
    """
    mat_data = scipy.io.loadmat(file_path, squeeze_me=True)
    all_scans_struct = mat_data['Patch']
    
    if all_scans_struct.ndim == 0:
        all_scans_struct = np.array([all_scans_struct])
        
    scans_data = []
    for scan_struct in all_scans_struct:
        mean_vals = scan_struct['mean_values']
        if mean_vals.ndim == 0:
            scans_data.append(mean_vals.item())
        else:
            scans_data.append(mean_vals)
            
    return scans_data

print("Data loading function defined.")

File paths defined.
Data loading function defined.


In [3]:
# --- Configuration ---
N_REPEATS = 100
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
# E6 has 18 uninfected and 18 infected scans. Hold-out is 2, so 34 for training. 
# 17 folds is a good choice for balanced 2-sample validation splits.
CV_FOLDS = 17

# --- Data Loading (once at the beginning) ---
# Training Data (E6)
e6_uninfected_scans = load_scans_from_mat(E6_UNINFECTED_PATH)
e6_ucbsv_scans = load_scans_from_mat(E6_UCBSV_PATH)
print(f"Loaded E6 data: {len(e6_uninfected_scans)} uninfected scans, {len(e6_ucbsv_scans)} infected scans.")

# External Test Data (E8)
e8_uninfected_scans = load_scans_from_mat(E8_UNINFECTED_PATH)
e8_ucbsv_scans = load_scans_from_mat(E8_UCBSV_PATH)
print(f"Loaded E8 data: {len(e8_uninfected_scans)} uninfected scans, {len(e8_ucbsv_scans)} infected scans.")


# --- Prepare the full E8 test set (patches and true scan-level labels) ---
X_test_e8_patches = np.concatenate(e8_uninfected_scans + e8_ucbsv_scans)
# True labels at the SCAN level for E8
y_true_e8_scans = np.concatenate([
    np.full(len(e8_uninfected_scans), 0), # 0 for uninfected
    np.full(len(e8_ucbsv_scans), 1)       # 1 for infected
])


# --- Result Storage ---
e6_holdout_accuracies = []
e8_majority_vote_accuracies = []
best_params_list = []

print(f"\nStarting {N_REPEATS} iterations of the experiment (Train on E6, Test on E8)...")

# --- Main Loop ---
for i in range(N_REPEATS):
    print(f"--- Iteration {i + 1}/{N_REPEATS} ---")

    # 1. Split E6 data: Isolate 1 of each class for hold-out testing
    random.shuffle(e6_uninfected_scans)
    random.shuffle(e6_ucbsv_scans)
    
    e6_test_scans = [e6_uninfected_scans[0], e6_ucbsv_scans[0]]
    e6_train_val_scans = e6_uninfected_scans[1:] + e6_ucbsv_scans[1:] # 34 scans total
    
    # 2. Prepare training/validation and E6-holdout sets (patch level)
    X_train_val = np.concatenate(e6_train_val_scans)
    y_train_val = np.concatenate([np.full(len(e6_uninfected_scans[1:]) * 9, 0), np.full(len(e6_ucbsv_scans[1:]) * 9, 1)])
    
    X_test_e6_patches = np.concatenate(e6_test_scans)
    y_test_e6_patches = np.concatenate([np.full(9, 0), np.full(9, 1)])

    # 3. Scale data: Fit ONLY on the E6 training/validation data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_test_e6_scaled = scaler.transform(X_test_e6_patches)
    X_test_e8_scaled = scaler.transform(X_test_e8_patches)
    
    # 4. Hyperparameter tuning using GridSearchCV on the 34-scan E6 set
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    grid_search = GridSearchCV(SVC(), PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)
    
    best_params = grid_search.best_params_
    best_params_list.append(best_params)
    
    # 5. Train the final model on the entire 34-scan E6 set with best parameters
    final_model = SVC(**best_params).fit(X_train_val_scaled, y_train_val)
    
    # 6. Test and record accuracies
    # a) Test on E6 hold-out set (patch-level accuracy)
    y_pred_e6 = final_model.predict(X_test_e6_scaled)
    e6_holdout_accuracies.append(accuracy_score(y_test_e6_patches, y_pred_e6))
    
    # b) Test on external E8 set using MAJORITY VOTING
    y_pred_e8_patches = final_model.predict(X_test_e8_scaled)
    
    y_pred_e8_scans = []
    current_patch_index = 0
    # Iterate through the original E8 scan groups to apply majority vote
    for scan_group in (e8_uninfected_scans, e8_ucbsv_scans):
        for _ in scan_group: # The variable 'scan' is not used, so '_' is conventional
            # Assuming each scan has 9 patches
            num_patches = 9 
            scan_patch_preds = y_pred_e8_patches[current_patch_index : current_patch_index + num_patches]
            current_patch_index += num_patches
            
            # Perform majority voting
            vote_counts = Counter(scan_patch_preds)
            majority_vote = vote_counts.most_common(1)[0][0]
            y_pred_e8_scans.append(majority_vote)
            
    # Calculate accuracy at the scan level for E8
    e8_majority_vote_accuracies.append(accuracy_score(y_true_e8_scans, y_pred_e8_scans))


print("\n--- Experiment Complete ---")

Loaded E6 data: 17 uninfected scans, 18 infected scans.
Loaded E8 data: 18 uninfected scans, 18 infected scans.

Starting 100 iterations of the experiment (Train on E6, Test on E8)...
--- Iteration 1/100 ---
--- Iteration 2/100 ---
--- Iteration 3/100 ---
--- Iteration 4/100 ---
--- Iteration 5/100 ---
--- Iteration 6/100 ---
--- Iteration 7/100 ---
--- Iteration 8/100 ---
--- Iteration 9/100 ---
--- Iteration 10/100 ---
--- Iteration 11/100 ---
--- Iteration 12/100 ---
--- Iteration 13/100 ---
--- Iteration 14/100 ---
--- Iteration 15/100 ---
--- Iteration 16/100 ---
--- Iteration 17/100 ---
--- Iteration 18/100 ---
--- Iteration 19/100 ---
--- Iteration 20/100 ---
--- Iteration 21/100 ---
--- Iteration 22/100 ---
--- Iteration 23/100 ---
--- Iteration 24/100 ---
--- Iteration 25/100 ---
--- Iteration 26/100 ---
--- Iteration 27/100 ---
--- Iteration 28/100 ---
--- Iteration 29/100 ---
--- Iteration 30/100 ---
--- Iteration 31/100 ---
--- Iteration 32/100 ---
--- Iteration 33/100 ---


In [4]:
# --- Analyze and Report Final Results ---
e6_holdout_accuracies_np = np.array(e6_holdout_accuracies)
e8_majority_vote_accuracies_np = np.array(e8_majority_vote_accuracies)

# E6 Hold-out Test Results
mean_e6 = np.mean(e6_holdout_accuracies_np)
std_e6 = np.std(e6_holdout_accuracies_np)
print("\n--- Test on E6 Hold-out Set (2 leaves, Patch-Level Accuracy) ---")
print(f"Method: In each of {N_REPEATS} iterations, 1 uninfected and 1 infected leaf from E6 were held out.")
print(f"The model was trained on the remaining 34 leaves from E6.")
print(f"Mean Accuracy: {mean_e6 * 100:.2f}%")
print(f"Standard Deviation: {std_e6 * 100:.2f}%")
print("-" * 60)

# E8 External Test Results (Majority Vote)
mean_e8 = np.mean(e8_majority_vote_accuracies_np)
std_e8 = np.std(e8_majority_vote_accuracies_np)
print("\n--- Test on External E8 Dataset (Scan-Level, Majority-Vote Accuracy) ---")
print(f"Method: The same {N_REPEATS} models trained on E6 were tested against the full, unseen E8 dataset.")
print("For each scan in E8, the class was predicted based on the majority vote of its 9 patches.")
print(f"Mean Scan-Level Accuracy: {mean_e8 * 100:.2f}%")
print(f"Standard Deviation: {std_e8 * 100:.2f}%")
print("-" * 60)

# Hyperparameter Analysis
print("\n--- Hyperparameter Selection Analysis ---")
params_df = pd.DataFrame(best_params_list)
param_counts = params_df.groupby(['C', 'gamma', 'kernel']).size().reset_index(name='counts')
print(f"Frequency of best parameters selected by GridSearchCV across {N_REPEATS} iterations:")
print(param_counts.sort_values(by='counts', ascending=False))


--- Test on E6 Hold-out Set (2 leaves, Patch-Level Accuracy) ---
Method: In each of 100 iterations, 1 uninfected and 1 infected leaf from E6 were held out.
The model was trained on the remaining 34 leaves from E6.
Mean Accuracy: 93.33%
Standard Deviation: 10.06%
------------------------------------------------------------

--- Test on External E8 Dataset (Scan-Level, Majority-Vote Accuracy) ---
Method: The same 100 models trained on E6 were tested against the full, unseen E8 dataset.
For each scan in E8, the class was predicted based on the majority vote of its 9 patches.
Mean Scan-Level Accuracy: 82.64%
Standard Deviation: 3.05%
------------------------------------------------------------

--- Hyperparameter Selection Analysis ---
Frequency of best parameters selected by GridSearchCV across 100 iterations:
     C  gamma kernel  counts
1  100   0.01    rbf      80
2  100   0.10    rbf      11
0   10   0.10    rbf       9
