# Expanding the experiments to cross-trial evaluation: we start by training on E8 and testing on E6.

In [None]:
import scipy.io
import numpy as np
import os
import random
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pandas as pd

# Suppress warnings for cleaner notebook output, especially the ones sklearn
# emits about failed folds.
# NOTE(review): calling filterwarnings('ignore') with no category or module
# filter hides every warning, including ones that could point at a real
# problem — consider narrowing it.
warnings.filterwarnings('ignore')

# Confirmation message so the cell output shows the imports completed.
print("Libraries imported successfully.")

In [None]:
# --- Define Full, Explicit Dataset Paths ---
# Each constant is a relative path to a single .mat file. The constant names
# pair a trial (E8 or E6) with a class (Uninfected or UCBSV); the E8 files are
# used for training/validation and the E6 files for external testing below.
E8_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/Uninfected/E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat'
E8_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'
E6_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/Uninfected/E6_TME204_28dpi_Jul_29_2020_Uninfected_11_06_23_29.mat'
E6_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/UCBSV/E6_TME204_28dpi_Jul_29_2020_UCBSV_11_06_23_24.mat'
# Confirmation message so the cell output shows the constants were defined.
print("File paths defined.")

def load_scans_from_mat(file_path, label):
    """
    Read every leaf scan stored in a .mat file, one entry per scan.

    The file must contain a 'Patch' struct array whose elements each carry a
    'mean_values' field.

    Args:
        file_path (str): Location of the .mat file to read.
        label (int): Class label of the file (0 or 1). Accepted but not used
            by this function; labels are built by the caller.

    Returns:
        list: One item per scan holding that scan's 'mean_values'
              (unwrapped with .item() when it is stored as a 0-d array).
    """
    patch_structs = scipy.io.loadmat(file_path, squeeze_me=True)['Patch']

    # squeeze_me=True collapses a single-scan file to a 0-d array; wrap it so
    # the iteration below still sees one element.
    if patch_structs.ndim == 0:
        patch_structs = np.array([patch_structs])

    per_scan_values = (entry['mean_values'] for entry in patch_structs)

    # "Boxed" 0-d values are unwrapped; regular arrays pass through untouched.
    return [
        values.item() if values.ndim == 0 else values
        for values in per_scan_values
    ]


print("Data loading function defined.")

In [None]:
# --- Main Experiment: Monte Carlo Cross-Validation ---
#
# Each repeat holds out one uninfected and one UCBSV scan from E8, tunes an
# RBF SVM on the remaining E8 scans with GridSearchCV, and scores the tuned
# model on both the E8 hold-out scans and the full E6 dataset.

# --- Configuration ---
N_REPEATS = 100  # Number of times to repeat the experiment
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
CV_FOLDS = 20 # Number of folds for GridSearchCV as requested
RANDOM_SEED = 0  # Seeds the hold-out shuffling so repeated runs give the same splits


def _patch_labels(scans, label):
    """Return one `label` per patch row across `scans`.

    The count is taken from the scans themselves, so the labels always line
    up with np.concatenate(scans) instead of assuming a fixed number of
    patches per scan. Assumes each scan is a 2-D (patches x features)
    array — TODO confirm against the .mat files.
    """
    return np.full(sum(len(scan) for scan in scans), label)


# --- Data Loading (once at the beginning) ---
e8_uninfected_scans = load_scans_from_mat(E8_UNINFECTED_PATH, 0)
e8_ucbsv_scans = load_scans_from_mat(E8_UCBSV_PATH, 1)

e6_uninfected_scans = load_scans_from_mat(E6_UNINFECTED_PATH, 0)
e6_ucbsv_scans = load_scans_from_mat(E6_UCBSV_PATH, 1)

# Prepare the full E6 test set (it remains the same for all repeats)
X_test_e6 = np.concatenate(e6_uninfected_scans + e6_ucbsv_scans)
y_test_e6 = np.concatenate([_patch_labels(e6_uninfected_scans, 0), _patch_labels(e6_ucbsv_scans, 1)])

# --- Result Storage ---
e8_holdout_accuracies = []
e6_external_accuracies = []
best_params_list = []

# Dedicated, seeded generator: keeps the hold-out selection reproducible
# without touching the global `random` state.
rng = random.Random(RANDOM_SEED)

print(f"Starting {N_REPEATS} iterations of the experiment...")

# --- Main Loop ---
for i in range(N_REPEATS):
    print(f"--- Iteration {i + 1}/{N_REPEATS} ---")

    # 1. Split E8 data: Isolate 1 of each class for hold-out testing.
    #    The lists are shuffled in place, so index 0 is a random scan.
    rng.shuffle(e8_uninfected_scans)
    rng.shuffle(e8_ucbsv_scans)

    e8_test_scans = [e8_uninfected_scans[0], e8_ucbsv_scans[0]]
    e8_train_val_scans = e8_uninfected_scans[1:] + e8_ucbsv_scans[1:]

    # 2. Prepare training/validation and E8-holdout sets
    X_train_val = np.concatenate(e8_train_val_scans)
    y_train_val = np.concatenate([
        _patch_labels(e8_uninfected_scans[1:], 0),
        _patch_labels(e8_ucbsv_scans[1:], 1),
    ])

    X_test_e8 = np.concatenate(e8_test_scans)
    y_test_e8 = np.concatenate([
        _patch_labels(e8_uninfected_scans[:1], 0),
        _patch_labels(e8_ucbsv_scans[:1], 1),
    ])

    # 3. Scale data: Fit ONLY on the training/validation data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_test_e8_scaled = scaler.transform(X_test_e8)
    X_test_e6_scaled = scaler.transform(X_test_e6)

    # 4. Hyperparameter tuning using GridSearchCV on the training/validation set.
    # NOTE(review): the folds are drawn over individual patches, so patches of
    # the same scan can land in both the fit and the validation part of a fold.
    # That can make the CV scores optimistic; a group-aware splitter keyed on
    # the scan would avoid it.
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    svm = SVC()
    grid_search = GridSearchCV(svm, PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)

    best_params = grid_search.best_params_
    best_params_list.append(best_params)

    # 5. Final model: GridSearchCV refits the best parameter combination on
    #    the whole training/validation set (refit=True is the default), so the
    #    refitted estimator is reused instead of training an identical SVC again.
    final_model = grid_search.best_estimator_

    # 6. Test and record accuracies
    # Test on E8 hold-out set
    y_pred_e8 = final_model.predict(X_test_e8_scaled)
    e8_holdout_accuracies.append(accuracy_score(y_test_e8, y_pred_e8))

    # Test on external E6 set
    y_pred_e6 = final_model.predict(X_test_e6_scaled)
    e6_external_accuracies.append(accuracy_score(y_test_e6, y_pred_e6))

print("\n--- Experiment Complete ---")

In [None]:
# --- Analyze and Report Final Results ---
#
# Summarises the per-iteration accuracies collected by the experiment loop
# (mean and population standard deviation) and tabulates how often each
# hyperparameter combination was selected by GridSearchCV.

# Convert results to NumPy arrays for easy calculation
e8_holdout_accuracies = np.array(e8_holdout_accuracies)
e6_external_accuracies = np.array(e6_external_accuracies)

# --- E8 Hold-out Test Results ---
mean_e8 = np.mean(e8_holdout_accuracies)
std_e8 = np.std(e8_holdout_accuracies)
print("--- Test on E8 Hold-out Set (2 leaves) ---")
print(f"Method: In each of {N_REPEATS} iterations, 1 uninfected and 1 infected leaf from E8 were held out for testing.")
print(f"The model was trained on the remaining 34 leaves from E8 after hyperparameter tuning.")
print(f"Mean Accuracy: {mean_e8 * 100:.2f}%")
print(f"Standard Deviation: {std_e8 * 100:.2f}%")
print("-" * 50)

# --- E6 External Test Results ---
mean_e6 = np.mean(e6_external_accuracies)
std_e6 = np.std(e6_external_accuracies)
print("\n--- Test on External E6 Dataset (36 leaves) ---")
print(f"Method: The same {N_REPEATS} models trained on E8 were tested against the full, unseen E6 dataset.")
print(f"Mean Accuracy: {mean_e6 * 100:.2f}%")
print(f"Standard Deviation: {std_e6 * 100:.2f}%")
print("-" * 50)

# --- Hyperparameter Analysis ---
print("\n--- Hyperparameter Selection Analysis ---")
params_df = pd.DataFrame(best_params_list)

# Group by the columns the DataFrame actually has. best_params_ uses the same
# keys as the parameter grid passed to GridSearchCV; a bare SVC (no Pipeline)
# yields plain names such as 'C', not 'svc__'-prefixed ones, so hard-coding
# prefixed names would raise a KeyError here.
param_columns = list(params_df.columns)

if param_columns:
    param_counts = params_df.groupby(param_columns).size().reset_index(name='counts')

    print(f"Frequency of best parameters selected by GridSearchCV across {N_REPEATS} iterations:")
    print(param_counts.sort_values(by='counts', ascending=False))
else:
    # Nothing was recorded (e.g. the experiment loop ran zero iterations).
    print("No hyperparameter selections were recorded.")