In [1]:
import scipy.io
import numpy as np
import os
import random
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
from collections import Counter

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Define Full, Explicit Dataset Paths ---
# E8 Paths
E8_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/Uninfected/E8_TME204_28dpi_Dec_16_2020_Uninfected_11_06_23_33.mat'
E8_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E8-TME204/E8_TME204_28dpi_Dec_16_2020/UCBSV/E8_TME204_28dpi_Dec_16_2020_UCBSV_11_06_23_36.mat'

# E6 Paths
E6_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/Uninfected/E6_TME204_28dpi_Jul_29_2020_Uninfected_11_06_23_29.mat'
E6_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E6-TME204/E6_TME204_28dpi_Jul_29_2020/UCBSV/E6_TME204_28dpi_Jul_29_2020_UCBSV_11_06_23_24.mat'

# E4 Paths
E4_UNINFECTED_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E4-TME204/E4_TME204_28dpi_Mar_5_2020/Uninfected/E4_TME204_28dpi_Mar_5_2020_Uninfected_11_06_23_18.mat'
E4_UCBSV_PATH = 'data/TME204-Patch_E4_E6_E8_28DPI_Dataset/E4-TME204/E4_TME204_28dpi_Mar_5_2020/UCBSV/E4_TME204_28dpi_Mar_5_2020_UCBSV_11_06_23_21.mat'

print("File paths defined.")

def load_scans_from_mat(file_path):
    mat_data = scipy.io.loadmat(file_path, squeeze_me=True)
    all_scans_struct = mat_data['Patch']
    if all_scans_struct.ndim == 0: all_scans_struct = np.array([all_scans_struct])
    return [s['mean_values'].item() if s['mean_values'].ndim == 0 else s['mean_values'] for s in all_scans_struct]

print("Data loading function defined.")

File paths defined.
Data loading function defined.


In [3]:
# --- Configuration ---
N_REPEATS = 100
PARAM_GRID = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
# Training pool is 70, hold-out is 2, leaving 68 for training (34 of each). 17 is a good number of folds.
CV_FOLDS = 17

# --- Data Loading ---
# E8 Data
e8_uninfected_scans = load_scans_from_mat(E8_UNINFECTED_PATH)
e8_ucbsv_scans = load_scans_from_mat(E8_UCBSV_PATH)
# E6 Data
e6_uninfected_scans = load_scans_from_mat(E6_UNINFECTED_PATH)
e6_ucbsv_scans = load_scans_from_mat(E6_UCBSV_PATH)
# E4 Data
e4_uninfected_scans = load_scans_from_mat(E4_UNINFECTED_PATH)
e4_ucbsv_scans = load_scans_from_mat(E4_UCBSV_PATH)

# --- Combine E8 and E6 for the training pool ---
combined_uninfected_scans = e8_uninfected_scans + e6_uninfected_scans
combined_infected_scans_full = e8_ucbsv_scans + e6_ucbsv_scans
print(f"Loaded combined training data: {len(combined_uninfected_scans)} uninfected, {len(combined_infected_scans_full)} infected.")

# --- Prepare the full E4 test set ---
X_test_e4_patches = np.concatenate(e4_uninfected_scans + e4_ucbsv_scans)
y_true_e4_scans = np.concatenate([np.full(len(e4_uninfected_scans), 0), np.full(len(e4_ucbsv_scans), 1)])

# --- Result Storage ---
holdout_accuracies = []
e4_majority_vote_accuracies = []
best_params_list = []

print(f"\nStarting {N_REPEATS} iterations of the experiment (Train on E8+E6, Test on E4)...")

# --- Main Loop ---
for i in range(N_REPEATS):
    print(f"--- Iteration {i + 1}/{N_REPEATS} ---")

    # 1. Create a balanced training pool (35 of each class)
    random.shuffle(combined_infected_scans_full)
    infected_selected = combined_infected_scans_full[:35]
    
    training_pool_uninfected = list(combined_uninfected_scans)
    training_pool_infected = list(infected_selected)
    
    # 2. Split the balanced pool: 2 for hold-out, 68 for train/validation
    random.shuffle(training_pool_uninfected)
    random.shuffle(training_pool_infected)
    
    holdout_scans = [training_pool_uninfected[0], training_pool_infected[0]]
    train_val_scans = training_pool_uninfected[1:] + training_pool_infected[1:] # 68 scans total

    # 3. Prepare patch-level data
    X_train_val = np.concatenate(train_val_scans)
    y_train_val = np.concatenate([np.full(34 * 9, 0), np.full(34 * 9, 1)])
    
    X_test_holdout = np.concatenate(holdout_scans)
    y_test_holdout = np.concatenate([np.full(9, 0), np.full(9, 1)])

    # 4. Scale data
    scaler = StandardScaler().fit(X_train_val)
    X_train_val_scaled = scaler.transform(X_train_val)
    X_test_holdout_scaled = scaler.transform(X_test_holdout)
    X_test_e4_scaled = scaler.transform(X_test_e4_patches)
    
    # 5. Hyperparameter tuning
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=i)
    grid_search = GridSearchCV(SVC(), PARAM_GRID, cv=cv_splitter, n_jobs=-1)
    grid_search.fit(X_train_val_scaled, y_train_val)
    
    best_params_list.append(grid_search.best_params_)
    
    # 6. Train final model
    final_model = SVC(**grid_search.best_params_).fit(X_train_val_scaled, y_train_val)
    
    # 7. Test and record accuracies
    # a) Internal hold-out
    y_pred_holdout = final_model.predict(X_test_holdout_scaled)
    holdout_accuracies.append(accuracy_score(y_test_holdout, y_pred_holdout))
    
    # b) External E4 set (Majority Vote)
    y_pred_e4_patches = final_model.predict(X_test_e4_scaled)
    y_pred_e4_scans = []
    current_patch_index = 0
    for scan_group in (e4_uninfected_scans, e4_ucbsv_scans):
        for scan in scan_group:
            num_patches = len(scan)
            scan_patch_preds = y_pred_e4_patches[current_patch_index : current_patch_index + num_patches]
            current_patch_index += num_patches
            majority_vote = Counter(scan_patch_preds).most_common(1)[0][0]
            y_pred_e4_scans.append(majority_vote)
            
    e4_majority_vote_accuracies.append(accuracy_score(y_true_e4_scans, y_pred_e4_scans))

print("\n--- Experiment Complete ---")

Loaded combined training data: 35 uninfected, 36 infected.

Starting 100 iterations of the experiment (Train on E8+E6, Test on E4)...
--- Iteration 1/100 ---
--- Iteration 2/100 ---
--- Iteration 3/100 ---
--- Iteration 4/100 ---
--- Iteration 5/100 ---
--- Iteration 6/100 ---
--- Iteration 7/100 ---
--- Iteration 8/100 ---
--- Iteration 9/100 ---
--- Iteration 10/100 ---
--- Iteration 11/100 ---
--- Iteration 12/100 ---
--- Iteration 13/100 ---
--- Iteration 14/100 ---
--- Iteration 15/100 ---
--- Iteration 16/100 ---
--- Iteration 17/100 ---
--- Iteration 18/100 ---
--- Iteration 19/100 ---
--- Iteration 20/100 ---
--- Iteration 21/100 ---
--- Iteration 22/100 ---
--- Iteration 23/100 ---
--- Iteration 24/100 ---
--- Iteration 25/100 ---
--- Iteration 26/100 ---
--- Iteration 27/100 ---
--- Iteration 28/100 ---
--- Iteration 29/100 ---
--- Iteration 30/100 ---
--- Iteration 31/100 ---
--- Iteration 32/100 ---
--- Iteration 33/100 ---
--- Iteration 34/100 ---
--- Iteration 35/100 ---


In [4]:
# --- Analyze and Report Final Results ---
holdout_accuracies_np = np.array(holdout_accuracies)
e4_majority_vote_accuracies_np = np.array(e4_majority_vote_accuracies)

# Hold-out Test Results
mean_holdout = np.mean(holdout_accuracies_np)
std_holdout = np.std(holdout_accuracies_np)
print("\n--- Test on E8+E6 Hold-out Set (2 leaves, Patch-Level Accuracy) ---")
print(f"Mean Accuracy: {mean_holdout * 100:.2f}%")
print(f"Standard Deviation: {std_holdout * 100:.2f}%")
print("-" * 60)

# E4 External Test Results (Majority Vote)
mean_e4 = np.mean(e4_majority_vote_accuracies_np)
std_e4 = np.std(e4_majority_vote_accuracies_np)
print("\n--- Test on External E4 Dataset (Scan-Level, Majority-Vote Accuracy) ---")
print(f"Mean Scan-Level Accuracy: {mean_e4 * 100:.2f}%")
print(f"Standard Deviation: {std_e4 * 100:.2f}%")
print("-" * 60)

# Hyperparameter Analysis
print("\n--- Hyperparameter Selection Analysis ---")
params_df = pd.DataFrame(best_params_list)
param_counts = params_df.groupby(['C', 'gamma', 'kernel']).size().reset_index(name='counts')
print(f"Frequency of best parameters selected by GridSearchCV across {N_REPEATS} iterations:")
print(param_counts.sort_values(by='counts', ascending=False))


--- Test on E8+E6 Hold-out Set (2 leaves, Patch-Level Accuracy) ---
Mean Accuracy: 94.00%
Standard Deviation: 8.49%
------------------------------------------------------------

--- Test on External E4 Dataset (Scan-Level, Majority-Vote Accuracy) ---
Mean Scan-Level Accuracy: 33.33%
Standard Deviation: 0.00%
------------------------------------------------------------

--- Hyperparameter Selection Analysis ---
Frequency of best parameters selected by GridSearchCV across 100 iterations:
     C  gamma kernel  counts
0  100   0.01    rbf      97
1  100   0.10    rbf       3
