In [None]:
from sortedcontainers import SortedList
import gzip
import shutil
import numpy as np
import os
import pickle
import re
import pandas as pd
import imageio
import imgaug as ia
import xml.etree.ElementTree as ET
from PIL import Image, ImageColor
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.colors import ListedColormap
import seaborn as sns
from collections import defaultdict, deque
from scipy.ndimage import (
    gaussian_filter,
    label,
    find_objects,
    binary_dilation
)
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report
)
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Concatenate, Conv2D, MaxPooling2D, UpSampling2D, concatenate,
    BatchNormalization, Activation, Dense, Flatten, GlobalAveragePooling2D, Reshape
)
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from collections import defaultdict

In [None]:
def load_image_dicts(file_path):
    """
    Read the pickled image dictionaries stored at ``file_path``.

    Any failure is reported on stdout and converted into an empty list, so
    callers never see an exception from this function.

    Parameters:
        file_path (str): Path to the pickle file containing image dictionaries.
    Returns:
        list: The unpickled object (expected to be a list of image metadata
        dictionaries), or ``[]`` if the file is missing or cannot be unpickled.
    """
    loaded = []
    try:
        # NOTE: pickle.load can execute arbitrary code; only use with trusted files.
        with open(file_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        print(f"Error loading file: {e}")
    return loaded

# Define paths. They are assembled with os.path.join so the notebook also runs
# on macOS/Linux; the previous hard-coded '\\' separators only resolve on Windows.
BASE_PATH = ''
FILE_PATH = os.path.join(BASE_PATH, 'data', 'image_dicts_256.pkl')
EXCEL_FILE_PATH = os.path.join(BASE_PATH, 'data', 'sample_groups.xlsx')

# Load image dictionaries (an empty list is returned if the file cannot be read)
image_dicts = load_image_dicts(FILE_PATH)

# Extract sample names from the per-tissue-type folders under BASE_PATH
samples = []
for tissue_type in ['Normal', 'Follicular', 'Papillary', 'Anaplastic']:
    tissue_path = os.path.join(BASE_PATH, tissue_type)
    if os.path.exists(tissue_path):
        # Skip the macOS Finder metadata entry; every other entry counts as a sample
        samples.extend(
            subfolder for subfolder in os.listdir(tissue_path) if subfolder != ".DS_Store"
        )
    else:
        print(f"Warning: Path not found for tissue type '{tissue_type}'")

# Exclude specific samples: drop every image whose name contains one of these IDs
EXCLUDE_LIST = ["SSW-23-14395_C2", "SSW-23-05363_A7"]
image_dicts = [img for img in image_dicts if not any(excl in img["name"] for excl in EXCLUDE_LIST)]

def load_sample_groups(excel_file_path):
    """
    Read the train / validation / test sample names from an Excel sheet.

    The sheet must provide the columns 'Train Samples', 'Validation Samples'
    and 'Test Samples'; empty cells in a column are ignored. Any failure is
    reported on stdout and yields three empty lists.

    Parameters:
        excel_file_path (str): Path to the Excel file containing sample groups.
    Returns:
        tuple: (train_samples, val_samples, test_samples), each a list.
    """
    column_names = ('Train Samples', 'Validation Samples', 'Test Samples')
    try:
        groups_df = pd.read_excel(excel_file_path)
        train_samples, val_samples, test_samples = (
            groups_df[column].dropna().tolist() for column in column_names
        )
    except FileNotFoundError:
        print(f"Error: Sample groups file not found at {excel_file_path}")
        return [], [], []
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return [], [], []
    return train_samples, val_samples, test_samples

# Load sample groups (train / validation / test sample names).
# load_sample_groups returns three empty lists if the Excel file cannot be read,
# in which case categorize_images below assigns no image to any set.
train_samples, val_samples, test_samples = load_sample_groups(EXCEL_FILE_PATH)

def categorize_images(image_data, train_samples, val_samples, test_samples):
    """
    Split image dictionaries into training, validation and test sets.

    The sample ID of an image is formed from the first two '_'-separated
    parts of its 'name', joined by a space. An image goes to the first set
    (train, then validation, then test) whose sample list contains that ID;
    images matching none of the lists are dropped.

    Parameters:
    image_data (list): List of dictionaries containing image metadata.
    train_samples (list): List of sample names designated for training.
    val_samples (list): List of sample names designated for validation.
    test_samples (list): List of sample names designated for testing.

    Returns:
    tuple: Lists of categorized image data for training, validation, and testing.
    """
    train_set, val_set, test_set = [], [], []
    # Order matters: it encodes the train > validation > test precedence.
    destinations = (
        (train_samples, train_set),
        (val_samples, val_set),
        (test_samples, test_set),
    )
    for entry in image_data:
        name_parts = entry['name'].split('_')
        sample_id = " ".join(name_parts[:2])
        for members, bucket in destinations:
            if sample_id in members:
                bucket.append(entry)
                break
    return train_set, val_set, test_set

# Categorize images into train, validation, and test sets.
# Images whose sample ID appears in none of the three lists are left out of all sets.
train_set, val_set, test_set = categorize_images(image_dicts, train_samples, val_samples, test_samples)

# Shuffle datasets; the fixed random_state keeps the shuffled order reproducible across runs
train_set = shuffle(train_set, random_state=42)
val_set = shuffle(val_set, random_state=42)
test_set = shuffle(test_set, random_state=42)

def get_base_name(name):
    """
    Return the part of an image filename that precedes the first "_DOCI".

    If the filename does not contain "_DOCI", it is returned unchanged.
    Parameters:
        name (str): The image filename.
    Returns:
        str: The base name extracted from the filename.
    """
    base, _separator, _rest = name.partition('_DOCI')
    return base

def get_doci_number(name):
    """
    Parse the integer n from the first "_DOCI_n" occurrence in a filename.

    Parameters:
        name (str): The image filename.
    Returns:
        int: The extracted DOCI number, or -1 if the filename has no "_DOCI_<digits>" part.
    """
    found = re.search(r'_DOCI_(\d+)', name)
    if found is None:
        return -1
    return int(found.group(1))

# Averaging after PCA

Comparing all tissue of the normal samples (from tissue_cutout) with the cancer regions of the cancer samples (from mask)

## 2D

In [None]:
# Function to group images by sample and ensure correct order
def group_images_by_sample(image_set):
    """
    Collect the images of every non-test sample, ordered by DOCI number.

    An image is skipped when its base name (with '_' replaced by ' ') is listed
    in the notebook-level ``test_samples``.
    Parameters:
        image_set (list): A list of dictionaries, each representing an image with a 'name' key.
    Returns:
        dict: A dictionary where keys are sample names and values are lists of image dictionaries sorted by order.
    """

    print("Grouping images by sample...")
    grouped_samples = defaultdict(list)
    for img_dict in image_set:
        sample_key = get_base_name(img_dict['name'])
        if sample_key.replace("_", " ") in test_samples:
            continue  # test samples are handled separately
        grouped_samples[sample_key].append(img_dict)

    # Order each sample's images by their DOCI index
    for images in grouped_samples.values():
        images.sort(key=lambda entry: get_doci_number(entry['name']))

    print(f"Grouped {len(grouped_samples)} samples.")
    return grouped_samples

# Function to create a mask for non-black pixels in image_cutoff
def create_tissue_mask(image_cutoff, black_tolerance=5):
    """
    Build a 0/1 mask marking every pixel brighter than ``black_tolerance``.

    Parameters:
        image_cutoff (numpy array): The input image.
        black_tolerance (int): Pixels with a value at or below this are treated as black.
    Returns:
        numpy array: A uint8 mask of the same shape, 1 where the pixel is non-black, else 0.
    """
    pixels = np.array(image_cutoff)
    return np.greater(pixels, black_tolerance).astype(np.uint8)

# Aggregate pixel values into a 23-length vector per pixel location
def aggregate_pixel_vectors(sample_images, expected_channels=23):
    """
    Aggregates pixel values from a set of sample images into per-pixel vectors.

    For every selected pixel location the values of all images of the sample are
    stacked into one vector (one entry per image, in the order of ``sample_images``).
    Which pixels are selected depends on the first image of the sample:
      - tissue type 'Normal': pixels where ``create_tissue_mask`` applied to its
        'image_grayscale_cutoff' is 1;
      - any other tissue type: pixels where its 'mask' (cast to uint8) is > 0.

    Parameters:
        sample_images (list): A list of image dictionaries containing grayscale image data.
        expected_channels (int): Required number of images per sample (default 23).
            A sample with a different number of images contributes no vectors.
    Returns:
        numpy array: Shape (n_selected_pixels, expected_channels), rows in row-major
        pixel order; an empty 1-D array if no pixel qualifies.
    """

    print("Aggregating pixel vectors...")
    if not sample_images:
        # Nothing to aggregate; mirror the "no valid pixel" result instead of raising.
        print("Aggregated pixel vectors: 0 valid pixels found.")
        return np.array([])

    first_image = sample_images[0]
    height, width = np.array(first_image['grayscale']).shape
    img_arrays = np.array([np.array(img_dict['grayscale']) for img_dict in sample_images])

    # Build the boolean selection mask once instead of testing pixel by pixel.
    if first_image['tissue_type'] == 'Normal':
        img_cutoff = np.array(first_image['image_grayscale_cutoff'])
        selected = create_tissue_mask(img_cutoff) == 1
        print("Created tissue mask for Normal sample.")
    else:
        selected = np.array(first_image['mask']).astype(np.uint8) > 0
    # Only locations inside the grayscale image are considered.
    selected = selected[:height, :width]

    if len(sample_images) != expected_channels or not selected.any():
        print("Aggregated pixel vectors: 0 valid pixels found.")
        return np.array([])

    # Boolean indexing visits True positions in row-major order, i.e. the same
    # order as a nested "for y / for x" loop; transpose to one row per pixel.
    pixel_vectors = np.ascontiguousarray(img_arrays[:, selected].T)

    print(f"Aggregated pixel vectors: {len(pixel_vectors)} valid pixels found.")
    return pixel_vectors

def prepare_data_for_pca(grouped_samples):
    """
    Stack the pixel vectors of all samples and build matching per-row labels.

    Samples for which ``aggregate_pixel_vectors`` yields nothing are skipped.
    Parameters:
        grouped_samples (dict): Dictionary of grouped images by sample.
    Returns:
        tuple: (combined pixel vectors, sample labels, tissue types). The two
        label lists have one entry per row of the combined array; the array is
        an empty 1-D array when no sample contributed any vector.
    """

    print("Preparing data for PCA...")
    all_pixel_vectors, sample_labels, tissue_types = [], [], []

    for sample_name in grouped_samples:
        sample_images = grouped_samples[sample_name]
        vectors = aggregate_pixel_vectors(sample_images)
        if vectors.size == 0:
            continue

        all_pixel_vectors.append(vectors)
        # One label / tissue type per valid pixel vector of this sample
        row_count = vectors.shape[0]
        sample_labels += [sample_name] * row_count
        tissue_types += [sample_images[0]['tissue_type']] * row_count

    if not all_pixel_vectors:
        combined_samples = np.array([])
    else:
        combined_samples = np.concatenate(all_pixel_vectors, axis=0)
        print(f"Combined pixel vectors for PCA with shape: {combined_samples.shape}.")

    # Sanity checks: labels must line up with the rows handed to PCA
    assert len(sample_labels) == combined_samples.shape[0], "Mismatch between labels and PCA input."
    assert len(tissue_types) == combined_samples.shape[0], "Mismatch between tissue types and PCA input."

    print(f"Prepared data for {len(all_pixel_vectors)} samples.")
    return combined_samples, sample_labels, tissue_types


# Perform PCA on the aggregated samples
def perform_pca_on_samples(aggregated_samples, n_components=2):
    """
    Fit a new PCA on the given pixel vectors and return their projection.

    Parameters:
        aggregated_samples (numpy array): The aggregated pixel vectors.
        n_components (int): Number of PCA components to retain.
    Returns:
        numpy array: PCA transformed results, or an empty array when the input is empty.
    """

    print("Performing PCA...")
    if aggregated_samples.size != 0:
        projected = PCA(n_components=n_components).fit_transform(aggregated_samples)
        print("PCA completed.")
        return projected

    print("No valid samples to perform PCA.")
    return np.array([])

# Average PCA results for each sample (element-wise)
def average_pca_per_sample(pca_result, sample_labels, tissue_types):
    """
    Computes the average PCA values for each unique sample.

    Row indices are grouped per sample in a single pass over ``sample_labels``
    (instead of rescanning all labels once per sample).

    Parameters:
        pca_result (numpy array): The PCA transformed data.
        sample_labels (list): Labels indicating which sample each PCA result belongs to.
        tissue_types (list): Corresponding tissue type for each sample.
    Returns:
        tuple: (numpy array of averaged PCA values, list of corresponding tissue types).
        Rows follow the iteration order of ``set(sample_labels)``; other code in this
        notebook pairs sample names with these rows using that same order, so it is
        kept on purpose.
    """

    print("Averaging PCA results per sample...")

    # One pass: sample name -> list of row indices belonging to it
    indices_by_sample = defaultdict(list)
    for row_index, sample in enumerate(sample_labels):
        indices_by_sample[sample].append(row_index)

    averaged_pca = []
    averaged_tissue_types = []
    for sample in set(sample_labels):
        indices = indices_by_sample[sample]  # never empty for a label that occurs
        averaged_pca.append(np.mean(pca_result[indices], axis=0))
        # The tissue type of the sample's first row represents the whole sample
        averaged_tissue_types.append(tissue_types[indices[0]])

    print(f"Averaged PCA results for {len(averaged_pca)} samples.")
    return np.array(averaged_pca), averaged_tissue_types
    

def plot_pca_results(pca_result, tissue_types, sample_labels=None):
    """
    Plots the per-sample averaged PCA results with color-coded tissue types.

    Parameters:
        pca_result (numpy array): PCA transformed data (one row per pixel vector).
        tissue_types (list): List of tissue type labels, one per row of ``pca_result``.
        sample_labels (list, optional): Sample name for each row of ``pca_result``.
            When omitted, the notebook-level ``sample_labels`` variable is used,
            which is what this function silently relied on before.
    """

    if pca_result.size == 0:
        print("No PCA results to plot.")
        return

    if sample_labels is None:
        # Backward compatibility with existing two-argument calls.
        sample_labels = globals()['sample_labels']

    print("Plotting PCA results...")
    plt.figure(figsize=(10, 8))

    # Color map for tissue types
    tissue_type_colors = {'Normal': 'blue', 'Follicular': 'orange', 'Papillary': 'green', 'Anaplastic': 'red'}

    # Average PCA coordinates for samples and plot (one marker per sample)
    averaged_pca, averaged_tissue_types = average_pca_per_sample(pca_result, sample_labels, tissue_types)
    plt.scatter(
        averaged_pca[:, 0],
        averaged_pca[:, 1],
        c=[tissue_type_colors[tissue] for tissue in averaged_tissue_types],
        marker='X',
    )

    # Create legend with one proxy handle per tissue type
    handles = [plt.Line2D([0], [0], marker='X', color='w', label=tissue,
                           markerfacecolor=color, markersize=10) for tissue, color in tissue_type_colors.items()]
    plt.legend(handles=handles, title="Tissue Type")

    plt.title("PCA of Samples Based on Aggregated Pixel Values")
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.show()
    print("PCA results plotted.")


# Main execution flow
# Reload the image dictionaries from the file that also carries the grayscale
# images and cutoffs; this overwrites the `image_dicts` loaded earlier.
# NOTE(review): the EXCLUDE_LIST filter applied to the earlier `image_dicts` is not
# re-applied to this reload, so excluded samples may be present again - confirm intended.
file_path = 'data/image_dicts_256_wgrayscale_andcutoffs.pkl'
image_dicts = load_image_dicts(file_path)

# Group images by sample (samples listed in `test_samples` are left out)
grouped_samples = group_images_by_sample(image_dicts)

# Prepare the data for PCA: one row per selected pixel, plus per-row sample and tissue labels
all_samples, sample_labels, tissue_types = prepare_data_for_pca(grouped_samples)


def plot_raw_pca_results(pca_result, tissue_types, sample_labels):
    """
    Scatter-plots the first two PCA components of every pixel vector.

    Parameters:
        pca_result (numpy array): PCA transformed data.
        tissue_types (list): List of tissue type labels (one per row), used for coloring.
        sample_labels (list): List of sample labels (accepted but not used by the plot).
    """

    if pca_result.size == 0:
        print("No PCA results to plot.")
        return

    print("Plotting raw PCA results (per pixel)...")
    plt.figure(figsize=(10, 8))

    # Color map for tissue types
    tissue_type_colors = {'Normal': 'blue', 'Follicular': 'orange', 'Papillary': 'green', 'Anaplastic': 'red'}
    point_colors = list(map(tissue_type_colors.__getitem__, tissue_types))

    first_component, second_component = pca_result[:, 0], pca_result[:, 1]
    plt.scatter(first_component, second_component, c=point_colors, s=1, alpha=0.5)

    # Legend: one proxy marker per tissue type
    handles = []
    for tissue, color in tissue_type_colors.items():
        handles.append(
            plt.Line2D([0], [0], marker='o', color='w', label=tissue,
                       markerfacecolor=color, markersize=10)
        )
    plt.legend(handles=handles, title="Tissue Type", loc='upper right')

    plt.title("Raw PCA of Pixel Values")
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.show()
    print("Raw PCA results plotted.")

# Perform PCA (2 components by default) on all aggregated training pixel vectors
pca_result = perform_pca_on_samples(all_samples)

# Plot raw PCA results for all pixel locations (one point per pixel vector)
plot_raw_pca_results(pca_result, tissue_types, sample_labels)

# Plot averaged PCA results (one point per sample; uses the notebook-level sample_labels)
plot_pca_results(pca_result, tissue_types)

In [None]:
# Perform PCA on the aggregated samples and compute explained variance
def perform_pca_with_variance(aggregated_samples, n_components=10):
    """
    Performs Principal Component Analysis (PCA) on the given aggregated samples
    and computes the explained variance ratio for each principal component.
    Parameters:
        aggregated_samples (numpy.ndarray): The dataset to perform PCA on, with samples as rows.
        n_components (int, optional): The number of principal components to compute. Default is 10.

    Returns:
        tuple:
        - numpy.ndarray: Transformed data after applying PCA (empty if the input is empty).
        - numpy.ndarray: Explained variance ratio for each principal component
          (empty if the input is empty).
    """

    print("Performing PCA with variance computation...")
    if aggregated_samples.size == 0:
        print("No valid samples to perform PCA.")
        # Return ndarrays on this path too, so callers can use `.size` on either
        # result without special-casing the empty input.
        return np.array([]), np.array([])

    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(aggregated_samples)
    explained_variance = pca.explained_variance_ratio_
    print(f"PCA completed. Explained variance by components: {explained_variance}")
    return pca_result, explained_variance


# Plot explained variance ratio (sideways with PC 1 at the top)
def plot_explained_variance(explained_variance):
    """
    Draws a horizontal bar chart of the explained variance ratio per principal
    component, with PC 1 shown at the top.
    Parameters:
        explained_variance (list or numpy.ndarray): The explained variance ratio for each principal component.
    """

    print("Plotting explained variance ratio...")
    plt.figure(figsize=(8, 12))

    # 1-based component numbers double as bar positions and tick locations
    components = range(1, len(explained_variance) + 1)
    tick_labels = [f'PC {i}' for i in components]

    plt.barh(components, explained_variance, color='skyblue')

    # Title and axis labels
    plt.title("Explained Variance by Principal Components", fontsize=20)
    plt.ylabel("Principal Components", fontsize=19)
    plt.xlabel("Explained Variance Ratio", fontsize=19)

    # Tick styling
    plt.yticks(components, labels=tick_labels, fontsize=16)
    plt.xticks(fontsize=12)

    # Flip the y-axis so that PC 1 is the topmost bar
    current_axes = plt.gca()
    current_axes.invert_yaxis()

    # Dashed vertical guide lines
    plt.grid(axis='x', linestyle='--', alpha=0.9)

    plt.tight_layout()
    plt.show()
    print("Explained variance ratio plotted.")


# Perform PCA and compute explained variance
pca_result, explained_variance = perform_pca_with_variance(all_samples)

# Plot explained variance. len() is used instead of `.size` because the empty-input
# path of perform_pca_with_variance may hand back a plain list, which has no `.size`.
if len(explained_variance) > 0:
    plot_explained_variance(explained_variance)

In [None]:
# Plot absolute PCA loadings for PC1 and PC2 as stacked bars
def plot_absolute_pca_loadings(pca):
    """
    Draws a stacked bar chart of the absolute loadings of the first two
    principal components (PC1 at the bottom, PC2 stacked on top), with one
    bar per input feature ("Filter 1", "Filter 2", ...).

    Parameters:
        pca (sklearn.decomposition.PCA): The PCA object after fitting the data.
    """

    print("Plotting absolute PCA loadings for PC1 and PC2 (stacked)...")
    plt.figure(figsize=(12, 6))

    n_features = pca.components_.shape[1]
    x_ticks = [f"Filter {i+1}" for i in range(n_features)]

    # Absolute loadings for PC1 and PC2
    pc1_loadings = np.abs(pca.components_[0])
    pc2_loadings = np.abs(pca.components_[1])

    # Stack the layers: each new layer starts where the previous one ended
    stack_base = None
    for layer_label, layer_values in (('PC1', pc1_loadings), ('PC2', pc2_loadings)):
        plt.bar(x_ticks, layer_values, alpha=0.7, label=layer_label, bottom=stack_base)
        stack_base = layer_values

    plt.title("Absolute PCA Loadings for PC1 and PC2")
    plt.xlabel("Filters")
    plt.ylabel("Absolute Loadings")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
    print("Stacked absolute PCA loadings plotted.")

# Perform PCA: fit a fresh 2-component model and keep the fitted object in `pca`
# so its components_ can be inspected. This overwrites the `pca_result` produced
# by perform_pca_with_variance in the previous cell.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(all_samples)

# Plot stacked absolute PCA loadings
plot_absolute_pca_loadings(pca)

In [None]:
# Distinct training sample names. plot_pca_with_boundaries_and_test turns this set
# into a list to name the per-sample rows returned by average_pca_per_sample, which
# iterates set(sample_labels) itself - the pairing relies on both iterating alike.
sample_labels_unique = set(sample_labels)

In [None]:
def group_images_by_sample2(image_set):
    """
    Collect the images of every test sample, ordered by DOCI number.

    Only images whose base name (with '_' replaced by ' ') is listed in the
    notebook-level ``test_samples`` are kept.
    Parameters:
        image_set (list of dict): A list of image dictionaries containing metadata, including 'name'.
    Returns:
        dict: A dictionary where the keys are base sample names, and values are lists of image dictionaries
          corresponding to that sample.
    """

    print("Grouping images by sample...")
    grouped_samples = defaultdict(list)
    for img_dict in image_set:
        sample_key = get_base_name(img_dict['name'])
        if sample_key.replace("_", " ") not in test_samples:
            continue  # keep test samples only
        grouped_samples[sample_key].append(img_dict)

    # Order each sample's images by their DOCI index
    for images in grouped_samples.values():
        images.sort(key=lambda entry: get_doci_number(entry['name']))

    print(f"Grouped {len(grouped_samples)} samples.")
    return grouped_samples


# Main execution flow
# Build the path with os.path.join: the previous hard-coded 'data\\...' form only
# resolves on Windows, whereas this is the same file loaded earlier via 'data/...'.
file_path = os.path.join('data', 'image_dicts_256_wgrayscale_andcutoffs.pkl')
image_dicts = load_image_dicts(file_path)
# Group only the images that belong to samples listed in `test_samples`
grouped_samples1 = group_images_by_sample2(image_dicts)

# Plotting Decision Boundaries

In [None]:
def _print_accuracy_summary(split_name, predictions, true_tissue_types, class_names):
    """
    Prints the accuracy figures for one data split.

    'Anaplastic' is not a class the classifier was trained on, so an Anaplastic
    sample counts as a success when it is predicted as anything but 'Normal'.

    Parameters:
        split_name (str): Prefix used in the printed lines ("Training" or "Test").
        predictions (sequence of int): Predicted class indices, one per sample.
        true_tissue_types (list): True tissue type name of each sample.
        class_names (list): Tissue type names; a class index is a position in this list.
    """
    normal_index = class_names.index('Normal')
    non_anaplastic = [i for i, tissue in enumerate(true_tissue_types) if tissue != 'Anaplastic']
    anaplastic = [i for i, tissue in enumerate(true_tissue_types) if tissue == 'Anaplastic']

    # Plain accuracy on the classes the classifier actually knows
    y_true = np.array([class_names.index(true_tissue_types[i]) for i in non_anaplastic])
    y_pred = [predictions[i] for i in non_anaplastic]
    accuracy_non_anaplastic = accuracy_score(y_true, y_pred)

    # Anaplastic samples: success means "not classified as Normal"
    success_count = sum((predictions[i] != normal_index) for i in anaplastic)
    if anaplastic:
        success_rate = success_count / len(anaplastic)
        accuracy_inclusive = (
            (accuracy_non_anaplastic * len(non_anaplastic) + success_count)
            / len(true_tissue_types)
        )
    else:
        success_rate = 0
        accuracy_inclusive = accuracy_non_anaplastic

    print(f"{split_name} Accuracy excluding 'Anaplastic': {accuracy_non_anaplastic:.2f}")
    print(f"{split_name} Accuracy including 'Anaplastic': {accuracy_inclusive:.2f}")
    print(f"{split_name} Success rate for 'Anaplastic' classification: {success_rate:.2f}")


def plot_pca_with_boundaries_and_test(pca_result, tissue_types, grouped_test_samples, n_neighbors=5, pca_model=None):
    """
    Plots the PCA results with decision boundaries and test samples using a k-Nearest Neighbors (k-NN) classifier.

    This function visualizes the separation of tissue types in PCA space by:
    - Training a k-NN classifier on the PCA-transformed training data (excluding 'Anaplastic' tissue type).
    - Plotting decision boundaries based on the trained k-NN model.
    - Displaying training samples with different markers and colors based on tissue type.
    - Overlaying test samples and their corresponding classifications.
    - Calculating and printing training and test accuracy metrics.

    Besides its arguments, the function reads the notebook-level ``sample_labels``
    (sample name of every row of ``pca_result``) and calls
    ``preprocess_and_remove_islands_v5`` to clean the decision regions.

    Parameters:
        pca_result (numpy.ndarray): The PCA-transformed feature set.
        tissue_types (list): A list of tissue type labels corresponding to the PCA-transformed training samples.
        grouped_test_samples (dict): A dictionary containing test samples grouped by their base sample name.
        n_neighbors (int): The number of neighbors to use for k-NN classification.
            Defaults to 5, scikit-learn's own default for KNeighborsClassifier.
        pca_model (optional): An already fitted PCA object (anything with a
            ``transform`` method). When given, test pixel vectors are projected with
            it, so test and training points share one coordinate system. When
            omitted, a new PCA is fitted on the test data alone (previous behaviour),
            whose axes need not match those of ``pca_result``.
    """

    print("Plotting PCA results with decision boundaries and test samples...")
    plt.figure(figsize=(8, 8))

    # Color map for tissue types; the position of a name in class_names is its class index
    tissue_type_colors = {'Normal': 'blue', 'Follicular': 'orange', 'Papillary': 'green', 'Anaplastic': 'red'}
    class_names = list(tissue_type_colors.keys())

    # Average PCA coordinates for training samples
    averaged_pca_train, averaged_tissue_types_train = average_pca_per_sample(pca_result, sample_labels, tissue_types)

    # Exclude 'Anaplastic' points from the training data
    non_anaplastic_indices = [i for i, tissue in enumerate(averaged_tissue_types_train) if tissue != 'Anaplastic']
    anaplastic_indices = [i for i, tissue in enumerate(averaged_tissue_types_train) if tissue == 'Anaplastic']
    X_train = np.array([averaged_pca_train[i] for i in non_anaplastic_indices])  # Features
    y_train = np.array([class_names.index(averaged_tissue_types_train[i]) for i in non_anaplastic_indices])  # Labels

    # Train k-NN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Plot limits cover every training sample, 'Anaplastic' ones included. Taking them
    # from the full averaged array also works when there is no 'Anaplastic' sample.
    anaplastic_points = np.array([averaged_pca_train[i] for i in anaplastic_indices])
    x_min = averaged_pca_train[:, 0].min() - 10
    x_max = averaged_pca_train[:, 0].max() + 10
    y_min = averaged_pca_train[:, 1].min() - 10
    y_max = averaged_pca_train[:, 1].max() + 10
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    Z_cleaned = preprocess_and_remove_islands_v5(Z, size_threshold=400)

    # Generate a colormap dynamically for the unique areas in Z_cleaned
    unique_areas = np.unique(Z_cleaned)
    cmap = ListedColormap([tissue_type_colors[t] for t in tissue_type_colors.keys() if t != 'Anaplastic'])

    # Map unique areas to sequential indices for coloring
    area_to_color = {area: idx for idx, area in enumerate(unique_areas)}
    Z_mapped = np.vectorize(area_to_color.get)(Z_cleaned)

    # Plot decision boundaries
    plt.contourf(xx, yy, Z_mapped, alpha=0.3, cmap=cmap)

    plt.scatter(X_train[:, 0], X_train[:, 1],
                c=[tissue_type_colors[averaged_tissue_types_train[i]] for i in non_anaplastic_indices],
                marker='X', label='Training Samples')

    # Plot "Anaplastic" training samples
    if len(anaplastic_points) > 0:
        plt.scatter(anaplastic_points[:, 0], anaplastic_points[:, 1],
                    c='red', marker='X', label='Training: Anaplastic')

    # Prepare test samples
    test_vectors, test_labels, test_tissue_types = prepare_data_for_pca(grouped_test_samples)
    if pca_model is not None:
        # Project into the space the classifier was trained in
        test_pca = pca_model.transform(test_vectors)
    else:
        print("Warning: no pca_model given; fitting a separate PCA on the test data, "
              "whose axes may differ from the training PCA.")
        test_pca = perform_pca_on_samples(test_vectors, n_components=2)

    # Average PCA coordinates for test samples
    averaged_pca_test, averaged_tissue_types_test = average_pca_per_sample(test_pca, test_labels, test_tissue_types)

    # Combine training and test samples for prediction printing.
    # Names are listed in set-iteration order because average_pca_per_sample
    # emits its rows in that same order.
    combined_samples = np.vstack((averaged_pca_train, averaged_pca_test))
    combined_tissue_types = averaged_tissue_types_train + averaged_tissue_types_test
    combined_names = list(set(sample_labels)) + list(set(test_labels))

    # Predict labels for combined samples
    combined_predictions = knn.predict(np.array(combined_samples))
    combined_predicted_tissue_types = [class_names[pred] for pred in combined_predictions]

    prediction_df = pd.DataFrame({
        'Sample Name': combined_names,
        'True Tissue Type': combined_tissue_types,
        'Predicted Tissue Type': combined_predicted_tissue_types
    })

    print("Prediction Results:")
    print(prediction_df)

    # Plot test samples
    plt.scatter(np.array(averaged_pca_test)[:, 0], np.array(averaged_pca_test)[:, 1],
                c=[tissue_type_colors[tissue] for tissue in averaged_tissue_types_test],
                marker='o', edgecolor='k', label='Test Samples')

    # Create legend
    handles = [
        plt.Line2D([0], [0], marker='X', color='w', label=f'Training: {tissue}',
                   markerfacecolor=color, markersize=10, markeredgecolor='k')
        for tissue, color in tissue_type_colors.items()
    ] + [
        plt.Line2D([0], [0], marker='o', color='k', label='Test Samples', markerfacecolor='none', markersize=10)
    ]
    plt.legend(handles=handles, title="Tissue Type")

    plt.title("PCA of Samples with Decision Boundaries (Training and Test)")
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.show()
    print("Decision boundaries plotted with test samples.")

    # Accuracy for the training set
    y_pred_train = knn.predict(np.array(averaged_pca_train))
    _print_accuracy_summary("Training", y_pred_train, averaged_tissue_types_train, class_names)

    # Accuracy for the test set
    y_pred_test = knn.predict(np.array(averaged_pca_test))
    _print_accuracy_summary("Test", y_pred_test, averaged_tissue_types_test, class_names)

def custom_label(decision_boundary):
    """
    Label 4-connected regions of equal value with consecutive integers.

    Cells are scanned in row-major order; each still-unlabelled cell starts a
    new region that is grown over all horizontally/vertically adjacent cells
    holding the same value.

    Parameters:
    - decision_boundary: 2D numpy array containing class predictions.

    Returns:
    - labeled_array: 2D numpy array where each connected region has a unique label (starting at 1).
    - num_features: Number of connected regions.
    """
    n_rows, n_cols = decision_boundary.shape
    labeled_array = np.zeros_like(decision_boundary, dtype=int)
    neighbour_offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))
    num_features = 0

    for seed in np.ndindex(n_rows, n_cols):
        if labeled_array[seed] != 0:
            continue  # already part of an earlier region

        num_features += 1
        region_value = decision_boundary[seed]
        labeled_array[seed] = num_features

        # Grow the region; cells are labelled when queued, so none is queued twice
        pending = [seed]
        while pending:
            row, col = pending.pop()
            for d_row, d_col in neighbour_offsets:
                n_row, n_col = row + d_row, col + d_col
                if not (0 <= n_row < n_rows and 0 <= n_col < n_cols):
                    continue
                if labeled_array[n_row, n_col] == 0 and decision_boundary[n_row, n_col] == region_value:
                    labeled_array[n_row, n_col] = num_features
                    pending.append((n_row, n_col))

    return labeled_array, num_features


def preprocess_and_remove_islands_v5(decision_boundary, size_threshold=300, distance=2, num_classes=3):
    """
    Discretizes a decision boundary and replaces small isolated regions ("islands")
    with the majority class of their immediate surroundings.

    Steps:
    1. The values are binned into `num_classes` equal-width bins spanning
       [min, max] of the input, giving integer labels 0 .. num_classes - 1.
       The labels are bin indices relative to the observed range; they equal
       the input class values only when the input already spans 0 .. num_classes - 1.
    2. Connected regions of equal label are found with `custom_label`.
    3. Every region with fewer than `size_threshold` cells is overwritten with the
       most frequent label found in a ring of `distance` dilation steps around it.
       The ring is always read from the un-cleaned labels, so the result does not
       depend on the order in which islands are processed.

    Parameters:
    - decision_boundary: 2D numpy array containing the class predictions for each grid point.
    - size_threshold: Regions with fewer cells than this are treated as islands and removed.
    - distance: Number of binary-dilation iterations used to build the ring around an island
      from which the majority class is taken.
    - num_classes: Number of discrete classes in the decision boundary.

    Returns:
    - A 2D integer numpy array of the same shape with islands removed.
    """
    decision_boundary = np.asarray(decision_boundary)

    # Nothing to discretize or clean (np.min / np.max would raise on an empty array)
    if decision_boundary.size == 0:
        return np.zeros(decision_boundary.shape, dtype=int)

    lowest = np.min(decision_boundary)
    highest = np.max(decision_boundary)

    if lowest == highest:
        # Degenerate range: all bin edges would coincide, so put everything in bin 0
        discretized = np.zeros(decision_boundary.shape, dtype=int)
    else:
        bins = np.linspace(lowest, highest, num_classes + 1)
        # np.digitize treats bins as half-open [edge_i, edge_{i+1}), so a value equal to
        # the last edge (the maximum) lands one bin too high. Clip it back into the top
        # class so the labels are exactly 0 .. num_classes - 1.
        discretized = np.clip(np.digitize(decision_boundary, bins) - 1, 0, num_classes - 1)

    # Label contiguous regions of equal class
    labeled_regions, num_features = custom_label(discretized)

    # Work on a copy so the neighbourhood lookups below always see the original labels
    cleaned_boundary = discretized.copy()

    # Region sizes in one pass; index 0 is unused because region ids start at 1
    region_sizes = np.bincount(labeled_regions.ravel(), minlength=num_features + 1)
    island_ids = np.flatnonzero(region_sizes[1:] < size_threshold) + 1

    # Only islands need a full-grid mask; large regions are skipped entirely
    for region_id in island_ids:
        region_mask = labeled_regions == region_id

        # Ring of cells within `distance` dilation steps of the island, excluding the island itself
        expanded_mask = binary_dilation(region_mask, iterations=distance)
        surrounding_classes = discretized[expanded_mask & ~region_mask]

        # An island that fills the whole grid has no surroundings and is left untouched
        if surrounding_classes.size > 0:
            cleaned_boundary[region_mask] = np.bincount(surrounding_classes).argmax()

    return cleaned_boundary

# Main execution flow
# NOTE(review): `file_path` is an absolute path into one user's Windows profile, so this
# cell only works on that machine. Consider deriving it from BASE_PATH, as FILE_PATH is
# in the setup cell at the top of the notebook.
file_path = 'C:\\Users\\Tyler\\CNN Project - Take 2\\CNN_Data_ver2\\image_dicts_256_wgrayscale_andcutoffs.pkl'
# Rebinds the global `image_dicts` that the setup cell loaded from FILE_PATH.
# load_image_dicts prints an error and returns [] when the pickle cannot be read,
# so a wrong path does not raise here; the failure surfaces further down instead.
image_dicts = load_image_dicts(file_path)
# group_images_by_sample2 is defined outside this chunk — presumably it groups the
# image dictionaries by sample name; confirm against its definition.
grouped_samples1 = group_images_by_sample2(image_dicts)

# Perform PCA with 2 components
# `all_samples` is not assigned in this cell; it must already exist in the kernel.
pca_result = perform_pca_on_samples(all_samples, n_components=2)

# Plot boundaries with both training and test samples
# `tissue_types` also comes from earlier kernel state.
# NOTE(review): this call runs before the `def plot_pca_with_boundaries_and_test` in the
# next cell, so on a fresh Restart & Run All it uses whichever definition was executed
# earlier in the notebook — confirm that one exists and is the intended version.
plot_pca_with_boundaries_and_test(pca_result, tissue_types, grouped_samples1)

In [None]:
def plot_pca_with_boundaries_and_test(pca_result, tissue_types, grouped_test_samples, n_neighbors=7):
    """
    Evaluates a k-Nearest Neighbors (k-NN) classifier on per-sample PCA coordinates
    and plots confusion matrices for the training and test sets.

    This function:
    - Averages the PCA coordinates per sample for the training data.
    - Trains a k-NN classifier on the averaged training points, excluding 'Anaplastic' samples.
    - Prepares the test samples, runs PCA on them, averages per sample and classifies them.
    - Remaps labels into three categories ('Normal', 'Follicular', and 'Papillary') for evaluation.
      An 'Anaplastic' sample predicted as 'Follicular' or 'Papillary' is scored as correct
      (its true label is set to the predicted one). An 'Anaplastic' sample predicted as
      'Normal' has no valid 3-class true label; it keeps a separate label (3), is left out
      of the 3x3 confusion matrix and is reported in a printed note.
    - Plots a confusion matrix and prints a classification report for both datasets.

    Despite its name, this function draws no decision-boundary or scatter figure;
    the name and signature are kept so existing calls continue to work.

    Parameters:
        pca_result (numpy.ndarray): The PCA-transformed feature set for training samples.
        tissue_types (list): A list of tissue type labels corresponding to the training samples.
        grouped_test_samples (dict): A dictionary containing test samples grouped by their base sample name.
        n_neighbors (int, optional): The number of neighbors to use in the k-NN classifier. Default is 7.

    Notes:
        Reads the notebook-level global `sample_labels` (it is not a parameter).
    """

    print("Plotting PCA results with decision boundaries and test samples...")

    # Position in this tuple is the integer class label used throughout
    tissue_order = ('Normal', 'Follicular', 'Papillary', 'Anaplastic')
    anaplastic_label = tissue_order.index('Anaplastic')

    # Only the first three classes are evaluated; 'Anaplastic' is never a k-NN target
    report_labels = [0, 1, 2]
    class_names = ['Normal', 'Follicular', 'Papillary']

    # Average PCA coordinates for training samples
    # NOTE(review): `sample_labels` is a global defined elsewhere in the notebook.
    averaged_pca_train, averaged_tissue_types_train = average_pca_per_sample(pca_result, sample_labels, tissue_types)

    # Exclude 'Anaplastic' points from the training data
    non_anaplastic_indices = [i for i, tissue in enumerate(averaged_tissue_types_train) if tissue != 'Anaplastic']
    X_train = np.array([averaged_pca_train[i] for i in non_anaplastic_indices])  # Features
    y_train = np.array([tissue_order.index(tissue) for tissue in averaged_tissue_types_train if tissue != 'Anaplastic'])  # Labels

    # Train k-NN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Prepare test samples
    # NOTE(review): the test data goes through perform_pca_on_samples separately from the
    # training data. If that helper fits a new PCA on every call, the test coordinates are
    # not in the same space as the training coordinates — confirm against its definition.
    test_samples, test_labels, test_tissue_types = prepare_data_for_pca(grouped_test_samples)
    test_pca = perform_pca_on_samples(test_samples, n_components=2)

    # Average PCA coordinates for test samples
    averaged_pca_test, averaged_tissue_types_test = average_pca_per_sample(test_pca, test_labels, test_tissue_types)

    def remap_labels(y_true, y_pred):
        """Map integer labels to the 3-class scheme; returns (y_true_mapped, y_pred_mapped) as lists."""
        mapping = {'Normal': 0, 'Follicular': 1, 'Papillary': 2, 'Anaplastic': anaplastic_label}
        y_true_mapped = []
        y_pred_mapped = []

        for true, pred in zip(y_true, y_pred):
            true_label = tissue_order[true]
            pred_label = tissue_order[pred]

            if true_label == 'Anaplastic' and pred_label in ('Follicular', 'Papillary'):
                # Any tumour prediction counts as a hit for 'Anaplastic'
                y_true_mapped.append(mapping[pred_label])
            else:
                # Includes 'Anaplastic' predicted as 'Normal': keeps label 3, which is
                # outside `report_labels` and therefore never matches a prediction
                y_true_mapped.append(mapping[true_label])

            y_pred_mapped.append(mapping[pred_label])

        return y_true_mapped, y_pred_mapped

    # Predict and remap labels for training set
    y_pred_train = knn.predict(X_train)
    y_true_train_mapped, y_pred_train_mapped = remap_labels(y_train, y_pred_train)

    # Predict and remap labels for test set
    y_pred_test = knn.predict(np.array(averaged_pca_test))
    y_true_test = np.array([tissue_order.index(tissue) for tissue in averaged_tissue_types_test])
    y_true_test_mapped, y_pred_test_mapped = remap_labels(y_true_test, y_pred_test)

    def plot_confusion_matrix(y_true, y_pred, title):
        """Show a 3x3 confusion-matrix heatmap and print the matching classification report."""
        cm = confusion_matrix(y_true, y_pred, labels=report_labels)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues',
                    annot_kws={"size": 20})
        plt.title(title, fontsize=16)
        plt.xlabel('Predicted Label', fontsize=16)
        plt.ylabel('True Label', fontsize=16)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        plt.show()
        # Passing `labels` keeps the report aligned with `class_names` even when a class is
        # absent from the data or the extra 'Anaplastic' label (3) is present
        print(classification_report(y_true, y_pred, labels=report_labels, target_names=class_names))

    # Plot confusion matrix for training
    print("\nTraining Set Confusion Matrix:")
    plot_confusion_matrix(y_true_train_mapped, y_pred_train_mapped, "Confusion Matrix - Training Set")

    # Plot confusion matrix for test
    print("\nTest Set Confusion Matrix:")
    missed_anaplastic = y_true_test_mapped.count(anaplastic_label)
    if missed_anaplastic:
        # Make the samples that are left out of the 3x3 matrix visible
        print(f"Note: {missed_anaplastic} 'Anaplastic' test sample(s) were classified as 'Normal' "
              "and are not shown in the 3-class confusion matrix.")
    plot_confusion_matrix(y_true_test_mapped, y_pred_test_mapped, "Confusion Matrix - Test Set")

# Main execution flow
# These statements repeat the "Main execution flow" sequence at the end of the previous
# cell; here the final call uses the plot_pca_with_boundaries_and_test defined just above.
# NOTE(review): `file_path` is an absolute path into one user's Windows profile, so this
# cell only works on that machine. Consider deriving it from BASE_PATH, as FILE_PATH is
# in the setup cell at the top of the notebook.
file_path = 'C:\\Users\\Tyler\\CNN Project - Take 2\\CNN_Data_ver2\\image_dicts_256_wgrayscale_andcutoffs.pkl'
# load_image_dicts prints an error and returns [] when the pickle cannot be read,
# so a wrong path does not raise here.
image_dicts = load_image_dicts(file_path)
# group_images_by_sample2 is defined outside this chunk — presumably it groups the
# image dictionaries by sample name; confirm against its definition.
grouped_samples1 = group_images_by_sample2(image_dicts)

# Perform PCA with 2 components
# `all_samples` is not assigned in this cell; it must already exist in the kernel.
pca_result = perform_pca_on_samples(all_samples, n_components=2)

# Plot boundaries with both training and test samples
# `tissue_types` also comes from earlier kernel state; grouped_samples1 is passed as
# the function's `grouped_test_samples` argument.
plot_pca_with_boundaries_and_test(pca_result, tissue_types, grouped_samples1)

# Cross-Validation

In [None]:
def cross_validate_knn(pca_result, tissue_types, grouped_test_samples, max_neighbors=10):
    """
    Performs cross-validation to determine the optimal number of neighbors (k) for k-Nearest Neighbors (k-NN) classification.

    This function:
    - Computes the average PCA coordinates for training samples.
    - Excludes 'Anaplastic' samples from training but evaluates their classification separately.
    - Uses k-fold cross-validation (8 shuffled folds, random_state=42) to evaluate accuracy for each k.
    - Uses a combined accuracy: correct non-'Anaplastic' predictions plus 'Anaplastic' samples
      predicted as anything other than 'Normal', divided by the number of samples scored.
      In every fold, all 'Anaplastic' training samples are scored alongside the validation fold.
    - Evaluates test set accuracy with the same combined metric after refitting on all
      non-'Anaplastic' training samples.
    - Plots accuracy results across different values of k.
    - Returns the best value of k based on cross-validation results.

    Parameters:
        pca_result (numpy.ndarray): The PCA-transformed feature set for training samples.
        tissue_types (list): A list of tissue type labels corresponding to the training samples.
        grouped_test_samples (dict): A dictionary containing test samples grouped by their base sample name.
        max_neighbors (int, optional): The maximum number of neighbors to consider in cross-validation. Default is 10.

    Returns:
        int: The optimal number of neighbors (k) based on cross-validated training accuracy
        (the smallest k wins ties).

    Notes:
        Reads the notebook-level global `sample_labels` (it is not a parameter).
    """

    print("Performing cross-validation to optimize n_neighbors...")

    # Position in this tuple is the integer class label used throughout
    tissue_order = ('Normal', 'Follicular', 'Papillary', 'Anaplastic')
    normal_label = tissue_order.index('Normal')

    # Average PCA coordinates for training samples
    # NOTE(review): `sample_labels` is a global defined elsewhere in the notebook.
    averaged_pca_train, averaged_tissue_types_train = average_pca_per_sample(pca_result, sample_labels, tissue_types)

    # Prepare training data excluding "Anaplastic"
    non_anaplastic_indices = [i for i, tissue in enumerate(averaged_tissue_types_train) if tissue != 'Anaplastic']
    X_train = np.array([averaged_pca_train[i] for i in non_anaplastic_indices])  # Features
    y_train = np.array([tissue_order.index(tissue) for tissue in averaged_tissue_types_train if tissue != 'Anaplastic'])  # Labels

    # "Anaplastic" training samples are never fitted on, only scored
    anaplastic_indices_train = [i for i, tissue in enumerate(averaged_tissue_types_train) if tissue == 'Anaplastic']
    X_train_anaplastic = np.array([averaged_pca_train[i] for i in anaplastic_indices_train])
    n_train_anaplastic = len(anaplastic_indices_train)

    # Prepare test samples
    # NOTE(review): the test data goes through perform_pca_on_samples separately from the
    # training data. If that helper fits a new PCA on every call, the test coordinates are
    # not in the same space as the training coordinates — confirm against its definition.
    test_samples, test_labels, test_tissue_types = prepare_data_for_pca(grouped_test_samples)
    test_pca = perform_pca_on_samples(test_samples, n_components=2)
    averaged_pca_test, averaged_tissue_types_test = average_pca_per_sample(test_pca, test_labels, test_tissue_types)
    X_test = np.array(averaged_pca_test)
    y_test = np.array([tissue_order.index(tissue) for tissue in averaged_tissue_types_test])

    # The test-set split into Anaplastic / other does not depend on k, so build it once
    is_anaplastic_test = np.array([tissue == 'Anaplastic' for tissue in averaged_tissue_types_test], dtype=bool)
    total_anaplastic_test = int(np.count_nonzero(is_anaplastic_test))
    total_other_test = len(y_test) - total_anaplastic_test
    y_test_other = y_test[~is_anaplastic_test]

    # Cross-validation loop
    kf = KFold(n_splits=8, shuffle=True, random_state=42)
    train_accuracies = []
    train_std_dev = []
    test_accuracies = []
    test_std_dev = []
    neighbors_range = range(1, max_neighbors + 1)

    for n_neighbors in neighbors_range:
        print(f"Evaluating n_neighbors={n_neighbors}...")
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)

        # Cross-validated training accuracy including "Anaplastic"
        cv_train_accuracy = []
        for train_idx, val_idx in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            knn.fit(X_train_fold, y_train_fold)

            # Predict on validation set
            y_val_pred = knn.predict(X_val_fold)

            # "Anaplastic" counts as a success when it is not called 'Normal'.
            # Skip the prediction when there are no such samples: predict() rejects empty input.
            if n_train_anaplastic:
                success_anaplastic = np.count_nonzero(knn.predict(X_train_anaplastic) != normal_label)
            else:
                success_anaplastic = 0

            # Combine non-Anaplastic accuracy with Anaplastic success
            non_anaplastic_acc = accuracy_score(y_val_fold, y_val_pred)
            combined_accuracy = (non_anaplastic_acc * len(y_val_fold) + success_anaplastic) / (len(y_val_fold) + n_train_anaplastic)
            cv_train_accuracy.append(combined_accuracy)

        train_accuracies.append(np.mean(cv_train_accuracy))
        train_std_dev.append(np.std(cv_train_accuracy))

        # Test set accuracy: refit on the full non-Anaplastic training set
        knn.fit(X_train, y_train)
        y_test_pred = knn.predict(X_test)

        # Test accuracy including "Anaplastic"
        success_count_test_anaplastic = np.count_nonzero(y_test_pred[is_anaplastic_test] != normal_label)

        # accuracy_score is skipped when every test sample is 'Anaplastic' (nothing to score)
        if total_other_test:
            accuracy_test_non_anaplastic = accuracy_score(y_test_other, y_test_pred[~is_anaplastic_test])
        else:
            accuracy_test_non_anaplastic = 0.0

        combined_test_accuracy = (
            (accuracy_test_non_anaplastic * total_other_test + success_count_test_anaplastic) / len(y_test)
        )
        test_accuracies.append(combined_test_accuracy)
        test_std_dev.append(0)  # Assuming no cross-validation for the test set

    # Plot results
    plt.figure(figsize=(12, 6))

    print(train_accuracies)
    # Error bars for training accuracy
    plt.errorbar(neighbors_range, train_accuracies, yerr=train_std_dev,
                 label="Training Accuracy",
                 marker='o', capsize=5, ecolor='lightblue', elinewidth=1.5)

    # Error bars for test accuracy
    plt.errorbar(neighbors_range, test_accuracies, yerr=test_std_dev,
                 label="Test Accuracy",
                 marker='x', capsize=5, ecolor='orange', elinewidth=1.5)

    # Axis labels and title with increased font size
    plt.xlabel("Number of Neighbors (K)", fontsize=14)
    plt.ylabel("Accuracy", fontsize=14)
    plt.title("Cross-Validation for Optimizing K", fontsize=16)

    # Legend with larger font size
    plt.legend(fontsize=12)

    # Grid and larger tick labels
    plt.grid()
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Show plot
    plt.show()

    # Report the best n_neighbors (np.argmax returns the first maximum, i.e. the smallest k)
    best_n_neighbors = neighbors_range[int(np.argmax(train_accuracies))]
    print(f"Best n_neighbors based on training accuracy: {best_n_neighbors}")
    return best_n_neighbors


In [None]:
# Rebuild the grouping from the currently loaded `image_dicts`.
# group_images_by_sample2 is defined outside this chunk — presumably it groups the
# image dictionaries by sample name; confirm against its definition.
grouped_samples1 = group_images_by_sample2(image_dicts)
# Evaluate k = 1..10 and keep the k returned by cross_validate_knn.
# `pca_result` and `tissue_types` are not assigned in this cell; they must already
# exist in the kernel from earlier cells.
best_n_neighbors = cross_validate_knn(pca_result, tissue_types, grouped_samples1, max_neighbors=10)