In [None]:
from sortedcontainers import SortedList
import gzip
import shutil
import numpy as np

import imageio
import imgaug as ia
%matplotlib inline

import xml.etree.ElementTree as ET
from PIL import Image, ImageColor
import pickle
import os
import cv2

import matplotlib.pyplot as plt

from PIL import Image
import io
import pickle

# Creating Image Dictionaries

In [None]:
from PIL import Image, ImageDraw
import numpy as np
import xml.etree.ElementTree as ET

def find_xml_file(folder_path):
    """
    Searches for an XML file in the specified folder.

    ``os.listdir`` returns entries in arbitrary order, so when a folder holds
    several XML files the lexicographically smallest name is returned. This
    makes the choice reproducible across runs and platforms.
    Args:
        folder_path (str): Path to the folder containing XML files.
    Returns:
        str or None: Full path of the selected XML file, or None if the folder
        contains no entry whose name ends with '.xml'.
    """

    xml_names = [name for name in os.listdir(folder_path) if name.endswith('.xml')]
    if not xml_names:
        return None
    return os.path.join(folder_path, min(xml_names))

def place_mask(mask_paths, image):
    """
    Rasterizes polygon annotations from XML files into a 256x256 binary mask.

    Each XML file is read at ``root[0][1]``; every child of that element with
    a non-empty attribute set is treated as one region whose second child
    holds the vertices (elements carrying 'X' and 'Y' attributes). Vertex
    coordinates are scaled from the size the image has on entry down to the
    256x256 target before being drawn.
    Args:
        mask_paths (list of str): Paths to XML files containing polygon annotations.
        image (PIL.Image): The image the annotations refer to.
    Returns:
        tuple: A tuple containing:
            - PIL.Image: Mode '1' mask of size 256x256 with the polygons filled.
            - PIL.Image: The input image, resized to 256x256 if it was not already.
    """

    target_size = (256, 256)

    # Scale factors must come from the size on entry, before any resizing.
    source_width, source_height = image.size
    x_factor = target_size[0] / source_width
    y_factor = target_size[1] / source_height

    if image.size == target_size:
        resized = image
    else:
        resized = image.resize(target_size, Image.LANCZOS)

    canvas = Image.new('1', target_size, 0)
    pen = ImageDraw.Draw(canvas)

    for xml_path in mask_paths:
        region_container = ET.parse(xml_path).getroot()[0][1]
        for region in region_container:
            # Children without attributes are not annotated regions.
            if not region.attrib:
                continue
            outline_points = [
                (float(vertex.attrib['X']) * x_factor,
                 float(vertex.attrib['Y']) * y_factor)
                for vertex in region[1]
            ]
            pen.polygon(outline_points, outline=1, fill=1)

    return canvas, resized

def resize_image(img, size=(256, 256)):
    """
    Returns a copy of an image resampled to the requested dimensions.
    Args:
        img (PIL.Image): The image to resize.
        size (tuple, optional): Target size as (width, height). Defaults to (256, 256).
    Returns:
        PIL.Image: The image resized with the LANCZOS filter.
    """

    return img.resize(size, Image.LANCZOS)

def normalize_image(pil_img):
    """
    Normalizes an image by standardizing pixel values and scaling them to [0, 255].

    The statistics are computed over the whole array (all channels together).
    A constant image has no contrast to stretch: its standard deviation and
    value range are both zero, which would otherwise produce 0/0 = NaN before
    the uint8 cast. Such an image is returned as all zeros instead.
    Args:
        pil_img (PIL.Image): The input image to normalize.
    Returns:
        PIL.Image: The normalized image with pixel values rescaled to 8-bit format.
    """

    # Convert PIL Image to a NumPy array
    img_array = np.array(pil_img)

    # Guard against division by zero for uniform images.
    if img_array.max() == img_array.min():
        return Image.fromarray(np.zeros(img_array.shape, dtype='uint8'))

    # Z-score first, then min-max rescale. The rescale alone decides the final
    # range; the z-score step is kept so existing outputs stay numerically
    # unchanged.
    standardized = (img_array - np.mean(img_array)) / np.std(img_array)
    lowest = standardized.min()
    value_range = standardized.max() - lowest
    rescaled = (standardized - lowest) / value_range * 255
    return Image.fromarray(rescaled.astype('uint8'))


# Function to apply morphological operations for boundary smoothing
def smooth_boundary(mask_array):
    """
    Smooths the outline of a binary mask with a morphological closing.

    The closing uses a 5x5 all-ones structuring element.
    Args:
        mask_array (numpy.ndarray): Binary mask where objects are represented by 1s.
    Returns:
        numpy.ndarray: The mask after the closing operation.
    """

    closing_kernel = np.ones((5, 5), np.uint8)
    return cv2.morphologyEx(mask_array, cv2.MORPH_CLOSE, closing_kernel)

# Function to remove small objects (connected components) from the mask
def remove_small_objects(mask_array, min_size=500):
    """
    Drops connected components smaller than a size threshold from a binary mask.

    Components are found with 8-connectivity. Label 0 is the background and is
    never kept.
    Args:
        mask_array (numpy.ndarray): Binary mask where objects are represented by 1s.
        min_size (int, optional): Minimum area, in pixels, for a component to be
            retained. Defaults to 500.
    Returns:
        numpy.ndarray: Mask with the same shape and dtype as the input, holding 1
        on every pixel of a retained component and 0 elsewhere.
    """

    _count, label_map, component_stats, _centroids = cv2.connectedComponentsWithStats(
        mask_array.astype(np.uint8), connectivity=8
    )

    # One flag per label: True when that component is large enough to keep.
    keep_label = component_stats[:, cv2.CC_STAT_AREA] >= min_size
    keep_label[0] = False  # background is never an object

    result = np.zeros_like(mask_array)
    result[keep_label[label_map]] = 1
    return result

# Function to check if a pixel is nearly black with tolerance
def is_nearly_black(pixel, tolerance=10):
    """
    Checks if a pixel is nearly black within a given tolerance.

    The input is coerced to a NumPy array first, so plain lists and tuples are
    accepted as well as arrays (comparing a list to an int directly raises
    TypeError).
    Args:
        pixel (numpy.ndarray or list): Pixel value(s) to check.
        tolerance (int, optional): Maximum intensity value considered as "nearly black". Defaults to 10.
    Returns:
        bool: True if every value is less than or equal to the tolerance, otherwise False.
    """

    return bool(np.all(np.asarray(pixel) <= tolerance))

# Function to remove thin lines (like those seen in the image)
def remove_thin_lines(mask_array, line_width=5):
    """
    Removes narrow structures from a binary mask by eroding and then dilating it.

    Both steps use the same rectangular structuring element, ``line_width``
    pixels wide and 1 pixel high, applied for a single iteration each.
    Args:
        mask_array (numpy.ndarray): Binary mask where objects are represented by 1s.
        line_width (int, optional): Width in pixels of the horizontal structuring
            element. Defaults to 5.
    Returns:
        numpy.ndarray: Mask after the erosion followed by the dilation.
    """

    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_width, 1))
    shrunk = cv2.morphologyEx(mask_array, cv2.MORPH_ERODE, horizontal_kernel, iterations=1)
    return cv2.morphologyEx(shrunk, cv2.MORPH_DILATE, horizontal_kernel, iterations=1)

# Function to create a triangular mask in the top-right corner
def create_top_right_triangle_mask(mask_array):
    """
    Builds a mask containing a filled triangle in the top-right corner.

    The triangle's vertices are the top-right corner, the point a quarter of
    the height down the right edge, and the point a third of the width in
    from the right along the top edge.
    Args:
        mask_array (numpy.ndarray): 2-D array used only for its shape.
    Returns:
        numpy.ndarray: uint8 mask of the same shape with 1s inside the triangle.
    """

    height, width = mask_array.shape
    corner_mask = np.zeros_like(mask_array, dtype=np.uint8)
    triangle = np.array(
        [[width, 0], [width, height // 4], [width - width // 3, 0]],
        np.int32,
    )
    cv2.fillPoly(corner_mask, [triangle], 1)
    return corner_mask

# Function to create a mask for the top few rows to remove stray pixels
def create_top_strip_mask(mask_array, strip_height=10):
    """
    Builds a mask whose top rows are set to 1 and all remaining rows to 0.
    Args:
        mask_array (numpy.ndarray): 2-D array used only for its shape.
        strip_height (int, optional): Number of rows at the top to mark. Defaults to 10.
    Returns:
        numpy.ndarray: uint8 mask of the same shape with ones in the top strip.
    """

    # Unpacking doubles as a check that the input is two-dimensional.
    _rows, _cols = mask_array.shape
    top_rows = np.zeros_like(mask_array, dtype=np.uint8)
    top_rows[:strip_height] = 1
    return top_rows

# Function to remove the triangular chunk and the top strip of pixels
def remove_top_right_triangle_and_strip(mask_array, strip_height=10):
    """
    Clears the top-right triangular region and the top strip of a mask.

    The input array is modified in place and the same array is returned.
    Args:
        mask_array (numpy.ndarray): Binary mask where objects are represented by 1s.
        strip_height (int, optional): Height in pixels of the top strip to clear.
            Defaults to 10.
    Returns:
        numpy.ndarray: The input mask with both regions set to 0.
    """

    corner = create_top_right_triangle_mask(mask_array)
    top_strip = create_top_strip_mask(mask_array, strip_height)

    # Both helper masks hold only 0/1, so the bitwise OR is their union.
    region_to_clear = (corner | top_strip) == 1
    mask_array[region_to_clear] = 0
    return mask_array

# Function to process and improve image cutoffs (for "Normal" samples) with tolerance for black pixels
def process_image_cutoff(image_cutoff, black_tolerance=10):
    """
    Blacks out everything outside the cleaned tissue region of an image.

    A pixel counts as tissue when any of its channel values exceeds
    ``black_tolerance``. The resulting mask is then cleaned in four steps:
    boundary smoothing, small-object removal, clearing of the top-right
    triangle and top strip, and thin-line removal. Pixels outside the final
    mask are set to [0, 0, 0].
    Args:
        image_cutoff (PIL.Image): Input image to process. Assumed to have three
            channels, since masked pixels are overwritten with a 3-value black.
        black_tolerance (int, optional): Intensity threshold at or below which
            a channel value is considered black. Defaults to 10.
    Returns:
        PIL.Image: Copy of the input with non-tissue pixels set to black.
    """

    pixels = np.array(image_cutoff)
    tissue = (pixels > black_tolerance).any(axis=-1).astype(np.uint8)

    # Cleanup steps are order-dependent; each consumes the previous mask.
    cleanup_pipeline = (
        smooth_boundary,
        remove_small_objects,
        remove_top_right_triangle_and_strip,
        remove_thin_lines,
    )
    for cleanup_step in cleanup_pipeline:
        tissue = cleanup_step(tissue)

    blacked_out = pixels.copy()
    blacked_out[tissue == 0] = [0, 0, 0]
    return Image.fromarray(blacked_out)

def _resize_and_normalize_if_needed(img, size=(256, 256)):
    """Resize and normalize an image unless it already has the target size."""
    # NOTE(review): normalization is applied only to images that needed a
    # resize, matching the historical behavior; images that are already at the
    # target size are passed through untouched. Confirm this is intended.
    if img.size != size:
        img = normalize_image(resize_image(img, size))
    return img


def _build_subfolder_mask(tissue_type, mask_path, subfolder_path, image, size=(256, 256)):
    """Build the 8-bit ('L' mode) tumor mask shared by all images of a subfolder."""
    if tissue_type == 'Normal':
        # Normal tissue has no annotations: the mask is entirely empty.
        binary_mask = Image.new('1', size)
    else:
        if mask_path is None:
            raise FileNotFoundError(
                f"No XML annotation file found in {subfolder_path}"
            )
        # place_mask needs the image at its size on disk to scale coordinates.
        binary_mask = resize_image(place_mask([mask_path], image)[0])

    # Mode '1' converts to a boolean array; scale it to 0/255 for mode 'L'.
    return Image.fromarray(np.uint8(np.asarray(binary_mask) * 255), mode='L')


def create_image_dicts(
    base_path,
    cutoff_root='C:\\Users\\Tyler\\CNN Project - Take 2\\CNN_Data_ver2',
    grayscale_root='C:\\Users\\Tyler\\CNN Project - Take 2\\CNN_Data_Grayscale',
):
    """
    Creates a list of dictionaries containing image data, masks, and processed variations
    for different tissue types.

    For every '.tif' file under ``base_path/<tissue_type>/<subfolder>``, the
    matching files are also read from ``cutoff_root`` and ``grayscale_root``.
    There the subfolder name has underscores replaced by spaces. One mask is
    built per subfolder, from the first '.tif' encountered, and shared by all
    images of that subfolder.
    Args:
        base_path (str): Path to the base dataset directory containing subfolders
                         for different tissue types.
        cutoff_root (str, optional): Root directory of the background-removed
                         images. Defaults to the original hard-coded location.
        grayscale_root (str, optional): Root directory of the grayscale images.
                         Defaults to the original hard-coded location.
    Returns:
        list of dict: A list where each dictionary contains:
            - 'name' (str): Unique name combining subfolder and image filename.
            - 'image' (PIL.Image): Image, resized and normalized if it was not
                                   already 256x256.
            - 'image_cutoff' (PIL.Image): Processed version of the image with unwanted
                                          background removed.
            - 'tissue_type' (str): Type of tissue ('Normal', 'Follicular', 'Papillary',
                                   'Anaplastic').
            - 'mask' (PIL.Image): Corresponding 'L' mode mask for tumor regions
                                  (all zeros for 'Normal').
            - 'grayscale' (PIL.Image): Grayscale version of the image.
    Raises:
        FileNotFoundError: If a non-'Normal' subfolder contains '.tif' images
            but no XML annotation file.
    """

    target_size = (256, 256)
    image_data = []

    # Iterate through each tissue type
    for tissue_type in ['Normal', 'Follicular', 'Papillary', 'Anaplastic']:
        tissue_path = os.path.join(base_path, tissue_type)

        # Iterate through each subfolder in the tissue type
        for subfolder in os.listdir(tissue_path):
            subfolder_path = os.path.join(tissue_path, subfolder)
            # Skip Finder metadata and any other stray non-directory entries.
            if subfolder == ".DS_Store" or not os.path.isdir(subfolder_path):
                continue

            mask_path = find_xml_file(subfolder_path)
            sibling_name = subfolder.replace("_", " ")
            cutoff_dir = os.path.join(cutoff_root, tissue_type, sibling_name)
            grayscale_dir = os.path.join(grayscale_root, tissue_type, sibling_name)

            mask = None  # built lazily from the first image of the subfolder
            for image_file in os.listdir(subfolder_path):
                if not image_file.endswith('.tif'):
                    continue

                print(image_file)
                image = Image.open(os.path.join(subfolder_path, image_file))

                if mask is None:
                    mask = _build_subfolder_mask(
                        tissue_type, mask_path, subfolder_path, image, target_size
                    )

                image = _resize_and_normalize_if_needed(image, target_size)

                image_cutoff = Image.open(os.path.join(cutoff_dir, image_file))
                image_cutoff = process_image_cutoff(image_cutoff)
                image_cutoff = _resize_and_normalize_if_needed(image_cutoff, target_size)

                grayscale = Image.open(os.path.join(grayscale_dir, image_file))
                grayscale = _resize_and_normalize_if_needed(grayscale, target_size)

                image_data.append({
                    'name': subfolder + "_" + image_file.replace(" ", "_"),
                    'image': image,
                    'image_cutoff': image_cutoff,
                    'tissue_type': tissue_type,
                    'mask': mask,
                    'grayscale': grayscale,
                })

    return image_data

# Usage
# Build the image dictionaries from the "CNN_Data" folder in the current
# working directory; this runs as soon as the cell/module is executed.
base_path = os.path.join(os.getcwd(), "CNN_Data")
image_dicts = create_image_dicts(base_path)

# Relative name: the pickle is written to the current working directory.
filename = 'image_dicts_256_wgrayscale.pkl'

# Open the file in binary write mode and pickle the list of dictionaries
with open(filename, 'wb') as file:
    pickle.dump(image_dicts, file)

print(f"Dictionary has been pickled and saved as {filename}")

# Image Cutoffs for Grayscale

In [None]:
# Function to load image_dicts from a file
import pickle
def load_image_dicts(file_path):
    """
    Reads back a pickled list of image dictionaries.

    Unpickling can execute arbitrary code, so only load files from a trusted
    source.
    Args:
        file_path (str): Path to the pickle file containing the image dictionaries.
    Returns:
        list of dict: The unpickled object; expected to be a list of dictionaries,
        each representing an image and its associated data.
    """

    with open(file_path, 'rb') as pickle_handle:
        loaded = pickle.load(pickle_handle)
    return loaded

# Load the image_dicts
# This rebinds the module-level image_dicts to the contents of the pickle.
# NOTE(review): the path is absolute and machine-specific, and points inside
# CNN_Data_ver2, whereas the save step writes a same-named file to the current
# working directory; confirm both refer to the intended file.
file_path = 'C:\\Users\\Tyler\\CNN Project - Take 2\\CNN_Data_ver2\\image_dicts_256_wgrayscale.pkl'
image_dicts = load_image_dicts(file_path)

# Function to create a mask for non-black pixels in image_cutoff
def create_tissue_mask(image_cutoff, black_tolerance=10):
    """
    Marks every pixel that is not (nearly) black as tissue.

    A pixel is tissue when at least one value along its last axis exceeds
    ``black_tolerance``.
    Args:
        image_cutoff (PIL.Image): Input image for mask generation.
        black_tolerance (int, optional): Values at or below this threshold are
            considered black. Defaults to 10.
    Returns:
        numpy.ndarray: uint8 mask where tissue regions are marked as 1 and background as 0.
    """

    pixels = np.array(image_cutoff)
    above_threshold = pixels > black_tolerance
    return above_threshold.any(axis=-1).astype(np.uint8)

# NOTE(review): this list is created but nothing in the visible code appends
# to it; the loop below stores its results on each dict in image_dicts instead.
image_dicts_grayscale_cutoffs = []
for image_dict in image_dicts:
    # Binary tissue mask (1 = tissue) derived from the background-removed image
    mask = create_tissue_mask(image_dict["image_cutoff"])
    # Force single-channel 'L' mode and store the converted image back on the dict
    image_grayscale = image_dict["grayscale"].convert("L")
    image_dict["grayscale"] = image_grayscale
    
    overlay = np.array(image_grayscale)
    
    # Black out (set to 0) every pixel that is not marked as tissue
    overlay[~(mask == 1)] = 0
    
    # Convert back to an image and attach it under a new key
    image_dict["image_grayscale_cutoff"] = Image.fromarray(overlay)

In [None]:
def main():
    """Build the image dictionaries from ./CNN_Data and pickle them to disk."""
    output_name = 'image_dicts_256_wgrayscale.pkl'
    dataset_root = os.path.join(os.getcwd(), "CNN_Data")

    records = create_image_dicts(dataset_root)

    # The relative name means the pickle lands in the current working directory.
    with open(output_name, 'wb') as sink:
        pickle.dump(records, sink)

    print(f"Dictionary has been pickled and saved as {output_name}")


if __name__ == "__main__":
    main()
