### 1. Convert the .tif images to .png images for visualization

In [None]:
import os
from PIL import Image
import sys

# --- Function to convert a single TIF file ---
def convert_tif_to_png(input_filepath: str, output_filepath: str) -> bool:
    """
    Convert a .tif file to .png format and save it.

    Images whose mode is neither RGB nor RGBA are converted to RGB before
    saving, so the written PNG always has 3 or 4 channels.

    Args:
        input_filepath: Path to the input .tif file.
        output_filepath: Path to save the output .png file.

    Returns:
        True if conversion was successful, False otherwise. Failures are
        reported on stderr instead of being raised.
    """
    try:
        with Image.open(input_filepath) as img:
            # Normalise every other mode (paletted, grayscale, ...) to RGB.
            if img.mode not in ('RGB', 'RGBA'):
                img = img.convert('RGB')

            # PNG is lossless: `optimize` is the only size-related option used
            # here. The JPEG-style `quality` option is not a PNG save option,
            # so it is not passed.
            img.save(output_filepath, format="PNG", optimize=True)
        return True
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_filepath}", file=sys.stderr)
        return False
    except Exception as e:
        # Best-effort batch conversion: report the problem and let the caller count it.
        print(f"An error occurred processing {os.path.basename(input_filepath)}: {e}", file=sys.stderr)
        return False

# --- Function to process all TIF files in a directory ---
def process_image_directory(input_dir: str, output_dir: str):
    """
    Convert every .tif file found directly inside a directory to .png.

    The output directory is created when missing. Each converted file keeps
    its base name and receives a .png extension. A summary with the number of
    successful and failed conversions is printed at the end.

    Args:
        input_dir: Directory holding the input .tif files.
        output_dir: Directory that receives the .png files.
    """
    print(f"Processing images from: {input_dir}")
    print(f"Saving converted images to: {output_dir}")

    os.makedirs(output_dir, exist_ok=True)

    names = os.listdir(input_dir)
    print(f"Found {len(names)} entries in {input_dir}")

    # tally[True] counts successes, tally[False] counts failures.
    tally = {True: 0, False: 0}

    for name in names:
        source = os.path.join(input_dir, name)

        # Only regular files with a .tif suffix (any letter case) are handled.
        if not (os.path.isfile(source) and name.lower().endswith('.tif')):
            continue

        try:
            stem, _ = os.path.splitext(name)
            target = os.path.join(output_dir, f"{stem}.png")

            print(f"Converting {name}...")
            succeeded = bool(convert_tif_to_png(source, target))
        except Exception as e:
            # Anything unexpected for this entry counts as a failure.
            print(f"An unexpected error occurred processing entry {name}: {e}", file=sys.stderr)
            succeeded = False

        tally[succeeded] += 1

    separator = "-" * 30
    print(separator)
    print(f"Processing complete for {input_dir}")
    print(f"Successfully converted: {tally[True]}")
    print(f"Failed to convert: {tally[False]}")
    print(separator)

# --- Main Execution Block (suitable for a notebook cell) ---

# Root folders: original .tif data in, .png visualisations out
base_download_path = "./data/DSB2018/original"
base_save_path = "./data/DSB2018/visual"

# Dataset splits to convert
splits = ['train', 'test', 'val']

for split in splits:
    input_dir = os.path.join(base_download_path, split, 'images')
    output_dir = os.path.join(base_save_path, split, 'images')

    # A split without an images folder is reported and skipped
    if not os.path.isdir(input_dir):
        print(f"\n--- Skipping {split} split ---")
        print(f"Input directory not found: {input_dir}")
        print("-" * 30)
        continue

    process_image_directory(input_dir, output_dir)

print("\nAll specified splits processed.")

### 2. Convert the .tif masks to .npy and .png formats for visualization and training

In [None]:
import os
import numpy as np
from PIL import Image # Used for opening the TIFF file
import random
import cv2 # Used for saving the colorized PNG
import sys # Used for printing errors to stderr

# --- Function to colorize a segmentation map ---
def colorize_seg_map(seg_map: np.ndarray, palette: dict = None) -> np.ndarray:
    """
    Paint every segment of a label map with its own RGB colour.

    Args:
        seg_map: Label map in which each distinct value identifies a segment.
                 Non-array inputs are converted with ``np.array``.
        palette: Optional mapping from segment ID to an RGB tuple
                 (e.g., {1: (255, 0, 0), 2: (0, 255, 0)}). When omitted, a
                 random colour is drawn for each non-zero ID found in the map.

    Returns:
        A uint8 array shaped like ``seg_map`` with a trailing axis of length 3.
        Pixels with ID 0, and pixels whose ID has no palette entry, stay black.
    """
    if not isinstance(seg_map, np.ndarray):
        seg_map = np.array(seg_map)

    present_ids = np.unique(seg_map)
    canvas = np.zeros((*seg_map.shape, 3), dtype=np.uint8)

    if palette is None:
        # One random colour per non-background ID, drawn in ascending ID order.
        palette = {
            label: (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
            for label in present_ids
            if label != 0
        }

    for label, rgb in palette.items():
        # Background and IDs absent from the map are never painted.
        if label == 0 or label not in present_ids:
            continue
        canvas[seg_map == label, :] = rgb

    return canvas

# --- Function to process a single TIFF segmentation file ---
def process_segmentation_file(input_filepath: str, output_base_path: str) -> bool:
    """
    Convert a TIFF segmentation mask to a NumPy array (.npy) and
    a colorized PNG image (.png).

    Args:
        input_filepath: Path to the input .tif segmentation file.
        output_base_path: Base path for saving the output files (without extension).
                          The .npy and .png extensions will be added automatically.

    Returns:
        True if both output files were written, False otherwise. Failures are
        reported on stderr instead of being raised.
    """
    try:
        # Decode the mask, then release the file handle before the heavier work.
        with Image.open(input_filepath) as img:
            # Segment IDs are stored as uint16.
            img_array = np.array(img, dtype=np.uint16)

        color_mask = colorize_seg_map(img_array)

        output_npy_path = f"{output_base_path}.npy"
        output_png_path = f"{output_base_path}.png"

        np.save(output_npy_path, img_array)

        # OpenCV expects BGR channel order, so the RGB mask is converted first.
        # cv2.imwrite signals failure through its return value, so it is
        # checked explicitly; otherwise a missing PNG would count as a success.
        if not cv2.imwrite(output_png_path, cv2.cvtColor(color_mask, cv2.COLOR_RGB2BGR)):
            raise OSError(f"could not write {output_png_path}")

        return True
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_filepath}", file=sys.stderr)
        return False
    except Exception as e:
        print(f"An error occurred processing {os.path.basename(input_filepath)}: {e}", file=sys.stderr)
        return False

# --- Function to process all segmentation files in a directory ---
def process_segmentation_directory(input_dir: str, output_dir: str):
    """
    Convert every .tif mask found directly inside a directory.

    Each mask is handed to ``process_segmentation_file`` together with an
    output base path (output directory + original base name). The output
    directory is created when missing, and a summary of processed and failed
    files is printed at the end.

    Args:
        input_dir: Directory holding the input .tif mask files.
        output_dir: Directory that receives the .npy and .png files.
    """
    print(f"Processing segmentation masks from: {input_dir}")
    print(f"Saving processed files to: {output_dir}")

    os.makedirs(output_dir, exist_ok=True)

    names = os.listdir(input_dir)
    print(f"Found {len(names)} entries in {input_dir}")

    # tally[True] counts successes, tally[False] counts failures.
    tally = {True: 0, False: 0}

    for name in names:
        source = os.path.join(input_dir, name)

        # Only regular files with a .tif suffix (any letter case) are handled.
        if not (os.path.isfile(source) and name.lower().endswith('.tif')):
            continue

        try:
            stem, _ = os.path.splitext(name)
            target_base = os.path.join(output_dir, stem)

            print(f"Processing {name}...")
            succeeded = bool(process_segmentation_file(source, target_base))
        except Exception as e:
            # Anything unexpected for this entry counts as a failure.
            print(f"An unexpected error occurred processing entry {name}: {e}", file=sys.stderr)
            succeeded = False

        tally[succeeded] += 1

    separator = "-" * 40
    print(separator)
    print(f"Processing complete for {input_dir}")
    print(f"Successfully processed: {tally[True]}")
    print(f"Failed to process: {tally[False]}")
    print(separator)


# --- Main Execution Block (suitable for a notebook cell) ---

# Root folders; masks are read from <base_input_path>/<split>/masks/*.tif
base_input_path = "./data/DSB2018/original"
base_output_path = "./data/DSB2018/visual"

# Dataset splits whose masks are converted
splits = ['train', 'test', 'val']

for split in splits:
    # Masks live in a 'masks' subfolder of each split, on both the input and the output side
    input_masks_dir = os.path.join(base_input_path, split, 'masks')
    output_processed_dir = os.path.join(base_output_path, split, 'masks')

    # A split without a masks folder is reported and skipped
    if not os.path.isdir(input_masks_dir):
        print(f"\n--- Skipping {split} split ---")
        print(f"Input masks directory not found: {input_masks_dir}")
        print("-" * 40)
        continue

    process_segmentation_directory(input_masks_dir, output_processed_dir)

print("\nAll specified splits processed.")



### 3. Split images and masks into 256×256 patches (HoVer-Net training format)

In [None]:
import os
import numpy as np
from PIL import Image
import sys # Used for printing errors to stderr

# --- Function to split and save patches for a single image and mask pair ---
def split_and_save_patches(
    input_image_path: str,
    input_mask_path: str,
    output_split_dir: str, # Directory to save all patches for this split
    image_base_name: str, # Base name of the original image (e.g., '1')
    patch_size: int = 256,
    step: int = 256
) -> int:
    """
    Split an image and its corresponding mask into patches and save each
    combined patch (3 image channels + 1 mask channel) as a separate .npy
    file in the specified output directory for the split.

    Only windows that fit entirely inside the image are extracted, so rows
    and columns left over at the bottom/right edge are not saved.

    Args:
        input_image_path: Path to the input .png image file.
        input_mask_path: Path to the input .npy mask file (2-D array).
        output_split_dir: The single directory where all patches for this split
                          will be saved. This directory is assumed to exist
                          (created by the calling function).
        image_base_name: The base filename of the original image (without extension),
                         used for naming the output patches
                         (``<image_base_name>_<index>.npy``).
        patch_size: The height and width of each square patch (in pixels).
        step: The step size (in pixels) for the sliding window. Use step < patch_size
              for overlapping patches, and step = patch_size for non-overlapping patches.

    Returns:
        The number of patches saved for this image/mask pair (0 when the
        image is smaller than one patch), or -1 if an error occurred.
    """
    print(f"Processing image: {os.path.basename(input_image_path)}")
    print(f"Using mask: {os.path.basename(input_mask_path)}")
    print(f"Saving patches to: {output_split_dir}")

    try:
        # Reject unusable window parameters before touching the disk.
        if patch_size <= 0 or step <= 0:
            raise ValueError("Patch size and step must be positive integers.")

        # The context manager closes the image file as soon as the pixels are
        # copied into the array; convert("RGB") guarantees 3 channels.
        with Image.open(input_image_path) as img:
            image = np.array(img.convert("RGB"))
        mask = np.load(input_mask_path)

        img_h, img_w = image.shape[:2]
        # Unpacking fails with ValueError for a mask that is not 2-D.
        mask_h, mask_w = mask.shape

        if (img_h, img_w) != (mask_h, mask_w):
            raise ValueError(
                f"Dimensions mismatch: Image is {img_h}x{img_w}, Mask is {mask_h}x{mask_w}"
            )

        if patch_size > img_h or patch_size > img_w:
            print(f"Warning: Patch size ({patch_size}) is larger than image dimensions ({img_h}x{img_w}). No patches will be generated for {image_base_name}.", file=sys.stderr)
            return 0 # No patches can be generated

        image_patch_count = 0 # Counter for patches from this specific image
        # The ranges stop early enough for every window to fit entirely.
        for i in range(0, img_h - patch_size + 1, step):
            for j in range(0, img_w - patch_size + 1, step):
                img_patch = image[i : i + patch_size, j : j + patch_size, :]
                mask_patch = mask[i : i + patch_size, j : j + patch_size]

                # Append the mask as a fourth channel behind the RGB channels.
                combined_patch = np.concatenate(
                    (img_patch, np.expand_dims(mask_patch, axis=-1)), axis=-1
                )

                patch_filename = os.path.join(
                    output_split_dir, f"{image_base_name}_{image_patch_count}.npy"
                )
                np.save(patch_filename, combined_patch)
                image_patch_count += 1

        print(f"Successfully saved {image_patch_count} patches for {os.path.basename(input_image_path)}")
        return image_patch_count

    except FileNotFoundError as e:
        print(f"Error: Input file not found: {e}", file=sys.stderr)
        return -1
    except ValueError as e:
        print(f"Data error for {os.path.basename(input_image_path)}: {e}", file=sys.stderr)
        return -1
    except Exception as e:
        print(f"An unexpected error occurred processing {os.path.basename(input_image_path)}: {e}", file=sys.stderr)
        return -1

# --- Function to process all images and masks in specified directories ---
def process_split_for_patching(
    split_name: str,
    base_input_image_dir: str,
    base_input_mask_dir: str,
    base_output_patch_dir: str, # Base directory where output split folders will be created
    patch_size: int = 256,
    step: int = 256
):
    """
    Patch every image/mask pair of one data split into a single output folder.

    Images are read from ``<base_input_image_dir>/<split_name>/images`` (.png
    files) and masks from ``<base_input_mask_dir>/<split_name>/masks`` (.npy
    files with the same base name). All patches go to
    ``<base_output_patch_dir>/<split_name>``. The split is skipped, with a
    message on stderr, when either input folder is missing.

    Args:
        split_name: The name of the data split (e.g., 'test').
        base_input_image_dir: Base directory containing the per-split image folders.
        base_input_mask_dir: Base directory containing the per-split mask folders.
        base_output_patch_dir: Base directory in which the per-split patch folder
                               is created.
        patch_size: Size of the patch.
        step: Step size for sliding window.
    """
    print(f"\n--- Processing split: {split_name} ---")

    input_images_dir = os.path.join(base_input_image_dir, split_name, 'images')
    input_masks_dir = os.path.join(base_input_mask_dir, split_name, 'masks')
    output_split_dir = os.path.join(base_output_patch_dir, split_name)

    # Both input folders must exist; otherwise nothing is created or processed.
    if not os.path.isdir(input_images_dir):
        print(f"Skipping split '{split_name}': Image input directory not found at {input_images_dir}", file=sys.stderr)
        return
    if not os.path.isdir(input_masks_dir):
        print(f"Skipping split '{split_name}': Mask input directory not found at {input_masks_dir}", file=sys.stderr)
        return

    os.makedirs(output_split_dir, exist_ok=True)

    # Collect the regular files that carry a .png suffix (any letter case).
    image_files = []
    for candidate in os.listdir(input_images_dir):
        if os.path.isfile(os.path.join(input_images_dir, candidate)) and candidate.lower().endswith('.png'):
            image_files.append(candidate)
    print(f"Found {len(image_files)} image files in {input_images_dir}")

    total_patches_saved = 0
    processed_files_count = 0
    failed_files_count = 0

    for image_filename in image_files:
        base_name = os.path.splitext(image_filename)[0]
        input_image_path = os.path.join(input_images_dir, image_filename)
        # The mask shares the image's base name but has a .npy extension.
        input_mask_path = os.path.join(input_masks_dir, f"{base_name}.npy")

        if not os.path.exists(input_mask_path):
            print(f"Skipping image {image_filename}: Corresponding mask not found at {input_mask_path}", file=sys.stderr)
            failed_files_count += 1
            continue

        num_patches = split_and_save_patches(
            input_image_path,
            input_mask_path,
            output_split_dir,
            base_name,
            patch_size,
            step
        )

        # A negative result signals an error; zero patches still counts as processed.
        if num_patches < 0:
            failed_files_count += 1
            continue

        total_patches_saved += num_patches
        processed_files_count += 1

    separator = "-" * 50
    print(separator)
    print(f"Finished processing split: {split_name}")
    print(f"Total image/mask pairs processed: {processed_files_count}")
    print(f"Total image/mask pairs failed: {failed_files_count}")
    print(f"Total patches saved across all processed pairs: {total_patches_saved}")
    print(separator)


# --- Main Execution Block (suitable for a notebook cell) ---

# Patch geometry; choose step < patch_size to obtain overlapping patches
patch_size = 256
step = 256

# Splits to patch
splits_to_process = ['train', 'test', 'val']

# Input roots (images under <root>/<split>/images, masks under <root>/<split>/masks)
# and the root that receives one patch folder per split
base_input_image_dir = "./data/DSB2018/visual"
base_input_mask_dir = "./data/DSB2018/visual"
base_output_patch_dir = "./data/DSB2018/data256/hovernet"

for split in splits_to_process:
    process_split_for_patching(
        split,
        base_input_image_dir,
        base_input_mask_dir,
        base_output_patch_dir,
        patch_size=patch_size,
        step=step,
    )

print("\nPatch splitting process complete for all specified splits.")


### 4. Generate the training dataset from the above .npy files

In [None]:
import numpy as np
from PIL import Image
from tqdm import tqdm
import os
import cv2

def process_single_npy(npy_filepath, output_img_dir, output_mask_dir, output_inst_dir):
    """
    Convert a single combined .npy patch into an image PNG, a binary mask PNG,
    and an instance .npy file.

    The input must be a 3-D array with at least 4 channels on the last axis:
    channels 0-2 are the image and channel 3 holds the instance IDs. All three
    outputs share the input file's base name. Files with any other shape are
    skipped with a warning; errors are printed and not raised.

    Args:
        npy_filepath (str): Full path to the input .npy file.
        output_img_dir (str): Directory to save the image PNG.
        output_mask_dir (str): Directory to save the mask PNG.
        output_inst_dir (str): Directory to save the instance .npy file.
                               This should already include the split subdirectory.
    """
    try:
        data = np.load(npy_filepath)

        if data.ndim != 3 or data.shape[-1] < 4:
            print(f"Warning: Skipping file {npy_filepath} - expected 3 dimensions with at least 4 channels, but got {data.shape}")
            return

        # Image channels are cast straight to uint8 (values assumed to be 0-255).
        image_data = data[..., :3].astype(np.uint8)

        # The fourth channel keeps the original instance IDs; it is always 2-D here.
        instance_data = data[..., 3]

        # Binary foreground mask: 1 where an instance is present, 0 elsewhere.
        mask_data = (instance_data > 0).astype(np.uint8)

        image = Image.fromarray(image_data)
        mask = Image.fromarray(mask_data)

        base_name = os.path.splitext(os.path.basename(npy_filepath))[0]

        output_img_path = os.path.join(output_img_dir, f"{base_name}.png")
        output_mask_path_png = os.path.join(output_mask_dir, f"{base_name}.png")
        output_instance_path_npy = os.path.join(output_inst_dir, f"{base_name}.npy")

        image.save(output_img_path)
        mask.save(output_mask_path_png)

        np.save(output_instance_path_npy, instance_data)

    except FileNotFoundError:
        print(f"Error: Input file not found at {npy_filepath}")
    except Exception as e:
        print(f"An error occurred while processing {npy_filepath}: {e}")


if __name__ == "__main__":
    splits = ['train', 'test', 'val']

    # Split-independent roots; the split name is appended inside the loop
    source_base_path = "./data/DSB2018/data256/hovernet"
    out_img_base_dir = "./data/DSB2018/data256/mmseg/images"
    out_mask_base_dir = "./data/DSB2018/data256/mmseg/masks"
    out_inst_base_dir = "./data/DSB2018/data256/mmseg/insts"

    for split in splits:
        source_split_path = os.path.join(source_base_path, split)

        output_img_split_dir = os.path.join(out_img_base_dir, split)
        output_mask_split_dir = os.path.join(out_mask_base_dir, split)
        output_inst_split_dir = os.path.join(out_inst_base_dir, split)

        # Output folders are created up front, also for a split that is skipped below
        for out_dir in (output_img_split_dir, output_mask_split_dir, output_inst_split_dir):
            os.makedirs(out_dir, exist_ok=True)

        if not os.path.exists(source_split_path):
            print(f"Warning: Source directory for split '{split}' not found: {source_split_path}. Skipping.")
            continue

        print(f"Processing split: {split}")

        try:
            split_file_list = os.listdir(source_split_path)
        except Exception as e:
            print(f"Error listing files in {source_split_path}: {e}. Skipping split.")
            continue

        # Keep only the .npy entries of the split folder
        npy_files_in_split = []
        for entry in split_file_list:
            if entry.endswith('.npy'):
                npy_files_in_split.append(entry)

        if len(npy_files_in_split) == 0:
            print(f"No .npy files found in {source_split_path}. Skipping split.")
            continue

        # One progress bar per split
        for filename in tqdm(npy_files_in_split, desc=f"Processing {split}"):
            full_npy_path = os.path.join(source_split_path, filename)
            process_single_npy(
                full_npy_path,
                output_img_split_dir,
                output_mask_split_dir,
                output_inst_split_dir,
            )

    print("Processing complete.")

Processing split: train


Processing train: 100%|██████████| 602/602 [00:09<00:00, 64.46it/s]


Processing split: test


Processing test: 100%|██████████| 89/89 [00:01<00:00, 60.24it/s]


Processing split: val


Processing val: 100%|██████████| 109/109 [00:01<00:00, 65.61it/s]

Processing complete.





### 5. Generate the file lists in .txt format

In [14]:
import os

def generate_file_list(directory, output_txt):
    """
    Generate a text file containing the file names (without extensions)
    of all files in a directory, one name per line, in sorted order.

    Subdirectories are ignored. The folder of ``output_txt`` is created when
    missing. If ``directory`` does not exist, a warning is printed and no file
    is written. Errors are printed and not raised.

    Args:
        directory (str): Path to the directory containing the files.
        output_txt (str): Path to the output text file.
    """
    output_dir = os.path.dirname(output_txt)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        if not os.path.exists(directory):
            print(f"Warning: Source directory not found: {directory}. Skipping.")
            return

        # os.listdir returns entries in arbitrary order; sorting makes the
        # generated list reproducible across runs and file systems.
        file_names = sorted(
            os.path.splitext(entry)[0]
            for entry in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, entry))
        )

        with open(output_txt, 'w', encoding='utf-8') as f:
            f.writelines(f"{name}\n" for name in file_names)

        print(f"File list saved successfully to {output_txt}")

    except Exception as e:
        print(f"An error occurred while processing {directory}: {e}")

if __name__ == "__main__":
    # Root that holds one image folder per split
    base_image_directory = "./data/DSB2018/data256/mmseg/images"

    # Folder that receives one <split>.txt file per split
    base_output_directory = "./data/DSB2018/data256/mmseg"

    # Splits for which a list is written
    splits = ['train', 'test', 'val']

    for split in splits:
        output_file_split = os.path.join(base_output_directory, f"{split}.txt")
        image_directory_split = os.path.join(base_image_directory, split)

        print(f"Generating file list for split: {split}")
        generate_file_list(directory=image_directory_split, output_txt=output_file_split)

    print("All file lists generated.")

Generating file list for split: train
File list saved successfully to ./data/DSB2018/data256/mmseg/train.txt
Generating file list for split: test
File list saved successfully to ./data/DSB2018/data256/mmseg/test.txt
Generating file list for split: val
File list saved successfully to ./data/DSB2018/data256/mmseg/val.txt
All file lists generated.


### 6. Generate four-color encoding .png and adjacency .yaml files from the instance (.npy) files

In [None]:
import numpy as np
from scipy.ndimage import convolve
import yaml
from tqdm import tqdm
import os


def compute_adjacency_map(instance_mask):
    """
    Build the adjacency graph between the labelled instances of a mask.

    An instance B is recorded as a neighbour of instance A when a pixel of B
    lies within the 3x3 (8-connected) neighbourhood of a pixel of A.

    Args:
        instance_mask (np.ndarray): 2-D instance label map of shape (H, W);
            the value 0 is treated as background.

    Returns:
        adjacency_map (dict): Maps each non-background instance ID (int) to
            the set of IDs (int) of its neighbouring instances.
    """
    # 3x3 block of ones: every pixel together with its 8 neighbours
    neighbourhood = np.ones((3, 3), dtype=np.int32)

    labels = [int(value) for value in np.unique(instance_mask) if value != 0]

    adjacency = {}
    for label in labels:
        own_pixels = (instance_mask == label).astype(np.int32)

        # Pixels of the instance plus a one-pixel ring around it
        reach = convolve(own_pixels, neighbourhood, mode='constant', cval=0) > 0

        # Every other non-background label seen inside that region is a neighbour
        seen = np.unique(instance_mask[reach])
        adjacency[label] = {int(other) for other in seen if other != label and other != 0}

    return adjacency

def save_adjacency_map_to_yaml(adjacency_map, file_path):
    """
    Save an adjacency dictionary to a YAML file.

    Keys are written as plain ints in their original order; each neighbour
    collection is written as a sorted list of ints so the file content is
    deterministic. The parent directory of ``file_path`` is created when
    missing.

    Args:
        adjacency_map (dict): Maps an instance ID to an iterable (e.g. a set)
            of neighbouring instance IDs.
        file_path (str): Path of the YAML file to write.
    """
    # Sets are not YAML-serialisable and have no stable order, so convert
    # each one to a sorted list of plain ints.
    adjacency_map_serializable = {
        int(key): sorted(int(neighbor) for neighbor in value)
        for key, value in adjacency_map.items()
    }

    # open() does not create missing folders, so make sure the target exists.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(adjacency_map_serializable, f, default_flow_style=False, sort_keys=False)



stage = "test"
# NOTE(review): section 4 above writes instance files to
# ./data/DSB2018/data256/mmseg/insts/<split>; the path below differs from
# that location - confirm which directory is intended.
inst_path = "./data/DSB2018/mmseg/inst"
save_path = "./data/DSB2018/mmseg/adjacency"

# List the inputs first so a missing input folder fails before anything is created;
# only .npy files are processed, so the progress bar counts real work items.
npy_files = [f for f in os.listdir(f"{inst_path}/{stage}") if f.endswith(".npy")]

# The output folder must exist before the YAML files are written.
os.makedirs(f"{save_path}/{stage}", exist_ok=True)

for f in tqdm(npy_files):
    # splitext removes only the final extension, so names containing dots stay intact.
    f_name = os.path.splitext(f)[0]
    mask = np.load(f"{inst_path}/{stage}/{f}")
    adjacency_map = compute_adjacency_map(mask)
    # Save as a YAML file next to the other results of this stage
    output_file = f"{save_path}/{stage}/{f_name}.yaml"
    save_adjacency_map_to_yaml(adjacency_map, output_file)