In [88]:
from bioio import BioImage
from bioio_ome_zarr.writers import Channel, OMEZarrWriter
import dask.array as da
from zarr.codecs import BloscCodec
import numpy as np
import re
from typing import Dict, List, Union
from collections import defaultdict
from Group_Files import group_ome_tiff_by_region, print_group_summary,extract_channel_marker_info,print_channel_marker_info
import tempfile
import os
from ome_types import from_tiff, to_xml, from_xml
from ome_types.model import Channel
from ome_types.model import Image
from ome_types.model import OME
from ome_types.model import Pixels
from ome_types.model import TiffData
import uuid

## Testing file grouping

In [None]:
"""
Function to group OME-TIFF image files by region identifier.

This module provides functionality to parse and group image files following
the naming convention:
prefix_mmddyyyy_S#_[1-15].0.4_R###_(channel)_(marker)_FINAL_suffix.ome.tif
"""

def group_ome_tiff_by_region(
    directory: Union[str, Path],
    return_type: str = "dict"
) -> Union[Dict[str, List[str]], List[List[str]]]:
    """
    Group OME-TIFF files by their region identifier (R###).
    
    This function searches for files matching the specified naming convention
    and groups them by their region identifier. Only files containing 'FINAL'
    in their name are included.
    
    Parameters
    ----------
    directory : str or Path
        Path to the directory containing the OME-TIFF files
    return_type : str, optional
        Format of the return value. Either "dict" (default) or "list"
        - "dict": Returns a dictionary with region IDs as keys
        - "list": Returns a list of lists, one per region
    
    Returns
    -------
    dict or list
        If return_type="dict": Dictionary with region IDs (e.g., "R001") as keys
            and lists of file paths as values
        If return_type="list": List of lists, where each inner list contains
            file paths for one region
    
    Examples
    --------
    >>> # Get groups as dictionary
    >>> groups = group_ome_tiff_by_region("/path/to/images", return_type="dict")
    >>> print(groups.keys())
    dict_keys(['R000', 'R001', 'R002', ...])
    
    >>> # Get groups as list of lists
    >>> groups = group_ome_tiff_by_region("/path/to/images", return_type="list")
    >>> print(f"Found {len(groups)} regions")
    Found 8 regions
    
    Notes
    -----
    The regex pattern matches files with this structure:
    - prefix: any characters (typically initials)
    - date: mmddyyyy format
    - sample: S followed by 1-2 digits (S1-S15)
    - round: integer 1-15, followed by .0.4
    - region: R followed by 3 digits (R000-R999)
    - channel: DAPI, Cy3, Cy5, FITC, or Cy7
    - marker: alphanumeric name
    - must contain 'FINAL'
    - suffix: AFR_F or _F
    - extension: .ome.tif
    """
    
    # Convert to Path object for easier handling
    directory = Path(directory)
    
    if not directory.exists():
        raise ValueError(f"Directory does not exist: {directory}")
    
    if not directory.is_dir():
        raise ValueError(f"Path is not a directory: {directory}")
    
    # Regex pattern to match the file naming convention
    # Pattern breakdown:
    # ^(.+?)_ : prefix (non-greedy) followed by underscore
    # (\d{8})_ : date in mmddyyyy format
    # (S\d{1,2})_ : sample ID (S1 to S15)
    # (\d{1,2}\.0\.\d+)_ : round number (1-15.0.4)
    # (R\d{3})_ : region identifier (R###)
    # (DAPI|Cy3|Cy5|FITC|Cy7)_ : channel name
    # ([A-Za-z0-9_]+)_ : marker name
    # .*FINAL.* : must contain FINAL
    # (AFR_F|_F) : suffix
    # \.ome\.tif$ : file extension
    
    pattern = re.compile(
        r'^(.+?)_'           # prefix
        r'(\d{8})_'          # date (mmddyyyy)
        r'(S\d{1,2})_'       # sample ID (S1-S15)
        r'(\d{1,2}\.\d+\.\d+)_'  # round (e.g., 1.0.1, 1.0.4, 15.0.4)
        r'(R\d{3})_'         # region (R###)
        r'(DAPI|Cy3|Cy5|FITC|Cy7|FITC)_'  # channel name
        r'(.+?)_'            # marker name (non-greedy)
        r'.*FINAL.*'         # must contain FINAL
        r'(AFR_F|_F)'        # suffix
        r'\.ome\.tif$',      # extension
        re.IGNORECASE
    )
    
    # Dictionary to store grouped files
    region_groups = defaultdict(list)
    
    # Iterate through all files in the directory
    for file_path in directory.iterdir():
        # Skip if not a file
        if not file_path.is_file():
            continue
        
        filename = file_path.name
        
        # Check if filename matches the pattern
        match = pattern.match(filename)
        
        if match:
            # Extract the region identifier (5th capture group)
            region_id = match.group(5)
            
            # Add the full file path to the corresponding region group
            region_groups[region_id].append(str(file_path))
    
    # Sort files within each group for consistent ordering
    for region_id in region_groups:
        region_groups[region_id].sort()
    
    # Return based on requested format
    if return_type == "dict":
        # Return as regular dict (sorted by region ID)
        return dict(sorted(region_groups.items()))
    elif return_type == "list":
        # Return as list of lists (sorted by region ID)
        return [region_groups[region_id] for region_id in sorted(region_groups.keys())]
    else:
        raise ValueError(f"Invalid return_type: {return_type}. Must be 'dict' or 'list'")


def print_group_summary(groups: Union[Dict[str, List[str]], List[List[str]]]) -> None:
    """
    Print a summary of the grouped files.
    
    Parameters
    ----------
    groups : dict or list
        The output from group_ome_tiff_by_region()
    """
    if isinstance(groups, dict):
        print(f"Found {len(groups)} region groups:")
        for region_id, files in groups.items():
            print(f"\n{region_id}: {len(files)} files")
            for file_path in files:
                print(f"  - {Path(file_path).name}")
    elif isinstance(groups, list):
        print(f"Found {len(groups)} region groups:")
        for i, files in enumerate(groups):
            print(f"\nGroup {i+1}: {len(files)} files")
            for file_path in files:
                print(f"  - {Path(file_path).name}")


# Example usage
if __name__ == "__main__":
    import sys
    
    # Example 1: Get groups as dictionary
    if len(sys.argv) > 1:
        directory = sys.argv[1]
    else:
        directory = "/mnt/user-data/uploads"  # Default to uploaded files
    
    print(f"Searching for OME-TIFF files in: {directory}\n")
    
    try:
        # Get groups as dictionary
        groups_dict = group_ome_tiff_by_region(directory, return_type="dict")
        print_group_summary(groups_dict)
        
        print("\n" + "="*60)
        print("\nExample: Accessing files from a specific region:")
        if groups_dict:
            first_region = list(groups_dict.keys())[0]
            print(f"\nFiles in region {first_region}:")
            for file_path in groups_dict[first_region]:
                print(f"  {Path(file_path).name}")
        
        print("\n" + "="*60)
        print("\nAlternative: Get groups as list of lists:")
        groups_list = group_ome_tiff_by_region(directory, return_type="list")
        print(f"Total number of region groups: {len(groups_list)}")
        
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

In [None]:
"""
Test script to demonstrate the grouping function with example filenames
from the uploaded image.
"""

# Create example filenames based on the uploaded image
example_filenames = [
    "KK_10082025_S2_1.0.1_R000_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R000_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R001_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R001_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R002_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R002_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R003_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R003_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R004_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R004_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R005_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R005_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R006_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R006_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.1_R007_20X_VHE_F.tif",
    "KK_10082025_S2_1.0.1_R007_DAPI_AF_F.ome.tif",
    "KK_10082025_S2_1.0.4_R000_Cy3_iba1_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R000_Cy5_Neun_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R000_DAPI_FINAL_F.ome.tif",
    "KK_10082025_S2_1.0.4_R000_FITC_GFAP_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R001_Cy3_iba1_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R001_Cy5_Neun_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R001_DAPI_FINAL_F.ome.tif",
    "KK_10082025_S2_1.0.4_R001_FITC_GFAP_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R002_Cy3_iba1_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R002_Cy5_Neun_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R002_DAPI_FINAL_F.ome.tif",
    "KK_10082025_S2_1.0.4_R002_FITC_GFAP_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R003_Cy3_iba1_FINAL_AFR_F.ome.tif",
    "KK_10082025_S2_1.0.4_R003_Cy5_Neun_FINAL_AFR_F.ome.tif",
]

# Create a temporary directory and create empty files
with tempfile.TemporaryDirectory() as tmpdir:
    print(f"Creating test files in: {tmpdir}\n")
    
    # Create the test files
    for filename in example_filenames:
        filepath = Path(tmpdir) / filename
        filepath.touch()
    
    print(f"Created {len(example_filenames)} test files")
    print("="*70)
    
    # Test the grouping function with dictionary return type
    print("\n### TEST 1: Dictionary Return Type ###\n")
    groups_dict = group_ome_tiff_by_region(tmpdir, return_type="dict")
    print_group_summary(groups_dict)
    
    # Show statistics
    print("\n" + "="*70)
    print("\n### Statistics ###")
    print(f"Total regions found: {len(groups_dict)}")
    for region_id, files in groups_dict.items():
        print(f"  {region_id}: {len(files)} files")
    
    # Test with list return type
    print("\n" + "="*70)
    print("\n### TEST 2: List Return Type ###\n")
    groups_list = group_ome_tiff_by_region(tmpdir, return_type="list")
    print(f"Number of region groups: {len(groups_list)}")
    for i, group in enumerate(groups_list):
        print(f"\nGroup {i+1}: {len(group)} files")
        for filepath in group[:3]:  # Show first 3 files
            print(f"  - {Path(filepath).name}")
        if len(group) > 3:
            print(f"  ... and {len(group) - 3} more files")
    
    # Example: Process each group
    print("\n" + "="*70)
    print("\n### Example: Processing Each Group ###\n")
    for region_id, file_list in groups_dict.items():
        print(f"Processing region {region_id}:")
        print(f"  - Found {len(file_list)} files")
        
        # Example: Extract channel information
        channels = set()
        for filepath in file_list:
            filename = Path(filepath).name
            if "DAPI" in filename:
                channels.add("DAPI")
            elif "Cy3" in filename:
                channels.add("Cy3")
            elif "Cy5" in filename:
                channels.add("Cy5")
            elif "FITC" in filename:
                channels.add("FITC")
            elif "Cy7" in filename:
                channels.add("Cy7")
        
        print(f"  - Channels present: {sorted(channels)}")
        print()

## Testing reading in groups and merging into one array

In [2]:
test_dir = r'E:\Cores\CellDIVE_ImageMerging\Testing_CellDVIE_Input\Trial2_10102025'
groups_dict = group_ome_tiff_by_region(test_dir)
print_group_summary(groups_dict)
channel_dict = extract_channel_marker_info(test_dir)
print_channel_marker_info(channel_dict)

Found 1 region groups:

R000: 6 files
  - CG_10092025_S1_1.0.4_R000_Cy3_sLeX-AF594_FINAL_AFR_F.ome.tif
  - CG_10092025_S1_1.0.4_R000_Cy5_ECad-AF647_FINAL_AFR_F.ome.tif
  - CG_10092025_S1_1.0.4_R000_DAPI__FINAL_F.ome.tif
  - CG_10092025_S1_1.0.4_R000_FITC_VVL-488_FINAL_AFR_F.ome.tif
  - CG_10092025_S1_2.0.4_R000_Cy5_VIM-AF647_FINAL_AFR_F.ome.tif
  - CG_10092025_S1_2.0.4_R000_FITC_AAL-FITC_FINAL_AFR_F.ome.tif
Found channel/marker info for 1 regions:

R000: 6 channels
  - 1.0.4_Cy3_sLeX-AF594
  - 1.0.4_Cy5_ECad-AF647
  - 1.0.4_DAPI
  - 1.0.4_FITC_VVL-488
  - 2.0.4_Cy5_VIM-AF647
  - 2.0.4_FITC_AAL-FITC


In [None]:
#loop set up for reading through multiple regions
for region_id, files in groups_dict.items():

In [None]:
print(groups_dict)

In [34]:
files = groups_dict['R000']
channels = channel_dict['R000']

In [35]:
channels_dic = [Channel(label=f"{i}",color="FF0000") for i in channels]

In [None]:
test_file = files[0]
test_img = BioImage(test_file)

In [93]:
ome_obj = from_tiff(r"E:\Cores\CellDIVE_ImageMerging\Testing_CellDVIE_Input\Trial2_10102025\CG_10092025_S1_1.0.4_R000_Cy3_sLeX-AF594_FINAL_AFR_F.ome.tif")

XMLSyntaxError: xmlns:schemaLocation: 'http://www.openmicroscopy.org/Schemas/OME/2016-06 http://www.openmicroscopy.org/Schemas/OME/2016-06/ome.xsd' is not a valid URI, line 2, column 136 (<string>, line 2)

In [92]:
ome_obj.images

[Image(
    id='Image:0',
    pixels={'channels': [{'id': 'Channel:0', 'name': 'DAPI', 'color': Color('red', rgb=(255, 0, 0))}, {'id': 'Channel:1', 'name': 'GFAP', 'color': Color('lime', rgb=(0, 255, 0))}], 'id': 'Pixels:0', 'dimension_order': <Pixels_DimensionOrder.XYZCT: 'XYZCT'>, 'type': <PixelType.UINT16: 'uint16'>, 'size_x': 62826, 'size_y': 47973, 'size_z': 1, 'size_c': 2, 'size_t': 1, 'physical_size_x': 0.3, 'physical_size_y': 0.3},
 )]

In [38]:
test_channel = [Channel(label=channels[0],color="FF0000")]

In [71]:
test_two_files = files[0:2]
test_two_channels = channels[0:2]
test_two_channels_dic = [Channel(label=f"{i}",color="FF0000") for i in test_two_channels]

In [59]:
test_two_images = list(map(BioImage,test_two_files))

In [66]:
test_two_images_data = [img.get_image_data('YX') for img in test_two_images]

In [67]:
test_two_images_data_stack = np.stack(test_two_images_data, axis=0)
test_two_images_data_stack.shape

(2, 47973, 62826)

In [81]:
pixel_size = [0,test_img.physical_pixel_sizes.Y,test_img.physical_pixel_sizes.X]
store_path = r"E:\Cores\CellDIVE_ImageMerging\Testing_CellDIVE_Output"

In [62]:
test_image_data = test_img.get_image_data('YX')

In [74]:
test_two_images_data_stack.dtype

dtype('uint16')

In [72]:
level_shapes = [
    (2,test_image_data.shape[0],test_image_data.shape[1]),
    (2,int(test_image_data.shape[0]/2),int(test_image_data.shape[1]/2)),
    (2,int(test_image_data.shape[0]/4),int(test_image_data.shape[1]/4)),
    (2,int(test_image_data.shape[0]/8),int(test_image_data.shape[1]/8)),
    (2,int(test_image_data.shape[0]/12),int(test_image_data.shape[1]/12)),
]

In [73]:
print(level_shapes)

[(2, 47973, 62826), (2, 23986, 31413), (2, 11993, 15706), (2, 5996, 7853), (2, 3997, 5235)]


In [82]:
writer = OMEZarrWriter(
    store = os.path.join(store_path,"testing_pyramid_two_channels.zarr"),
    level_shapes=level_shapes,
    dtype=test_two_images_data_stack.dtype,
    zarr_format=2,
    channels=test_two_channels_dic,
    axes_names=["c","y","x"],
    axes_types=["channel","space","space"],
    axes_units=[None,"micrometer","micrometer"],
    physical_pixel_size=pixel_size #if extra dimensions like time, channels, etc are added in, they need to be included in the list
)

In [84]:
writer.preview_metadata()

{'multiscales': [{'axes': [{'name': 'c', 'type': 'channel'},
    {'name': 'y', 'type': 'space', 'unit': 'micrometer'},
    {'name': 'x', 'type': 'space', 'unit': 'micrometer'}],
   'datasets': [{'path': '0',
     'coordinateTransformations': [{'type': 'scale',
       'scale': [0.0, 0.325002437518281, 0.325002437518281]}]},
    {'path': '1',
     'coordinateTransformations': [{'type': 'scale',
       'scale': [0.0, 0.6500184247087675, 0.650004875036562]}]},
    {'path': '2',
     'coordinateTransformations': [{'type': 'scale',
       'scale': [0.0, 1.300036849417535, 1.300051135841304]}]},
    {'path': '3',
     'coordinateTransformations': [{'type': 'scale',
       'scale': [0.0, 2.6002905161882075, 2.600102271682608]}]},
    {'path': '4',
     'coordinateTransformations': [{'type': 'scale',
       'scale': [0.0, 3.9007610545570413, 3.9004017458497655]}]}],
   'name': 'Image',
   'version': '0.4'}],
 'omero': {'id': 1,
  'name': 'Image',
  'version': '0.4',
  'channels': [{'color': 'FF

In [83]:
writer.write_full_volume(test_two_images_data_stack)

  ops.append(da.to_zarr(src, self.datasets[level_index], compute=False))
