read post-processing from hdf5 - var statistics and microscopy data

virtualcell · May 2, 2024 · 4c18146 · 4c18146
1 parent b7e1ccc
commit 4c18146
Show file tree

Hide file tree

Showing 5 changed files with 449 additions and 30 deletions.
diff --git a/pythonData/example.ipynb b/pythonData/example.ipynb
diff --git a/pythonData/poetry.lock b/pythonData/poetry.lock
diff --git a/pythonData/pyproject.toml b/pythonData/pyproject.toml
@@ -14,6 +14,7 @@ numexpr = "^2.9.0"
 zarr = "^2.17.0"
 jupyter = "^1.0.0"
 matplotlib = "^3.8.3"
+h5py = "^3.10.0"
 
 
 [tool.poetry.group.dev.dependencies]

diff --git a/pythonData/tests/vcell_parsing_test.py b/pythonData/tests/vcell_parsing_test.py
@@ -1,9 +1,11 @@
-import numpy as np
-from pathlib import Path
 import tarfile
+from pathlib import Path
+
+import numpy as np
 
-from vcelldata.simdata_models import PdeDataSet, DataFunctions, NamedFunction, VariableType
 from vcelldata.mesh import CartesianMesh
+from vcelldata.postprocessing import ImageMetadata, PostProcessing, VariableInfo, StatisticType
+from vcelldata.simdata_models import PdeDataSet, DataFunctions, NamedFunction, VariableType
 
 test_data_dir = (Path(__file__).parent.parent / "test_data").absolute()
 
@@ -147,3 +149,99 @@ def test_mesh_parse():
 
     mesh = CartesianMesh(mesh_file=test_data_dir / "SimID_946368938_0_.mesh")
     mesh.read()
+
+
+def test_post_processing_parse():
+    extract_simdata()
+
+    post_processing = PostProcessing(postprocessing_hdf5_path=test_data_dir / "SimID_946368938_0_.hdf5")
+    post_processing.read()
+
+    expected_times = [0.0, 0.25, 0.5, 0.75, 1.0]
+    assert np.allclose(post_processing.times, expected_times)
+
+    expected_variables = [
+        VariableInfo(stat_var_name="C_cyt_average", stat_var_unit="uM", stat_channel=0, var_index=0),
+        VariableInfo(stat_var_name="C_cyt_total", stat_var_unit="molecules", stat_channel=1, var_index=0),
+        VariableInfo(stat_var_name="C_cyt_min", stat_var_unit="uM", stat_channel=2, var_index=0),
+        VariableInfo(stat_var_name="C_cyt_max", stat_var_unit="uM", stat_channel=3, var_index=0),
+        VariableInfo(stat_var_name="Ran_cyt_average", stat_var_unit="uM", stat_channel=4, var_index=1),
+        VariableInfo(stat_var_name="Ran_cyt_total", stat_var_unit="molecules", stat_channel=5, var_index=1),
+        VariableInfo(stat_var_name="Ran_cyt_min", stat_var_unit="uM", stat_channel=6, var_index=1),
+        VariableInfo(stat_var_name="Ran_cyt_max", stat_var_unit="uM", stat_channel=7, var_index=1),
+        VariableInfo(stat_var_name="RanC_cyt_average", stat_var_unit="uM", stat_channel=8, var_index=2),
+        VariableInfo(stat_var_name="RanC_cyt_total", stat_var_unit="molecules", stat_channel=9, var_index=2),
+        VariableInfo(stat_var_name="RanC_cyt_min", stat_var_unit="uM", stat_channel=10, var_index=2),
+        VariableInfo(stat_var_name="RanC_cyt_max", stat_var_unit="uM", stat_channel=11, var_index=2),
+        VariableInfo(stat_var_name="RanC_nuc_average", stat_var_unit="uM", stat_channel=12, var_index=3),
+        VariableInfo(stat_var_name="RanC_nuc_total", stat_var_unit="molecules", stat_channel=13, var_index=3),
+        VariableInfo(stat_var_name="RanC_nuc_min", stat_var_unit="uM", stat_channel=14, var_index=3),
+        VariableInfo(stat_var_name="RanC_nuc_max", stat_var_unit="uM", stat_channel=15, var_index=3)
+    ]
+    # compare each field of the VariableInfo objects
+    for i, v in enumerate(post_processing.variables):
+        expected = expected_variables[i]
+        assert v.stat_var_name == expected.stat_var_name
+        assert v.stat_var_unit == expected.stat_var_unit
+        assert v.stat_channel == expected.stat_channel
+        assert v.var_index == expected.var_index
+
+    stats_table_from_vcell_plot = """
+    t	C_cyt_average	C_cyt_total	C_cyt_min	C_cyt_max	Ran_cyt_average	Ran_cyt_total	Ran_cyt_min	Ran_cyt_max	RanC_cyt_average	RanC_cyt_total	RanC_cyt_min	RanC_cyt_max	RanC_nuc_average	RanC_nuc_total	RanC_nuc_min	RanC_nuc_max
+    0.0	0.0	0.0	0.0	2.2250738585072014E-308	0.0	0.0	0.0	2.2250738585072014E-308	0.0	0.0	0.0	2.2250738585072014E-308	4.500000000000469E-4	995.2639514331112	4.5E-4	4.5E-4
+    0.25	1.7242715861760507E-6	15.424394116394046	0.0	1.6578610937269188E-5	1.7242715861760507E-6	15.424394116394046	0.0	1.6578610937269188E-5	1.207791769681895E-5	108.04247089303067	0.0	1.386595389472678E-4	3.941755233129602E-4	871.7970864237174	2.618557008421516E-4	4.4784349464175205E-4
+    0.5	5.558151571952027E-6	49.720195525909354	0.0	3.688810690038264E-5	5.558151571952027E-6	49.720195525909354	0.0	3.688810690038264E-5	1.8059551583630324E-5	161.55090846739807	0.0	1.5070593618817915E-4	3.5447559498153353E-4	783.992847439835	2.1160422746379327E-4	4.325308047580796E-4
+    0.75	1.047028801407123E-5	93.66149169073127	0.0	5.838639163921412E-5	1.047028801407123E-5	93.66149169073127	0.0	5.838639163921412E-5	2.1237498474550513E-5	189.9790898046723	0.0	1.518858395444779E-4	3.217543607511082E-4	711.6233699377424	1.8374881412946774E-4	4.084354763369491E-4
+    1.0	1.5889249586954565E-5	142.1365693245928	0.0	7.973304853048764E-5	1.5889249586954565E-5	142.1365693245928	0.0	7.973304853048764E-5	2.277124484748409E-5	203.69914917373157	0.0	1.4872052622450286E-4	2.9363336670624725E-4	649.4282329348212	1.6516455078631075E-4	3.8171626169331387E-4
+    """
+    # parse table from vcell plot into a np.ndarray
+    stats_lines = stats_table_from_vcell_plot.strip().split("\n")
+    stats_data = []
+    for line in stats_lines[1:]:
+        parts = line.split("\t")
+        stats_data.append([float(v) for v in parts])
+    stats_data = np.array(stats_data)
+
+    # remove first column of times and reshape to look like the post_processing.statistics array
+    stats_data = stats_data[:, 1:]
+    stats_data = stats_data.reshape((5, 4, 4))
+
+    assert np.allclose(post_processing.statistics, stats_data)
+
+    assert post_processing.statistics.shape == (5, 4, 4)
+
+    # compare individual statistics for C_cyt (first variable): average, total, min, max to known values
+    expected_C_cyt_average = np.array([[0.00000000e+00, 1.72427159e-06, 5.55815157e-06, 1.04702880e-05, 1.58892496e-05]])
+    C_cyt_average = post_processing.statistics[:, 0, int(StatisticType.AVERAGE)]
+    assert np.allclose(C_cyt_average, expected_C_cyt_average)
+
+    expected_C_cyt_total = np.array([[[[  0., 15.42439412,  49.72019553,  93.66149169, 142.13656932]]]])
+    C_cyt_total = post_processing.statistics[:, 0, int(StatisticType.TOTAL)]
+    assert np.allclose(C_cyt_total, expected_C_cyt_total)
+
+    expected_C_cyt_min = np.array([[0., 0., 0., 0., 0.]])
+    C_cyt_min = post_processing.statistics[:, 0, int(StatisticType.MIN)]
+    assert np.allclose(C_cyt_min, expected_C_cyt_min)
+
+    expected_C_cyt_max = np.array([[[2.22507386e-308, 1.65786109e-005, 3.68881069e-005, 5.83863916e-005, 7.97330485e-005]]])
+    C_cyt_max = post_processing.statistics[:, 0, int(StatisticType.MAX)]
+    assert np.allclose(C_cyt_max, expected_C_cyt_max)
+
+    fluorescence: ImageMetadata = post_processing.image_metadata[0]
+    assert fluorescence.name == "fluor"
+    assert fluorescence.shape == (71, 71)
+    assert np.allclose(fluorescence.extents, np.array([74.24, 74.24, 26.0]))
+    assert np.allclose(fluorescence.origin, np.array([0.0, 0.0, 0.0]))
+    assert fluorescence.group_path == "/PostProcessing/fluor"
+
+    fluorescence_data_0 = post_processing.read_image_data(image_metadata=fluorescence, time_index=0)
+    assert fluorescence_data_0.shape == (71, 71)
+    assert fluorescence_data_0.dtype == np.float64
+    assert np.min(fluorescence_data_0) == 0.0
+    assert np.max(fluorescence_data_0) == 0.0
+
+    fluorescence_data_4 = post_processing.read_image_data(image_metadata=fluorescence, time_index=4)
+    assert fluorescence_data_4.shape == (71, 71)
+    assert fluorescence_data_4.dtype == np.float64
+    assert np.min(fluorescence_data_4) == 0.0
+    assert np.allclose(np.max(fluorescence_data_4), 0.7147863306841433)
diff --git a/pythonData/vcelldata/postprocessing.py b/pythonData/vcelldata/postprocessing.py
@@ -0,0 +1,155 @@
+from enum import IntEnum
+from pathlib import Path
+
+import numpy as np
+from h5py import Dataset, File as H5File, Group
+
+
+class StatisticType(IntEnum):
+    AVERAGE = 0
+    TOTAL = 1
+    MIN = 2
+    MAX = 3
+
+
+class ImageMetadata:
+    name: str
+    group_path: str
+    extents: np.ndarray
+    origin: np.ndarray
+    shape: tuple[int]
+
+    def __init__(self, name: str, group_path: str):
+        self.name = name
+        self.group_path = group_path
+
+    def get_dataset(self, hdf5File: H5File, time_index: int) -> Dataset:
+        image_group: Group = hdf5File[self.group_path]
+        image_ds: Dataset = image_group[f'time{time_index:06d}']
+        return image_ds
+
+    def read(self, f: H5File):
+        image_group: Group = f[self.group_path]
+
+        # get attributes from the group
+        extents_list = []
+        origin_list = []
+        if "ExtentX" in image_group.attrs:
+            extents_list.append(image_group.attrs['ExtentX'])
+        if "ExtentY" in image_group.attrs:
+            extents_list.append(image_group.attrs['ExtentY'])
+        if "ExtentZ" in image_group.attrs:
+            extents_list.append(image_group.attrs['ExtentZ'])
+        if "OriginX" in image_group.attrs:
+            origin_list.append(image_group.attrs['OriginX'])
+        if "OriginY" in image_group.attrs:
+            origin_list.append(image_group.attrs['OriginY'])
+        if "OriginZ" in image_group.attrs:
+            origin_list.append(image_group.attrs['OriginZ'])
+        self.extents = np.array(extents_list)
+        self.origin = np.array(origin_list)
+        self.shape = self.get_dataset(f, 0).shape
+
+
+class VariableInfo:
+    var_index: int
+    var_name: str  # e.g. "C_cyt"
+    stat_channel: int
+    statistic_type: StatisticType  # e.g. StatisticType.AVERAGE
+    stat_var_name: str  # e.g. "C_cyt_average"
+    stat_var_unit: str      # e.g. "uM"
+
+    def __init__(self, stat_var_name: str, stat_var_unit: str, stat_channel: int, var_index: int):
+        self.stat_var_name = stat_var_name
+        self.stat_var_unit = stat_var_unit
+        self.stat_channel = stat_channel
+        self.var_index = var_index
+        # stat_var_name is in the form of "C_cyt_average" so remove _average to get the variable name
+        stat_type_raw = stat_var_name.split("_")[-1]
+        self.statistic_type = StatisticType[stat_type_raw.upper()]
+        self.var_name = stat_var_name.replace("_"+stat_type_raw, "")
+
+
+class PostProcessing:
+    postprocessing_hdf5_path: Path
+    times: np.ndarray
+    variables: list[VariableInfo]
+    statistics: np.ndarray  # shape (times, vars, stats) where status is average=0, total=1, min=2, max=3
+    image_metadata: list[ImageMetadata]
+
+    def __init__(self, postprocessing_hdf5_path: Path):
+        self.postprocessing_hdf5_path = postprocessing_hdf5_path
+        self.variables = []
+        self.image_metadata = []
+
+
+    def read(self) -> None:
+        # read the file as hdf5
+        with H5File(self.postprocessing_hdf5_path, 'r') as f:
+            # read dataset with path /PostProcessing/Times
+            times_ds: Dataset = f['/PostProcessing/Times']
+            # read array from times dataset into a ndarray
+            self.times = times_ds[()]
+
+            # read attributes from group /PostProcessing/VariableStatistics
+            # data is flat, so we can read the attributes directly, so name and units for each channel are separate
+            #
+            # key=comp_0_name, value=b'C_cyt_average'
+            # key=comp_0_unit, value=b'uM'
+            # key=comp_1_name, value=b'C_cyt_total'
+            # key=comp_1_unit, value=b'molecules'
+            # key=comp_2_name, value=b'C_cyt_min'
+            # key=comp_2_unit, value=b'uM'
+            # key=comp_3_name, value=b'C_cyt_max'
+            # key=comp_3_unit, value=b'uM'
+            #
+            var_stats_grp = f['/PostProcessing/VariableStatistics']
+            # gather stat_var_name and stat_var_unit for each channel into dictionaries by channel
+            var_name_by_channel: dict[int, str] = {}
+            var_unit_by_channel: dict[int, str] = {}
+            for k, v in var_stats_grp.attrs.items():
+                parts = k.split('_')
+                channel = int(parts[1])
+                value = v.decode('utf-8')
+                if parts[2] == "name":
+                    var_name_by_channel[channel] = value
+                elif parts[2] == "unit":
+                    var_unit_by_channel[channel] = value
+            # combine into a single list of VariableInfo objects, one for each channel
+            self.variables = [VariableInfo(stat_var_name=var_name_by_channel[i],
+                                           stat_var_unit=var_unit_by_channel[i],
+                                           stat_channel=i,
+                                           var_index=i//4)
+                              for i in range(len(var_name_by_channel))]
+
+            # within /PostProcessing/VariableStatistics, there are datasets for each time point
+            # PostProcessing/VariableStatistics
+            # PostProcessing/VariableStatistics/time000000
+            # PostProcessing/VariableStatistics/time000001
+            # PostProcessing/VariableStatistics/time000002
+            # PostProcessing/VariableStatistics/time000003
+            # PostProcessing/VariableStatistics/time000004
+
+            # we can read the data for each time point into a list of ndarrays
+            statistics_raw: np.ndarray = np.zeros((len(self.times), len(self.variables)))
+            for time_index in range(len(self.times)):
+                time_ds: Dataset = var_stats_grp[f'time{time_index:06d}']
+                statistics_raw[time_index, :] = time_ds[()]
+
+            # reshape the statistics_raw into a 3D array (times, vars, stats)
+            self.statistics = statistics_raw.reshape((len(self.times), len(self.variables)//4, 4))
+
+            # get list of child groups from /PostProcessing which are not Times or VariableStatistics
+            # e.g. /PostProcessing/fluor
+            image_groups = [k for k in f['/PostProcessing'].keys() if k not in ['Times', 'VariableStatistics']]
+
+            # for each image group, read the metadata to allow reading later
+            for image_group in image_groups:
+                metadata = ImageMetadata(group_path=f"/PostProcessing/{image_group}", name=image_group)
+                metadata.read(f)
+                self.image_metadata.append(metadata)
+
+    def read_image_data(self, image_metadata: ImageMetadata, time_index: int) -> np.ndarray:
+        with H5File(self.postprocessing_hdf5_path, 'r') as f:
+            image_ds: Dataset = image_metadata.get_dataset(hdf5File=f, time_index=time_index)
+            return image_ds[()]