Skip to content

Commit

Permalink
read post-processing from hdf5 - var statistics and microscopy data
Browse files Browse the repository at this point in the history
  • Loading branch information
jcschaff committed May 2, 2024
1 parent b7e1ccc commit 4c18146
Show file tree
Hide file tree
Showing 5 changed files with 449 additions and 30 deletions.
180 changes: 154 additions & 26 deletions pythonData/example.ipynb

Large diffs are not rendered by default.

39 changes: 38 additions & 1 deletion pythonData/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pythonData/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ numexpr = "^2.9.0"
zarr = "^2.17.0"
jupyter = "^1.0.0"
matplotlib = "^3.8.3"
h5py = "^3.10.0"


[tool.poetry.group.dev.dependencies]
Expand Down
104 changes: 101 additions & 3 deletions pythonData/tests/vcell_parsing_test.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import numpy as np
from pathlib import Path
import tarfile
from pathlib import Path

import numpy as np

from vcelldata.simdata_models import PdeDataSet, DataFunctions, NamedFunction, VariableType
from vcelldata.mesh import CartesianMesh
from vcelldata.postprocessing import ImageMetadata, PostProcessing, VariableInfo, StatisticType
from vcelldata.simdata_models import PdeDataSet, DataFunctions, NamedFunction, VariableType

test_data_dir = (Path(__file__).parent.parent / "test_data").absolute()

Expand Down Expand Up @@ -147,3 +149,99 @@ def test_mesh_parse():

mesh = CartesianMesh(mesh_file=test_data_dir / "SimID_946368938_0_.mesh")
mesh.read()


def test_post_processing_parse():
extract_simdata()

post_processing = PostProcessing(postprocessing_hdf5_path=test_data_dir / "SimID_946368938_0_.hdf5")
post_processing.read()

expected_times = [0.0, 0.25, 0.5, 0.75, 1.0]
assert np.allclose(post_processing.times, expected_times)

expected_variables = [
VariableInfo(stat_var_name="C_cyt_average", stat_var_unit="uM", stat_channel=0, var_index=0),
VariableInfo(stat_var_name="C_cyt_total", stat_var_unit="molecules", stat_channel=1, var_index=0),
VariableInfo(stat_var_name="C_cyt_min", stat_var_unit="uM", stat_channel=2, var_index=0),
VariableInfo(stat_var_name="C_cyt_max", stat_var_unit="uM", stat_channel=3, var_index=0),
VariableInfo(stat_var_name="Ran_cyt_average", stat_var_unit="uM", stat_channel=4, var_index=1),
VariableInfo(stat_var_name="Ran_cyt_total", stat_var_unit="molecules", stat_channel=5, var_index=1),
VariableInfo(stat_var_name="Ran_cyt_min", stat_var_unit="uM", stat_channel=6, var_index=1),
VariableInfo(stat_var_name="Ran_cyt_max", stat_var_unit="uM", stat_channel=7, var_index=1),
VariableInfo(stat_var_name="RanC_cyt_average", stat_var_unit="uM", stat_channel=8, var_index=2),
VariableInfo(stat_var_name="RanC_cyt_total", stat_var_unit="molecules", stat_channel=9, var_index=2),
VariableInfo(stat_var_name="RanC_cyt_min", stat_var_unit="uM", stat_channel=10, var_index=2),
VariableInfo(stat_var_name="RanC_cyt_max", stat_var_unit="uM", stat_channel=11, var_index=2),
VariableInfo(stat_var_name="RanC_nuc_average", stat_var_unit="uM", stat_channel=12, var_index=3),
VariableInfo(stat_var_name="RanC_nuc_total", stat_var_unit="molecules", stat_channel=13, var_index=3),
VariableInfo(stat_var_name="RanC_nuc_min", stat_var_unit="uM", stat_channel=14, var_index=3),
VariableInfo(stat_var_name="RanC_nuc_max", stat_var_unit="uM", stat_channel=15, var_index=3)
]
# compare each field of the VariableInfo objects
for i, v in enumerate(post_processing.variables):
expected = expected_variables[i]
assert v.stat_var_name == expected.stat_var_name
assert v.stat_var_unit == expected.stat_var_unit
assert v.stat_channel == expected.stat_channel
assert v.var_index == expected.var_index

stats_table_from_vcell_plot = """
t C_cyt_average C_cyt_total C_cyt_min C_cyt_max Ran_cyt_average Ran_cyt_total Ran_cyt_min Ran_cyt_max RanC_cyt_average RanC_cyt_total RanC_cyt_min RanC_cyt_max RanC_nuc_average RanC_nuc_total RanC_nuc_min RanC_nuc_max
0.0 0.0 0.0 0.0 2.2250738585072014E-308 0.0 0.0 0.0 2.2250738585072014E-308 0.0 0.0 0.0 2.2250738585072014E-308 4.500000000000469E-4 995.2639514331112 4.5E-4 4.5E-4
0.25 1.7242715861760507E-6 15.424394116394046 0.0 1.6578610937269188E-5 1.7242715861760507E-6 15.424394116394046 0.0 1.6578610937269188E-5 1.207791769681895E-5 108.04247089303067 0.0 1.386595389472678E-4 3.941755233129602E-4 871.7970864237174 2.618557008421516E-4 4.4784349464175205E-4
0.5 5.558151571952027E-6 49.720195525909354 0.0 3.688810690038264E-5 5.558151571952027E-6 49.720195525909354 0.0 3.688810690038264E-5 1.8059551583630324E-5 161.55090846739807 0.0 1.5070593618817915E-4 3.5447559498153353E-4 783.992847439835 2.1160422746379327E-4 4.325308047580796E-4
0.75 1.047028801407123E-5 93.66149169073127 0.0 5.838639163921412E-5 1.047028801407123E-5 93.66149169073127 0.0 5.838639163921412E-5 2.1237498474550513E-5 189.9790898046723 0.0 1.518858395444779E-4 3.217543607511082E-4 711.6233699377424 1.8374881412946774E-4 4.084354763369491E-4
1.0 1.5889249586954565E-5 142.1365693245928 0.0 7.973304853048764E-5 1.5889249586954565E-5 142.1365693245928 0.0 7.973304853048764E-5 2.277124484748409E-5 203.69914917373157 0.0 1.4872052622450286E-4 2.9363336670624725E-4 649.4282329348212 1.6516455078631075E-4 3.8171626169331387E-4
"""
# parse table from vcell plot into a np.ndarray
stats_lines = stats_table_from_vcell_plot.strip().split("\n")
stats_data = []
for line in stats_lines[1:]:
parts = line.split("\t")
stats_data.append([float(v) for v in parts])
stats_data = np.array(stats_data)

# remove first column of times and reshape to look like the post_processing.statistics array
stats_data = stats_data[:, 1:]
stats_data = stats_data.reshape((5, 4, 4))

assert np.allclose(post_processing.statistics, stats_data)

assert post_processing.statistics.shape == (5, 4, 4)

# compare individual statistics for C_cyt (first variable): average, total, min, max to known values
expected_C_cyt_average = np.array([[0.00000000e+00, 1.72427159e-06, 5.55815157e-06, 1.04702880e-05, 1.58892496e-05]])
C_cyt_average = post_processing.statistics[:, 0, int(StatisticType.AVERAGE)]
assert np.allclose(C_cyt_average, expected_C_cyt_average)

expected_C_cyt_total = np.array([[[[ 0., 15.42439412, 49.72019553, 93.66149169, 142.13656932]]]])
C_cyt_total = post_processing.statistics[:, 0, int(StatisticType.TOTAL)]
assert np.allclose(C_cyt_total, expected_C_cyt_total)

expected_C_cyt_min = np.array([[0., 0., 0., 0., 0.]])
C_cyt_min = post_processing.statistics[:, 0, int(StatisticType.MIN)]
assert np.allclose(C_cyt_min, expected_C_cyt_min)

expected_C_cyt_max = np.array([[[2.22507386e-308, 1.65786109e-005, 3.68881069e-005, 5.83863916e-005, 7.97330485e-005]]])
C_cyt_max = post_processing.statistics[:, 0, int(StatisticType.MAX)]
assert np.allclose(C_cyt_max, expected_C_cyt_max)

fluorescence: ImageMetadata = post_processing.image_metadata[0]
assert fluorescence.name == "fluor"
assert fluorescence.shape == (71, 71)
assert np.allclose(fluorescence.extents, np.array([74.24, 74.24, 26.0]))
assert np.allclose(fluorescence.origin, np.array([0.0, 0.0, 0.0]))
assert fluorescence.group_path == "/PostProcessing/fluor"

fluorescence_data_0 = post_processing.read_image_data(image_metadata=fluorescence, time_index=0)
assert fluorescence_data_0.shape == (71, 71)
assert fluorescence_data_0.dtype == np.float64
assert np.min(fluorescence_data_0) == 0.0
assert np.max(fluorescence_data_0) == 0.0

fluorescence_data_4 = post_processing.read_image_data(image_metadata=fluorescence, time_index=4)
assert fluorescence_data_4.shape == (71, 71)
assert fluorescence_data_4.dtype == np.float64
assert np.min(fluorescence_data_4) == 0.0
assert np.allclose(np.max(fluorescence_data_4), 0.7147863306841433)
155 changes: 155 additions & 0 deletions pythonData/vcelldata/postprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
from enum import IntEnum
from pathlib import Path

import numpy as np
from h5py import Dataset, File as H5File, Group


class StatisticType(IntEnum):
AVERAGE = 0
TOTAL = 1
MIN = 2
MAX = 3


class ImageMetadata:
name: str
group_path: str
extents: np.ndarray
origin: np.ndarray
shape: tuple[int]

def __init__(self, name: str, group_path: str):
self.name = name
self.group_path = group_path

def get_dataset(self, hdf5File: H5File, time_index: int) -> Dataset:
image_group: Group = hdf5File[self.group_path]
image_ds: Dataset = image_group[f'time{time_index:06d}']
return image_ds

def read(self, f: H5File):
image_group: Group = f[self.group_path]

# get attributes from the group
extents_list = []
origin_list = []
if "ExtentX" in image_group.attrs:
extents_list.append(image_group.attrs['ExtentX'])
if "ExtentY" in image_group.attrs:
extents_list.append(image_group.attrs['ExtentY'])
if "ExtentZ" in image_group.attrs:
extents_list.append(image_group.attrs['ExtentZ'])
if "OriginX" in image_group.attrs:
origin_list.append(image_group.attrs['OriginX'])
if "OriginY" in image_group.attrs:
origin_list.append(image_group.attrs['OriginY'])
if "OriginZ" in image_group.attrs:
origin_list.append(image_group.attrs['OriginZ'])
self.extents = np.array(extents_list)
self.origin = np.array(origin_list)
self.shape = self.get_dataset(f, 0).shape


class VariableInfo:
var_index: int
var_name: str # e.g. "C_cyt"
stat_channel: int
statistic_type: StatisticType # e.g. StatisticType.AVERAGE
stat_var_name: str # e.g. "C_cyt_average"
stat_var_unit: str # e.g. "uM"

def __init__(self, stat_var_name: str, stat_var_unit: str, stat_channel: int, var_index: int):
self.stat_var_name = stat_var_name
self.stat_var_unit = stat_var_unit
self.stat_channel = stat_channel
self.var_index = var_index
# stat_var_name is in the form of "C_cyt_average" so remove _average to get the variable name
stat_type_raw = stat_var_name.split("_")[-1]
self.statistic_type = StatisticType[stat_type_raw.upper()]
self.var_name = stat_var_name.replace("_"+stat_type_raw, "")


class PostProcessing:
postprocessing_hdf5_path: Path
times: np.ndarray
variables: list[VariableInfo]
statistics: np.ndarray # shape (times, vars, stats) where status is average=0, total=1, min=2, max=3
image_metadata: list[ImageMetadata]

def __init__(self, postprocessing_hdf5_path: Path):
self.postprocessing_hdf5_path = postprocessing_hdf5_path
self.variables = []
self.image_metadata = []


def read(self) -> None:
# read the file as hdf5
with H5File(self.postprocessing_hdf5_path, 'r') as f:
# read dataset with path /PostProcessing/Times
times_ds: Dataset = f['/PostProcessing/Times']
# read array from times dataset into a ndarray
self.times = times_ds[()]

# read attributes from group /PostProcessing/VariableStatistics
# data is flat, so we can read the attributes directly, so name and units for each channel are separate
#
# key=comp_0_name, value=b'C_cyt_average'
# key=comp_0_unit, value=b'uM'
# key=comp_1_name, value=b'C_cyt_total'
# key=comp_1_unit, value=b'molecules'
# key=comp_2_name, value=b'C_cyt_min'
# key=comp_2_unit, value=b'uM'
# key=comp_3_name, value=b'C_cyt_max'
# key=comp_3_unit, value=b'uM'
#
var_stats_grp = f['/PostProcessing/VariableStatistics']
# gather stat_var_name and stat_var_unit for each channel into dictionaries by channel
var_name_by_channel: dict[int, str] = {}
var_unit_by_channel: dict[int, str] = {}
for k, v in var_stats_grp.attrs.items():
parts = k.split('_')
channel = int(parts[1])
value = v.decode('utf-8')
if parts[2] == "name":
var_name_by_channel[channel] = value
elif parts[2] == "unit":
var_unit_by_channel[channel] = value
# combine into a single list of VariableInfo objects, one for each channel
self.variables = [VariableInfo(stat_var_name=var_name_by_channel[i],
stat_var_unit=var_unit_by_channel[i],
stat_channel=i,
var_index=i//4)
for i in range(len(var_name_by_channel))]

# within /PostProcessing/VariableStatistics, there are datasets for each time point
# PostProcessing/VariableStatistics
# PostProcessing/VariableStatistics/time000000
# PostProcessing/VariableStatistics/time000001
# PostProcessing/VariableStatistics/time000002
# PostProcessing/VariableStatistics/time000003
# PostProcessing/VariableStatistics/time000004

# we can read the data for each time point into a list of ndarrays
statistics_raw: np.ndarray = np.zeros((len(self.times), len(self.variables)))
for time_index in range(len(self.times)):
time_ds: Dataset = var_stats_grp[f'time{time_index:06d}']
statistics_raw[time_index, :] = time_ds[()]

# reshape the statistics_raw into a 3D array (times, vars, stats)
self.statistics = statistics_raw.reshape((len(self.times), len(self.variables)//4, 4))

# get list of child groups from /PostProcessing which are not Times or VariableStatistics
# e.g. /PostProcessing/fluor
image_groups = [k for k in f['/PostProcessing'].keys() if k not in ['Times', 'VariableStatistics']]

# for each image group, read the metadata to allow reading later
for image_group in image_groups:
metadata = ImageMetadata(group_path=f"/PostProcessing/{image_group}", name=image_group)
metadata.read(f)
self.image_metadata.append(metadata)

def read_image_data(self, image_metadata: ImageMetadata, time_index: int) -> np.ndarray:
with H5File(self.postprocessing_hdf5_path, 'r') as f:
image_ds: Dataset = image_metadata.get_dataset(hdf5File=f, time_index=time_index)
return image_ds[()]

0 comments on commit 4c18146

Please sign in to comment.