Skip to content

Commit

Permalink
different output shapes for each split
Browse files Browse the repository at this point in the history
  • Loading branch information
ngreenwald committed Aug 27, 2020
1 parent 0cd5194 commit 50da299
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 3 deletions.
19 changes: 16 additions & 3 deletions caliban_toolbox/dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,8 @@ def build_dataset(self, tissues='all', platforms='all', output_shape=(512, 512),
a single tissue type, or 'all'
platforms: which platforms to include. Must be either a list of platform types,
a single platform type, or 'all'
output_shape: output shape for dataset
output_shape: output shape for dataset. Either a single tuple, in which case
train/val/test will all have the same size, or a list of three tuples, one per split
resize: flag to control resizing the input data.
Valid arguments:
- False. No resizing
Expand Down Expand Up @@ -483,6 +484,17 @@ def build_dataset(self, tissues='all', platforms='all', output_shape=(512, 512),
if resize not in valid_resize:
raise ValueError('resize must be one of {}'.format(valid_resize))

if not isinstance(output_shape, (list, tuple)):
raise ValueError('output_shape must be either a list of tuples or a tuple')

# convert from single tuple to list of tuples for each split
if isinstance(output_shape, tuple):
output_shape = [output_shape, output_shape, output_shape]

for tup in output_shape:
if len(tup) != 2:
raise ValueError('Each output_shape must be len(2) tuple, got {}'.format(tup))

# if any of the split parameters are different we need to reload the dataset
if self.seed != seed or self.data_split != data_split:
self._load_all_experiments(data_split=data_split, seed=seed)
Expand All @@ -494,13 +506,14 @@ def build_dataset(self, tissues='all', platforms='all', output_shape=(512, 512),
# subset dict to include only relevant tissues and platforms
current_dict = self._subset_data_dict(data_dict=current_dict, tissues=tissues,
platforms=platforms)
current_shape = output_shape[idx]

# if necessary, reshape and resize data to be of correct output size
if current_dict['X'].shape[1:3] != output_shape or resize is not False:
if current_dict['X'].shape[1:3] != current_shape or resize is not False:
resize_target = kwargs.get('resize_target', 400)
resize_tolerance = kwargs.get('resize_tolerance', 1.5)
current_dict = self._reshape_dict(data_dict=current_dict, resize=resize,
output_shape=output_shape,
output_shape=current_shape,
resize_target=resize_target,
resize_tolerance=resize_tolerance)
# clean labels
Expand Down
8 changes: 8 additions & 0 deletions caliban_toolbox/dataset_builder_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,14 @@ def test_build_dataset(tmp_path):
for current_dict in output_dicts:
assert len(np.unique(current_dict['y'])) == 2

# different sizes for different splits
output_dicts_diff_sizes = db.build_dataset(tissues=tissues, platforms=platforms,
output_shape=[(10, 10), (15, 15), (20, 20)])

assert output_dicts_diff_sizes[0]['X'].shape[1:3] == (10, 10)
assert output_dicts_diff_sizes[1]['X'].shape[1:3] == (15, 15)
assert output_dicts_diff_sizes[2]['X'].shape[1:3] == (20, 20)

# full runthrough with default options changed
_ = db.build_dataset(tissues=tissues, platforms=platforms, output_shape=(10, 10),
relabel_hard=True, resize='by_image', small_object_threshold=5)
Expand Down

0 comments on commit 50da299

Please sign in to comment.