From cd56fac6acccec6138d46bce7c22881b662c55f6 Mon Sep 17 00:00:00 2001 From: Noah Greenwald Date: Mon, 20 Jul 2020 12:04:33 -0700 Subject: [PATCH] Create NPZ file for training (#99) * npz build function * added deepcell-toolbox * padding options * correct function name * update for cropping generator * Update requirements.txt * Update caliban_toolbox/build.py * Update caliban_toolbox/build.py --- caliban_toolbox/build.py | 112 ++++++++++++++++++++++++++++ caliban_toolbox/build_test.py | 135 ++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 3 files changed, 248 insertions(+) create mode 100644 caliban_toolbox/build.py create mode 100644 caliban_toolbox/build_test.py diff --git a/caliban_toolbox/build.py b/caliban_toolbox/build.py new file mode 100644 index 0000000..a6bc768 --- /dev/null +++ b/caliban_toolbox/build.py @@ -0,0 +1,112 @@ +# Copyright 2016-2020 The Van Valen Lab at the California Institute of +# Technology (Caltech), with support from the Paul Allen Family Foundation, +# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01. +# All rights reserved. +# +# Licensed under a modified Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE +# +# The Work provided may be used for non-commercial academic purposes only. +# For any other use of the Work, including commercial use, please contact: +# vanvalenlab@gmail.com +# +# Neither the name of Caltech nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific +# prior written permission. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
import math

import numpy as np


def pad_image_stack(images, crop_size):
    """Zero-pads a stack of images so rows and cols are multiples of crop_size

    Padding is only appended after the last row and last column, so the
    original pixels keep their coordinates in the padded stack.

    Args:
        images: 4D array of images with rows and cols on axes 1 and 2
        crop_size: tuple of (row_crop, col_crop) specifying crop size

    Returns:
        np.array: the input array itself if no padding is needed, otherwise
            a new zero-padded array with the same dtype as the input
    """

    row_len, col_len = images.shape[1:3]
    row_crop, col_crop = crop_size

    # round each spatial dimension up to the next multiple of the crop size
    new_row_len = math.ceil(row_len / row_crop) * row_crop
    new_col_len = math.ceil(col_len / col_crop) * col_crop

    if new_row_len == row_len and new_col_len == col_len:
        # don't need to pad
        return images

    # keep the input dtype: np.zeros defaults to float64, which would silently
    # convert integer images/label masks only in the case where padding occurs
    new_images = np.zeros((images.shape[0], new_row_len, new_col_len, images.shape[3]),
                          dtype=images.dtype)
    new_images[:, :row_len, :col_len, :] = images
    return new_images


def combine_npz_files(npz_list, resize_ratios, stride_ratio=1, final_size=(256, 256)):
    """Take a series of NPZ files and combine together into single training NPZ

    Each NPZ is optionally resized, then padded and tiled into crops of
    final_size, and the crops from all NPZs are concatenated along axis 0.

    Args:
        npz_list: list of NPZ files (or dicts) with 'X' and 'y' entries to combine.
            Currently only works on 2D static data
        resize_ratios: one ratio per NPZ, used to resize each NPZ if data is of
            different resolutions. A ratio of 1 leaves the data untouched
        stride_ratio: amount of overlap between crops (1 is no overlap, 0.5 is half crop size)
        final_size: (rows, cols) size of the final crops to be produced

    Returns:
        tuple: (combined_x, combined_y) arrays containing resized and cropped
            data from all input NPZs

    Raises:
        ValueError: If npz_list is empty, or if npz_list and resize_ratios
            have different lengths
    """

    if len(npz_list) != len(resize_ratios):
        raise ValueError('npz_list and resize_ratios must have the same length, '
                         'got {} and {}'.format(len(npz_list), len(resize_ratios)))

    if len(npz_list) == 0:
        raise ValueError('npz_list must contain at least one NPZ')

    # array shapes are tuples; normalize so a list like [256, 256] compares equal
    final_size = tuple(final_size)

    combined_x = []
    combined_y = []

    for npz, current_resize in zip(npz_list, resize_ratios):
        current_x = npz['X']
        current_y = npz['y']

        # resize if needed
        # TODO: Add tolerance to control when resizing happens
        if current_resize != 1:
            # function-scope import keeps this block self-contained;
            # deepcell-toolbox is only required when data is actually resized
            from deepcell_toolbox.utils import resize

            new_shape = (int(current_x.shape[1] * current_resize),
                         int(current_x.shape[2] * current_resize))

            current_x = resize(data=current_x, shape=new_shape)
            # labeled_image=True is passed for the label mask
            current_y = resize(data=current_y, shape=new_shape, labeled_image=True)

        # crop if needed
        if current_x.shape[1:3] != final_size:
            # function-scope import: only required when data is actually tiled
            from deepcell_toolbox.utils import tile_image

            # pad image so that crops divide evenly
            current_x = pad_image_stack(images=current_x, crop_size=final_size)
            current_y = pad_image_stack(images=current_y, crop_size=final_size)

            # create x and y crops; second return value of tile_image is not needed
            current_x, _ = tile_image(image=current_x, model_input_shape=final_size,
                                      stride_ratio=stride_ratio)
            current_y, _ = tile_image(image=current_y, model_input_shape=final_size,
                                      stride_ratio=stride_ratio)

        combined_x.append(current_x)
        combined_y.append(current_y)

    combined_x = np.concatenate(combined_x, axis=0)
    combined_y = np.concatenate(combined_y, axis=0)

    return combined_x, combined_y
import numpy as np

from caliban_toolbox import build


def _make_npzs(size, num):
    """Create a list of dicts standing in for loaded NPZ files

    Args:
        size: tuple of (rows, cols) for each image
        num: number of NPZ stand-ins to create

    Returns:
        list: dicts with an all-zero 'X' of shape (1, rows, cols, 4) and an
            all-zero 'y' of shape (1, rows, cols, 1)
    """
    return [{'X': np.zeros((1,) + size + (4,)),
             'y': np.zeros((1,) + size + (1,))}
            for _ in range(num)]


def _check_combined(combined_npz, expected_count, final_size):
    """Assert that both X and y hold expected_count crops of final_size"""
    combined_x, combined_y = combined_npz

    for combined in (combined_x, combined_y):
        # check that correct number of crops present
        assert combined.shape[0] == expected_count

        # check correct size of crops
        assert combined.shape[1:3] == final_size


def test_pad_image_stack():
    tags = [1, 2]
    crop_size = (10, 10)

    cases = [
        # rows and cols both need to be padded
        ((2, 55, 55, 2), (2, 60, 60, 2)),
        # only cols need to be padded
        ((2, 50, 35, 2), (2, 50, 40, 2)),
        # neither needs to be padded
        ((2, 30, 50, 2), (2, 30, 50, 2)),
    ]

    for input_shape, expected_shape in cases:
        input_stack = np.zeros(input_shape)
        # tag the first pixel of each image to check data stays in place
        input_stack[:, 0, 0, 0] = tags

        padded_stack = build.pad_image_stack(images=input_stack, crop_size=crop_size)

        assert padded_stack.shape == expected_shape
        assert np.all(padded_stack[:, 0, 0, 0] == tags)


def test_combine_npz_files():
    final_size = (256, 256)

    # NPZ files are appropriate size and resolution
    npz_list = _make_npzs((256, 256), 2)
    combined_npz = build.combine_npz_files(npz_list=npz_list,
                                           resize_ratios=[1] * 2,
                                           final_size=final_size)
    _check_combined(combined_npz, len(npz_list), final_size)

    # NPZ files need to be cropped: each 512x512 image yields 4 crops of 256x256
    crops_per_cropped = 4
    npz_crop_list = _make_npzs((512, 512), 3)
    combined_npz = build.combine_npz_files(npz_list=npz_crop_list,
                                           resize_ratios=[1] * 3,
                                           final_size=final_size)
    _check_combined(combined_npz, len(npz_crop_list) * crops_per_cropped, final_size)

    # NPZ files need to be resized: a 3x resize of 256x256 yields 3 ** 2 crops
    resize_ratio = 3
    crops_per_resized = resize_ratio ** 2
    npz_resize_list = _make_npzs((256, 256), 5)
    combined_npz = build.combine_npz_files(npz_list=npz_resize_list,
                                           resize_ratios=[resize_ratio] * 5,
                                           final_size=final_size)
    _check_combined(combined_npz, len(npz_resize_list) * crops_per_resized, final_size)

    # some need to be cropped, some need to be resized
    npz_list = npz_crop_list + npz_resize_list
    resize_ratios = [1] * 3 + [resize_ratio] * 5
    combined_npz = build.combine_npz_files(npz_list=npz_list,
                                           resize_ratios=resize_ratios,
                                           final_size=final_size)
    expected_count = (len(npz_crop_list) * crops_per_cropped +
                      len(npz_resize_list) * crops_per_resized)
    _check_combined(combined_npz, expected_count, final_size)