Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create NPZ file for training #99

Merged
merged 9 commits into from
Jul 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
112 changes: 112 additions & 0 deletions caliban_toolbox/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import math

import numpy as np

from deepcell_toolbox.utils import resize, tile_image


def pad_image_stack(images, crop_size):
    """Pads an array of images so that it is divisible by the specified crop_size

    Zeros are appended at the end of the row and column axes only; the original
    data stays in the top-left corner of every image.

    Args:
        images: 4D array of images to be padded. Axes 1 and 2 are treated as
            the row and column axes.
        crop_size: tuple (rows, cols) specifying crop size

    Returns:
        np.array: the input array itself when no padding is required, otherwise
            a new zero-padded array with the same dtype as the input
    """

    row_len, col_len = images.shape[1:3]
    row_crop, col_crop = crop_size

    # round each spatial dimension up to the next multiple of the crop size
    new_row_len = math.ceil(row_len / row_crop) * row_crop
    new_col_len = math.ceil(col_len / col_crop) * col_crop

    if new_row_len == row_len and new_col_len == col_len:
        # don't need to pad
        return images

    # allocate with the input dtype: np.zeros defaults to float64, which would
    # make the output dtype depend on whether padding happened to be required
    new_shape = (images.shape[0], new_row_len, new_col_len, images.shape[3])
    new_images = np.zeros(new_shape, dtype=images.dtype)
    new_images[:, :row_len, :col_len, :] = images
    return new_images


def combine_npz_files(npz_list, resize_ratios, stride_ratio=1, final_size=(256, 256),
                      resize_tolerance=0):
    """Take a series of NPZ files and combine together into single training NPZ

    Args:
        npz_list: list of NPZ files to combine. Currently only works on 2D static data.
            Each entry is indexed with the 'X' and 'y' keys.
        resize_ratios: list of ratios, one per entry of npz_list, used to resize each
            NPZ if data is of different resolutions. A ratio of 1 indicates that no
            resizing is required.
        stride_ratio: amount of overlap between crops (1 is no overlap, 0.5 is half crop size)
        final_size: size of the final crops to be produced
        resize_tolerance: ratios that differ from 1 by no more than this amount are
            treated as not needing a resize (e.g. 0.01 skips a ratio of 1.01). The
            default of 0 only skips ratios exactly equal to 1.

    Returns:
        tuple: (combined_x, combined_y) arrays containing the resized and cropped
            'X' and 'y' data from all input NPZs, concatenated along the first axis

    Raises:
        ValueError: If npz_list and resize_ratios do not have the same length
    """

    if len(npz_list) != len(resize_ratios):
        raise ValueError('npz_list and resize_ratios must have the same length, '
                         'got {} and {}'.format(len(npz_list), len(resize_ratios)))

    # array shapes are tuples; normalize so a list-valued final_size compares equal
    final_size = tuple(final_size)

    combined_x = []
    combined_y = []

    for npz, current_resize in zip(npz_list, resize_ratios):
        current_x = npz['X']
        current_y = npz['y']

        # resize if needed
        if abs(current_resize - 1) > resize_tolerance:
            new_shape = (int(current_x.shape[1] * current_resize),
                         int(current_x.shape[2] * current_resize))

            current_x = resize(data=current_x, shape=new_shape)
            current_y = resize(data=current_y, shape=new_shape, labeled_image=True)

        # crop if needed
        if current_x.shape[1:3] != final_size:

            # pad image so that crops divide evenly
            current_x = pad_image_stack(images=current_x, crop_size=final_size)
            current_y = pad_image_stack(images=current_y, crop_size=final_size)

            # create x and y crops
            current_x, _ = tile_image(image=current_x, model_input_shape=final_size,
                                      stride_ratio=stride_ratio)
            current_y, _ = tile_image(image=current_y, model_input_shape=final_size,
                                      stride_ratio=stride_ratio)

        combined_x.append(current_x)
        combined_y.append(current_y)

    combined_x = np.concatenate(combined_x, axis=0)
    combined_y = np.concatenate(combined_y, axis=0)

    return combined_x, combined_y
135 changes: 135 additions & 0 deletions caliban_toolbox/build_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np

from caliban_toolbox import build


def _make_npzs(size, num):
npz_list = []

for i in range(num):
x = np.zeros((1, ) + size + (4, ))
y = np.zeros((1,) + size + (1,))
npz = {'X': x, 'y': y}

npz_list.append(npz)

return npz_list


def test_pad_image_stack():
    """Check output shape, data placement and zero fill of pad_image_stack."""
    tags = [1, 2]
    crop_size = (10, 10)

    # ((input rows, input cols), (expected rows, expected cols))
    cases = [
        ((55, 55), (60, 60)),  # rows and cols both need to be modified
        ((55, 50), (60, 50)),  # just rows need to be modified
        ((50, 35), (50, 40)),  # just cols need to be modified
        ((30, 50), (30, 50)),  # neither needs to be modified
    ]

    for (rows, cols), (padded_rows, padded_cols) in cases:
        input_stack = np.zeros((2, rows, cols, 2))
        input_stack[:, 0, 0, 0] = tags

        padded_stack = build.pad_image_stack(images=input_stack, crop_size=crop_size)

        assert padded_stack.shape == (2, padded_rows, padded_cols, 2)

        # original data stays anchored in the top-left corner
        assert np.all(padded_stack[:, 0, 0, 0] == tags)

        # padding only ever adds zeros, so the total is unchanged
        assert padded_stack.sum() == input_stack.sum()


def test_combine_npz_files():
    """Check the number and size of crops produced by combine_npz_files."""
    final_size = (256, 256)
    resize_ratio = 3

    def _check_combined(npz_list, resize_ratios, expected_crops):
        combined_x, combined_y = build.combine_npz_files(npz_list=npz_list,
                                                         resize_ratios=resize_ratios,
                                                         final_size=final_size)

        # check that correct number of crops present
        assert combined_x.shape[0] == expected_crops

        # check correct size of crops
        assert combined_x.shape[1:3] == final_size

        # labels go through the same resize/crop steps and must stay aligned
        assert combined_y.shape[:3] == combined_x.shape[:3]

    # NPZ files are appropriate size and resolution
    npz_list = _make_npzs((256, 256), 2)
    _check_combined(npz_list, [1] * 2, len(npz_list))

    # NPZ files need to be cropped: each 512x512 image gives four 256x256 crops
    npz_crop_list = _make_npzs((512, 512), 3)
    crop_count = len(npz_crop_list) * 4
    _check_combined(npz_crop_list, [1] * 3, crop_count)

    # NPZ files need to be resized: scaling by the ratio in both dimensions
    # gives ratio ** 2 crops per image
    npz_resize_list = _make_npzs((256, 256), 5)
    resize_count = len(npz_resize_list) * (resize_ratio ** 2)
    _check_combined(npz_resize_list, [resize_ratio] * 5, resize_count)

    # some need to be cropped, some need to be resized
    _check_combined(npz_crop_list + npz_resize_list,
                    [1] * 3 + [resize_ratio] * 5,
                    crop_count + resize_count)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ boto3>=1.9.0
xarray==0.12.1
netCDF4==1.5.3
pathlib==1.0.1
deepcell-toolbox>=0.6.1