-
Notifications
You must be signed in to change notification settings - Fork 1
/
build.py
263 lines (207 loc) · 10.2 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import warnings
import math
import numpy as np
from skimage.measure import regionprops_table
from sklearn.model_selection import train_test_split
from deepcell_toolbox.utils import resize, tile_image
def compute_cell_size(npz_file, method='median', by_image=True):
    """Summarizes the area of the labeled cells in a stack of labeled images

    Args:
        npz_file: paired X and y data; only the 'y' entry (labels) is read
        method: one of (mean, median), the statistic used to summarize cell
            areas. Compared case-insensitively
        by_image: if true, one value is computed for every image that contains
            cells. Otherwise, a single value is computed from the cells of all
            images pooled together

    Returns:
        list: one summarized cell size per image that contains cells when
            by_image is true (images without cells contribute no entry),
            otherwise a single value. If no image contains cells, returns None.

    Raises:
        ValueError: if invalid method supplied
        ValueError: if the labeled data does not have len(shape) of 4
    """
    valid_methods = {'median', 'mean'}
    method = str(method).lower()
    if method not in valid_methods:
        raise ValueError('Invalid method supplied: got {}, '
                         'method must be one of {}'.format(method, valid_methods))

    labels = npz_file['y']
    if len(labels.shape) != 4:
        raise ValueError('Labeled data must be 4D')

    # gather the array of cell areas for each image, using the first channel only
    areas_per_image = []
    for label_img in labels[..., 0]:
        # an image with a single distinct value is treated as having no cells
        if np.unique(label_img).size <= 1:
            continue
        props = regionprops_table(label_img.astype('int'), properties=['area'])
        areas_per_image.append(props['area'])

    # every image was empty
    if not areas_per_image:
        return None

    # method was validated above, so it can only be 'mean' or 'median' here
    summarize = np.mean if method == 'mean' else np.median

    if by_image:
        return [summarize(areas) for areas in areas_per_image]

    # pool the cells from all images before summarizing
    return summarize(np.concatenate(areas_per_image))
def reshape_training_data(X_data, y_data, resize_ratio, final_size, stride_ratio=1, tolerance=1.5):
    """Takes a stack of X and y data and resizes and crops them to match output dimensions

    Args:
        X_data: 4D numpy array of image data
        y_data: 4D numpy array of labeled data
        resize_ratio: resize ratio for the images
        final_size: the desired (rows, cols) of the output images. Any
            two-element sequence (tuple, list, array) is accepted
        stride_ratio: amount of overlap between crops (1 is no overlap, 0.5 is half crop size)
        tolerance: ratio that determines when resizing occurs. Data is only
            resized when resize_ratio is greater than tolerance or smaller
            than 1 / tolerance

    Returns:
        reshaped_X, reshaped_y: resized and cropped version of input images.
            The inputs are returned untouched when neither resizing nor
            cropping is needed

    Raises:
        ValueError: If image data is not 4D
    """
    if len(X_data.shape) != 4:
        raise ValueError('Image data must be 4D')

    # array shapes are tuples; normalize final_size so that an equal list or
    # array is recognized as "already the right size" instead of always
    # comparing unequal (list) or raising on an ambiguous truth value (array)
    final_size = tuple(final_size)

    # resize if needed
    if resize_ratio > tolerance or resize_ratio < (1 / tolerance):
        new_shape = (int(X_data.shape[1] * resize_ratio),
                     int(X_data.shape[2] * resize_ratio))

        X_data = resize(data=X_data, shape=new_shape)
        y_data = resize(data=y_data, shape=new_shape, labeled_image=True)

    # crop if needed
    if tuple(X_data.shape[1:3]) != final_size:
        # pad image so that crops divide evenly
        X_data = pad_image_stack(images=X_data, crop_size=final_size)
        y_data = pad_image_stack(images=y_data, crop_size=final_size)

        # create x and y crops; the second value returned by tile_image is not needed
        X_data, _ = tile_image(image=X_data, model_input_shape=final_size,
                               stride_ratio=stride_ratio)
        y_data, _ = tile_image(image=y_data, model_input_shape=final_size,
                               stride_ratio=stride_ratio)

    return X_data, y_data
def pad_image_stack(images, crop_size):
    """Zero-pads a stack of images so rows and columns are multiples of crop_size

    Padding is appended after the last row and last column; the original data
    stays in the top-left corner of each padded image.

    Args:
        images: 4D array of images to be padded, indexed as
            (batch, rows, cols, channels)
        crop_size: tuple specifying the (rows, cols) of a single crop

    Returns:
        np.array: padded image stack with the same dtype as the input. When
            both dimensions already divide evenly, the input array itself is
            returned rather than a copy

    Raises:
        ValueError: If images are not 4D
    """
    if len(images.shape) != 4:
        raise ValueError('Image data must be 4D')

    batch, height, width, channels = images.shape
    crop_height, crop_width = crop_size

    # round each spatial dimension up to the next whole number of crops
    padded_height = math.ceil(height / crop_height) * crop_height
    padded_width = math.ceil(width / crop_width) * crop_width

    # already divides evenly, nothing to pad
    if (padded_height, padded_width) == (height, width):
        return images

    padded = np.zeros((batch, padded_height, padded_width, channels), dtype=images.dtype)
    padded[:, :height, :width, :] = images
    return padded
def train_val_test_split(X_data, y_data, data_split=(0.8, 0.1, 0.1), seed=None):
    """Randomly splits supplied data into specified sizes for model assessment

    Args:
        X_data: array of X data
        y_data: array of y_data
        data_split: tuple specifying the fraction of the dataset for train/val/test
        seed: random seed for reproducible splits

    Returns:
        tuple: (X_train, y_train, X_val, y_val, X_test, y_test). If dataset is
            too small for all splits, returns None for remaining splits: with
            one image only the train split is populated, with two images only
            the train and val splits are populated

    Raises:
        ValueError: if ratios do not sum to 1
        ValueError: if any of the splits are 0
        ValueError: if any of the splits are negative
        ValueError: If length of X and y data is not equal
    """
    total = np.round(np.sum(data_split), decimals=2)
    if total != 1:
        raise ValueError('Data splits must sum to 1, supplied splits sum to {}'.format(total))

    if 0 in data_split:
        raise ValueError('All splits must be non-zero')

    # negative fractions can still sum to 1, but are meaningless as split sizes
    if any(split < 0 for split in data_split):
        raise ValueError('All splits must be positive, got {}'.format(data_split))

    if X_data.shape[0] != y_data.shape[0]:
        raise ValueError('Supplied X and y data do not have the same '
                         'length over batches dimension. '
                         'X.shape: {}, y.shape: {}'.format(X_data.shape, y_data.shape))

    train_ratio, val_ratio, test_ratio = data_split
    num_images = X_data.shape[0]

    # 1 image, train split only
    if num_images == 1:
        warnings.warn('Only one image in current NPZ, returning training split only')
        return X_data, y_data, None, None, None, None

    # 2 images, train and val split only (first image to train, second to val)
    if num_images == 2:
        warnings.warn('Only two images in current NPZ, returning training and val split only')
        return X_data[:1, ...], y_data[:1, ...], X_data[1:, ...], y_data[1:, ...], None, None

    modified_split_msg = ('Not enough data in current NPZ for specified data split. '
                          'Returning modified data split')

    # compute fraction not in train
    val_remainder_ratio = np.round(1 - train_ratio, decimals=2)
    val_remainder_count = num_images * val_remainder_ratio

    # not enough data for val split, put minimum (1) in each split
    if val_remainder_count < 1:
        warnings.warn(modified_split_msg)
        # an integer test_size is an absolute number of samples: hold out two
        # images, then divide them one each between val and test
        X_train, X_remainder, y_train, y_remainder = train_test_split(X_data, y_data,
                                                                      test_size=2,
                                                                      random_state=seed)

        X_val, X_test, y_val, y_test = train_test_split(X_remainder, y_remainder,
                                                        test_size=1,
                                                        random_state=seed)

        return X_train, y_train, X_val, y_val, X_test, y_test

    # split dataset into train and remainder
    X_train, X_remainder, y_train, y_remainder = train_test_split(X_data, y_data,
                                                                  test_size=val_remainder_ratio,
                                                                  random_state=seed)

    # compute fraction of remainder that is test
    test_remainder_ratio = np.round(test_ratio / (val_ratio + test_ratio), decimals=2)
    test_remainder_count = X_remainder.shape[0] * test_remainder_ratio

    # not enough data in remainder for test split, put minimum (1) in test split from train split
    if test_remainder_count < 1:
        warnings.warn(modified_split_msg)
        X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                            test_size=1,
                                                            random_state=seed)
        # the whole remainder becomes the val split
        X_val, y_val = X_remainder, y_remainder

        return X_train, y_train, X_val, y_val, X_test, y_test

    # split remainder into val and test
    X_val, X_test, y_val, y_test = train_test_split(X_remainder, y_remainder,
                                                    test_size=test_remainder_ratio,
                                                    random_state=seed)

    return X_train, y_train, X_val, y_val, X_test, y_test