In [1]:
import h5py
import starter_code.utils as starter
import starter_code.visualize as vis
import pandas as pd
import numpy as np
import tqdm

In [2]:
# Load the per-case statistics table; later cells read its 'max_val', 'min_val',
# 'case_id', 'num_slices', 'height' and 'width' columns.
data = pd.read_csv(filepath_or_buffer="data_stats.csv")

In [3]:
# Global intensity window used by normalize(): the lower bound is the 1st
# percentile of the per-case minima, the upper bound the 95th percentile of
# the per-case maxima.
min_val = data["min_val"].quantile(q=0.01)
max_val = data["max_val"].quantile(q=0.95)

In [4]:
# Output HDF5 container. Mode "w" creates the file, truncating any existing one.
# The handle stays open across cells and is closed explicitly further down.
crops = h5py.File(name="crops.hdf5", mode="w")

In [5]:
# Block size (in slices) that the first axis of every volume is padded up to.
SIZE: int = 64

In [6]:
def normalize(volume, min_value=None, max_value=None):
    """Linearly rescale intensities with a fixed [min_value, max_value] window.

    ``min_value`` maps to 0 and ``max_value`` maps to 1. Values outside the
    window are NOT clipped, so the result can fall below 0 or above 1.

    Args:
        volume: Scalar or array of raw intensities.
        min_value: Lower bound of the window. When omitted, the notebook-level
            ``min_val`` is used.
        max_value: Upper bound of the window. When omitted, the notebook-level
            ``max_val`` is used.

    Returns:
        ``(volume - min_value) / max(max_value - min_value, 1e-3)``.
    """
    if min_value is None:
        min_value = min_val
    if max_value is None:
        max_value = max_val
    # Floor the denominator so a degenerate (empty or inverted) window cannot
    # divide by zero.
    span = max(max_value - min_value, 1e-3)
    return (volume - min_value) / span

In [7]:
def positions(im_shape, crop_shape, pads):
    """Yield the origin of every crop window that fits fully inside a 3-D volume.

    Origins lie on a regular grid that starts at 0 along each axis and advances
    by the corresponding entry of ``pads``. An origin is kept only when the
    whole crop fits (``start + crop <= extent``), so a trailing remainder that
    is smaller than the crop is skipped.

    Args:
        im_shape: ``(z, x, y)`` extent of the volume.
        crop_shape: ``(z, x, y)`` extent of one crop window.
        pads: ``(z, x, y)`` step between consecutive origins (i.e. the stride).

    Yields:
        ``(z, x, y)`` tuples, with the last axis varying fastest.

    Raises:
        ValueError: If any argument does not unpack into exactly three values,
            or if a step is 0 (raised by ``range``).
    """
    def axis_starts(extent, window, stride):
        # Keep only offsets whose window lies entirely inside the axis.
        return [start for start in range(0, extent, stride) if start + window <= extent]

    zi, xi, yi = im_shape
    zc, xc, yc = crop_shape
    zp, xp, yp = pads

    # Each axis is independent of the others, so compute its offsets once
    # instead of rebuilding the inner lists on every outer iteration.
    z_starts = axis_starts(zi, zc, zp)
    x_starts = axis_starts(xi, xc, xp)
    y_starts = axis_starts(yi, yc, yp)

    for z in z_starts:
        for x in x_starts:
            for y in y_starts:
                yield (z, x, y)

In [8]:
# Number of cases to process. Keep this small for a local run; set it to
# len(data) to process every case (e.g. on the cloud).
N_CASES = 1
# Step between consecutive crop origins: half a window, so neighbouring crops
# overlap by 50% along each axis.
STRIDE = SIZE // 2

crops_data = []
for i in tqdm.tqdm_notebook(range(N_CASES)):
    row = data.iloc[i]
    case_id = row['case_id']
    # Volume extent as recorded in the stats table.
    # NOTE(review): assumes these match the loaded array's axis order — confirm.
    depth, height, width = int(row['num_slices']), int(row['height']), int(row['width'])

    im, mask = starter.load_case(case_id)
    im, mask = im.get_data(), mask.get_data()

    # Slices to prepend so that axis 0 becomes a multiple of SIZE. This is 0
    # when the volume is already aligned (the previous `SIZE - n % SIZE` added
    # a whole extra block of empty slices in that case).
    pad_size = (-im.shape[0]) % SIZE

    # Normalize before padding, so the padded slices are 0 in normalized units.
    im = normalize(im)
    front_pad = ((pad_size, 0), (0, 0), (0, 0))
    im = np.pad(im, front_pad, 'constant')
    mask = np.pad(mask, front_pad, 'constant')

    padded_shape = (depth + pad_size, height, width)
    for position in positions(padded_shape, (SIZE, SIZE, SIZE), (STRIDE, STRIDE, STRIDE)):
        # Distinct names so the crop origin does not shadow the volume dimensions.
        z0, x0, y0 = position
        crop = mask[z0:z0 + SIZE, x0:x0 + SIZE, y0:y0 + SIZE]
        # Voxel counts per mask label: label 1 -> kid_size, label 2 -> tumor_size.
        kid_size = np.sum(crop == 1)
        tumor_size = np.sum(crop == 2)
        crops_data.append([case_id, position, SIZE, STRIDE, kid_size, tumor_size])

    # Re-running this cell against the still-open file would otherwise fail
    # because the dataset name already exists; replace the stale dataset.
    if case_id in crops:
        del crops[case_id]
    # Image and mask are stacked along a new leading axis: index 0 = image, 1 = mask.
    crops.create_dataset(case_id, data=np.array([im, mask]))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [9]:
# Turn the per-crop records collected by the loop into a table, one row per crop.
crops_data = pd.DataFrame(
    crops_data,
    columns=["case_id", "position", "window", "padding", "kid_size", "tumor_size"],
)

In [12]:
# Close the HDF5 file that was opened in write mode earlier in the notebook.
crops.close()

In [88]:
# Persist the crop index table next to the HDF5 file (the DataFrame index is
# written as the first, unnamed column).
crops_data.to_csv(path_or_buf="crops.csv")