In [1]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dicom
import os
import glob
import scipy.ndimage
import matplotlib.pyplot as plt
import ipyvolume

from tqdm import tqdm, trange

import SimpleITK as sitk
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection



In [2]:


#####################
#
# Helper function to get rows in data frame associated 
# with each file

def get_origin(filepath):
    """Read an image file through SimpleITK and return its origin as a numpy array."""
    image = sitk.ReadImage(filepath)
    return np.array(image.GetOrigin())

def coord_to_ary_idx(coord, origin, verbose=False):
    """Convert a (dicom/luna) coordinate into integer offsets relative to origin.

    The origin is subtracted component-wise from coord and each of the three
    differences is truncated with int(). No spacing factor is applied here, and
    the result keeps the component order of the inputs.

    Returns a list of three Python ints.
    """
    delta = np.asarray(coord) - np.asarray(origin)
    first, second, third = delta  # exactly three components are expected
    absidx = (first, second, third)
    if verbose:
        print('Absolute index: {}'.format(absidx))
    return [int(component) for component in absidx]

def get_fiducial_slice(coord, edgelen=48, verbose=False):
    '''Get the slicing bounds of a cube of side `edgelen` centred on `coord`.

    coord:   three-element (x, y, z) centre; each component is truncated with int().
    edgelen: cube edge length; the half-width used is edgelen // 2.
    verbose: when True, print the raw bounds (diagnostic output that used to be
             printed unconditionally).

    Returns the tuple (x0, x1, y0, y1, z0, z1). Bounds are NOT clipped to any
    array shape, so lower bounds can be negative for centres near the border.
    '''
    x, y, z = map(int, coord)
    m = edgelen // 2
    if verbose:
        print(x+m, x-m, y+m, y-m, z+m, z-m)
    return (x-m, x+m, y-m, y+m, z-m, z+m)
    
def draw_fiducial_cube(ary_shape, coord, edgelen=48, dtype='int16', verbose=False):
    """Draw a cube of size (E,E,E) located at coord, in a volume of shape ary_shape.

    ary_shape: shape of the output array; its first three axes are treated as (z, y, x).
    coord:     (x, y, z) centre of the cube; components are truncated with int().
    edgelen:   cube edge length E; the half-width used is E // 2.
    dtype:     dtype of the returned array.
    verbose:   when True, print the number of voxels set to 1.

    Returns an array that is 1 inside the cube and 0 elsewhere. The part of the
    cube that falls outside the array is simply dropped.
    """
    x, y, z = map(int, coord)
    m = edgelen // 2
    # Clamp at 0: a negative bound would be read by numpy as "count from the
    # end" and mark the wrong region when the cube overlaps the low border.
    x0, x1 = max(x - m, 0), max(x + m, 0)
    y0, y1 = max(y - m, 0), max(y + m, 0)
    z0, z1 = max(z - m, 0), max(z + m, 0)
    ary = np.zeros(ary_shape, dtype=dtype)
    ary[z0:z1, y0:y1, x0:x1] = 1  # bounds past the far edge are truncated by numpy
    if verbose:
        print(np.sum(ary))
    return ary
 
def get_filename(case):
    """Return the first path in the module-level `file_list` that contains `case`.

    `case` is matched as a substring of each path. When nothing matches the
    function returns None.
    """
    matches = (candidate for candidate in file_list if case in candidate)
    return next(matches, None)
        
def strip_uid(path):
    """Helper to convert path to UID.

    Takes the basename of `path` and removes one trailing '.mhd.npy', '.mhd'
    or '.npy' extension. Names without any of these extensions are returned
    unchanged.
    """
    fname = os.path.basename(path)
    # str.strip() removes a *set of characters* from both ends, which eats
    # leading/trailing m, h, d, n, p, y and '.' of the name itself; remove the
    # extension as a suffix instead.
    for suffix in ('.mhd.npy', '.mhd', '.npy'):
        if fname.endswith(suffix):
            return fname[:-len(suffix)]
    return fname

In [3]:
def padcrop_vol(vol, newshape=(360, 360, 360), padtype='symmetric', value='origin', verbose=False):
    """Pad and/or crop a 3-D volume so that every axis matches `newshape`.

    vol:      3-D array-like; it is copied, never modified.
    newshape: target length for each of the three axes.
    padtype:  {'symmetric', 'origin'} (only the first three letters are checked).
              'symmetric' distributes padding/cropping over both ends of an axis,
              the extra element of an odd difference going to the far end.
              'origin' keeps index 0 fixed and pads/crops only at the far end.
    value:    fill value for padding. The string 'origin' means "use the voxel at
              index [0, 0, 0]"; anything else must be convertible with float().
    verbose:  when True, print the fill value taken from the origin voxel.

    Returns the resized array.

    Raises ValueError for a non 3-D volume, an unknown padtype, or a `value`
    that cannot be converted to float.
    """
    vol2 = np.array(vol)
    if vol2.ndim != 3:
        raise ValueError('padcrop_vol expects a 3-D volume, got {} dimension(s)'.format(vol2.ndim))
    shape = vol2.shape

    mode = padtype[:3]
    if mode not in ('sym', 'ori'):
        raise ValueError('Must specify valid padding mode: {"symmetric", "origin"}')

    if value == 'origin':
        constant_values = vol2[0, 0, 0]
        if verbose:
            print('Origin: ', constant_values)
    else:
        try:
            constant_values = float(value)
        except (TypeError, ValueError):
            raise ValueError('Invalid parameter "value" specified. Cannot coerce to symbol type or float')

    for dim in range(3):
        old, new = shape[dim], newshape[dim]
        diff = abs(new - old)
        # Amount handled at the near (index 0) end; the remainder goes to the far end.
        lead = diff // 2 if mode == 'sym' else 0
        if old < new:
            pad_list = [(0, 0), (0, 0), (0, 0)]
            pad_list[dim] = (lead, diff - lead)
            vol2 = np.pad(vol2, pad_list, mode='constant', constant_values=constant_values)
        elif old > new:
            window = [slice(None), slice(None), slice(None)]
            window[dim] = slice(lead, lead + new)
            vol2 = vol2[tuple(window)]

    return vol2

def subsect(a, edge_length=48, stride=0.5, serialize=True, verbose=False):
    '''Chop a 3-D volume into equal cubes of side edge_length.

    The volume is first resized with padcrop_vol (default settings) so that
    every axis is a whole multiple of edge_length, then split along axis 0,
    axis 1 and axis 2 in turn.

    serialize: if true, return an (N, E, E, E) array of cubes, E=edge_length.
               If false, return a 6-D array of shape (P, N, M, E, E, E) where
               M, N, P are the block counts along axes 0, 1 and 2 of the
               volume, i.e. the leading index runs over blocks of the LAST axis.
    stride:    accepted but not used by this implementation.
    verbose:   print the number of blocks per axis.

    Returns (cubes, blocks_per_axis) with blocks_per_axis = [M, N, P].
    '''
    _d0, _d1, _d2 = a.shape  # only 3-D input is accepted
    # Ceiling division: an evenly divisible axis needs no extra block.
    blocks_per_axis = [-(-extent // edge_length) for extent in a.shape]
    if verbose:
        print('New indicies: {}'.format(blocks_per_axis))
    padded_shape = [edge_length * count for count in blocks_per_axis]
    b = padcrop_vol(a, newshape=padded_shape)
    # Every split prepends a block axis, so the next data axis sits two further right.
    for count, axis in zip(blocks_per_axis, (0, 2, 4)):
        b = np.array(np.split(b, count, axis=axis))
    if serialize:
        b = np.reshape(b, (-1, edge_length, edge_length, edge_length))

    return b, blocks_per_axis

def subslice(a, coord, edge_length=48, order='zyx'):
    '''Take a volume and return a cube of side edge_length, centered at coord.

    a:           3-D array.
    coord:       three integer centre indices.
    edge_length: cube edge; the window spans centre - E//2 .. centre + E//2 per axis.
    order:       how coord maps onto the array axes. With 'zyx' (default) coord is
                 applied in REVERSE: coord[0] indexes the last axis and coord[2]
                 the first. Any other value applies coord[i] to axis i.

    Returns a view of `a`. The window is truncated where it runs past either
    border, so callers that need a full cube must check the returned shape.
    '''
    assert len(coord) == 3, 'Must be a 3d dimension array-like'
    m = edge_length // 2
    if order == 'zyx':
        centers = (coord[2], coord[1], coord[0])
    else:
        centers = (coord[0], coord[1], coord[2])
    # Clamp at 0 so a window crossing the low border is truncated (as numpy
    # already does at the high border) instead of wrapping around to the far end.
    window = tuple(slice(max(c - m, 0), max(c + m, 0)) for c in centers)
    return a[window]

def cube(a):
    '''Reshape an array whose first-axis length is a perfect cube into (d, d, d).

    The side d is the rounded cube root of a.shape[0]; an AssertionError is
    raised when d**3 does not reproduce that length.
    '''
    count = a.shape[0]
    side = int(np.around(count ** (1 / 3)))
    assert side ** 3 == count, 'Dimensions are not an even cube!'
    return a.reshape((side, side, side))

def random_subslice(a, edge_length=48, order='zyx', returnCoord=False):
    '''Cut a cube of side edge_length at a uniformly random, fully in-bounds position.

    a:           3-D array.
    edge_length: cube edge E; the window spans centre - E//2 .. centre + E//2.
    order:       convention of the returned coordinate only. With 'zyx' the
                 centre is reported reversed (last-axis index first), which is the
                 form subslice(..., order='zyx') expects; otherwise it is
                 reported in array-axis order.
    returnCoord: when True return (subvol, coord) instead of just subvol.

    Centres are drawn with np.random.randint(m, n - m) per axis, which raises
    ValueError when an axis is too short to hold a full window.
    '''
    m = edge_length // 2
    n0, n1, n2 = a.shape
    # Draw each centre from the range of the axis it is applied to. Previously
    # the centres were drawn in axis order but applied in reverse under 'zyx',
    # which truncated the window on non-cubic volumes.
    c0 = np.random.randint(m, n0 - m)
    c1 = np.random.randint(m, n1 - m)
    c2 = np.random.randint(m, n2 - m)
    # Same window subslice() cuts for an in-bounds centre.
    subvol = a[c0-m:c0+m, c1-m:c1+m, c2-m:c2+m]
    if returnCoord:
        coord = (c2, c1, c0) if order == 'zyx' else (c0, c1, c2)
        return subvol, coord
    return subvol



def safe_random_subslice(a, coord, rad=48, edge_length=48, order='zyx', returnCoord=False):
    """Deliberately avoid a volume too close to a known coordinate (e.g. tumor).

    a:           3-D array.
    coord:       three-element location to stay away from, in the same convention
                 subslice() uses for `order` ('zyx': last-axis index first).
    rad:         minimum Euclidean distance between `coord` and the centre of the
                 returned cube.
    edge_length: cube edge E; the window spans centre - E//2 .. centre + E//2.
    order:       coordinate convention of `coord` and of the returned centre.
    returnCoord: when True return (subvol, centre) instead of just subvol.

    Candidates are re-drawn until one is at least `rad` away, so the call does
    not terminate if no in-bounds centre satisfies that. np.random.randint
    raises ValueError when an axis is too short to hold a full window.
    """
    m = edge_length // 2
    n0, n1, n2 = a.shape
    t, u, v = coord
    avoid = (t, u, v)
    newcoord = avoid  # distance 0: forces at least one draw whenever rad > 0
    while sum((p - q) ** 2 for p, q in zip(avoid, newcoord)) ** 0.5 < rad:
        # Draw each centre from the range of the axis it is applied to, then
        # express it in the caller's convention. Previously centres were drawn
        # in axis order but applied in reverse under 'zyx', truncating windows
        # on non-cubic volumes.
        c0 = np.random.randint(m, n0 - m)
        c1 = np.random.randint(m, n1 - m)
        c2 = np.random.randint(m, n2 - m)
        newcoord = (c2, c1, c0) if order == 'zyx' else (c0, c1, c2)

    c0, c1, c2 = newcoord[::-1] if order == 'zyx' else newcoord
    subvol = a[c0-m:c0+m, c1-m:c1+m, c2-m:c2+m]
    if returnCoord:
        return subvol, newcoord
    return subvol

    

In [4]:
def coord_to_ravel_idx3(shape, xyz, order='zyx'):
    '''3D-only conversion of a coordinate into its flat (raveled) index.

    shape: three-element array shape; only the last two extents enter the result.
    xyz:   the coordinate, given as (z, y, x) when order is 'zyx' and as
           (x, y, z) for any other order value.
    order: {'xyz', 'zyx'}

    Returns z * shape[1] * shape[2] + y * shape[2] + x.
    '''
    _, n_rows, n_cols = shape
    if order == 'zyx':
        plane, row, col = xyz
    else:
        col, row, plane = xyz
    return (plane * n_rows + row) * n_cols + col

def coord_to_ravel_idx(shape, coord, verbose=False):
    '''Takes a coordinate (as x y z index notation) and returns the absolute (raveled) single number index.

    shape:   array shape, slowest-varying axis first.
    coord:   coordinate with the FASTEST-varying index first (for 3-D: x, y, z
             against a shape of (nz, ny, nx)). Must have the same length as shape.
    verbose: when True, print the per-axis terms (diagnostic output that used
             to be printed on every iteration).

    Returns coord[0] + coord[1]*shape[-1] + coord[2]*shape[-1]*shape[-2] + ...
    '''
    assert len(shape) == len(coord), 'Must have matching dimension'
    N = len(shape)
    idx = coord[0]
    for i in range(1, N):
        # Stride of the i-th coordinate = product of the last i extents.
        # Build a new value rather than `+=` so a mutable coord[0] is never altered.
        idx = idx + coord[i]*np.prod(shape[N-i:])
        if verbose:
            print(i, coord[i], shape[N-i:])

    return idx

def ravel_idx_to_coord(shape, idx, verbose=False):
    '''Given a shape and the absolute index, return the x y z coordinate index.

    shape:   array shape, slowest-varying axis first.
    idx:     flat (raveled) index.
    verbose: when True, print each quotient/remainder pair (diagnostic output
             that used to be printed on every iteration).

    Returns (coefs, coords):
      coefs  - the trailing slices of `shape` whose products were used as
               divisors, largest first;
      coords - the coordinate with the FASTEST-varying index first
               (for 3-D: [x, y, z]).
    '''
    N = len(shape)
    coefs = []
    coords = []
    r = idx
    for i in range(N-1, 0, -1):
        # Peel off the slowest remaining axis: divide by the size of everything after it.
        coef = shape[N-i:]
        coefs.append(coef)
        q, r = divmod(r, np.prod(coef))
        coords.append(q)
        if verbose:
            print(q,r)
    coords.append(r)
    coords.reverse()

    return coefs, coords

def coord_to_subcoord(subshape, coord):
    '''Locate a 3-D coordinate inside a volume that was cut into sub-blocks.

    For each of the three axes, coord[i] is divided by subshape[i]: the
    quotient is the index of the sub-block and the remainder is the position
    inside that block.

    Returns (block_index, position_in_block), each a list of three values.
    '''
    pairs = [divmod(coord[axis], subshape[axis]) for axis in range(3)]
    block_index = [quotient for quotient, _ in pairs]
    position_in_block = [remainder for _, remainder in pairs]
    return block_index, position_in_block


In [5]:
def look_for_mhd(luna_path, uid):
    """Find `<luna_path>/subsetN/<uid>.mhd` for N in 0..9 and return the first hit.

    The path is built by plain string concatenation, so the returned string
    contains luna_path exactly as given. Raises FileNotFoundError when the
    file exists in none of the ten subset folders.
    """
    candidates = (luna_path + '/subset{}/'.format(n) + uid + '.mhd' for n in range(10))
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is None:
        raise FileNotFoundError('Cannot find file in any subset: {}/subsetX/{}'.format(luna_path, uid))
    return found

def get_tumor_volume_from_row(row, luna_path, resamp_path, edgelength=48, verbose=False):
    """Cut the cube around one annotated location out of its resampled volume.

    row:         mapping with 'coordX', 'coordY', 'coordZ' and 'seriesuid' entries.
    luna_path:   directory searched by look_for_mhd for the series' .mhd file,
                 whose origin is subtracted from the annotation coordinate.
    resamp_path: prefix to which '<seriesuid>.mhd.npy' is appended to load the volume.
    edgelength:  edge length passed to subslice.
    verbose:     print the origin, the integer index and the volume shape.

    Returns whatever subslice yields; no shape check is done here.
    """
    annotation = (row['coordX'], row['coordY'], row['coordZ'])
    uid = row['seriesuid']
    origin = get_origin(look_for_mhd(luna_path, uid))
    absidx = coord_to_ary_idx(annotation, origin)
    vol = np.load(resamp_path + uid + '.mhd.npy')
    subvol = subslice(vol, absidx, edge_length=edgelength)
    if verbose:
        print('Origin: {}'.format(origin))
        print('Abs Index: {}'.format(absidx))
        print('Vol shape: {}'.format(vol.shape))
    return subvol

def get_multi_volume_from_row(row, luna_path,  resamp_path, k=4, edgelength=48, verbose=False):
    """Return the cube around one annotation plus k random cubes away from it.

    row:         mapping with 'coordX', 'coordY', 'coordZ' and 'seriesuid' entries.
    luna_path:   directory searched by look_for_mhd for the series' .mhd file,
                 whose origin is subtracted from the annotation coordinate.
    resamp_path: prefix to which '<seriesuid>.mhd.npy' is appended to load the volume.
    k:           number of negative (away-from-annotation) cubes to draw.
    edgelength:  edge length of the positive cube AND of every negative cube.
    verbose:     print the origin, the integer index and the volume shape.

    Returns (subvol, negs) where negs is a list of k cubes produced by
    safe_random_subslice with its default exclusion radius. No shape check is
    done on the positive cube here.
    """
    nx, ny, nz = row['coordX'], row['coordY'], row['coordZ']
    path = look_for_mhd(luna_path, row['seriesuid'])
    origin = get_origin(path)
    absidx = coord_to_ary_idx((nx, ny, nz), origin)
    vol = np.load(resamp_path + row['seriesuid'] + '.mhd.npy')
    subvol = subslice(vol, absidx, edge_length=edgelength)
    if verbose:
        print('Origin: {}'.format(origin))
        print('Abs Index: {}'.format(absidx))
        print('Vol shape: {}'.format(vol.shape))

    # Forward edgelength: the negatives previously always used the helper's
    # default edge, whatever size had been requested for the positive cube.
    negs = [safe_random_subslice(vol, absidx, edge_length=edgelength) for _ in range(k)]
    return subvol, negs

def get_tumor_randseries_from_row(row, luna_subset_path,  resamp_path, edgelength=48, nsamp=20, ratio=0.3, verbose=False):
    """Get a bunch of randomly shifted cubes from the region around one annotation.

    row:              mapping with 'coordX', 'coordY', 'coordZ' and 'seriesuid' entries.
    luna_subset_path: directory handed to look_for_mhd, which appends
                      '/subsetN/' itself - so despite the name this is the
                      folder that CONTAINS the subset folders.
    resamp_path:      prefix to which '<seriesuid>.mhd.npy' is appended to load the volume.
    edgelength:       cube edge length.
    nsamp:            number of shifted cubes to attempt.
    ratio:            shifts are drawn per axis from [0, int(edgelength * ratio)).
    verbose:          print the origin, the integer index and the volume shape.

    Returns a list of the cubes whose shape is exactly
    (edgelength, edgelength, edgelength); truncated ones are dropped, so the
    list may be shorter than nsamp.
    """
    nx, ny, nz = row['coordX'], row['coordY'], row['coordZ']
    # Use the argument: the body used to read the notebook-global `luna_path`
    # and silently ignore what the caller passed in.
    path = look_for_mhd(luna_subset_path, row['seriesuid'])
    origin = get_origin(path)
    absidx = np.asarray(coord_to_ary_idx((nx, ny, nz), origin))
    vol = np.load(resamp_path + row['seriesuid'] + '.mhd.npy')
    if verbose:
        # Loop-invariant, so reported once instead of once per sample.
        print('Origin: {}'.format(origin))
        print('Abs Index: {}'.format(absidx))
        print('Vol shape: {}'.format(vol.shape))
    m = int(edgelength * ratio)
    full_shape = (edgelength, edgelength, edgelength)
    subvols = []
    for _ in range(nsamp):
        offset = np.random.randint(0, m, 3)  # non-negative shift on every axis
        subvol = subslice(vol, absidx + offset, edge_length=edgelength)
        if subvol.shape == full_shape:
            subvols.append(subvol)
    return subvols



In [6]:
# Dataset locations (absolute paths on the author's machine).
drive = 'tris'
subfolder = 'bowl17'
# INPUT_FOLDER = '/media/mike/{}/data/{}/kgsamples/'.format(drive, subfolder)
# patients = os.listdir(INPUT_FOLDER)
# patients.sort()

luna_path = '/media/mike/{}/data/{}/luna/'.format(drive, subfolder)
output_path = luna_path + 'output/'
resamp_path = '/media/mike/{}/data/{}/resampled_images/'.format(drive, subfolder)

# Collect every .mhd header found in subset0 ... subset9.
file_list = []
for i in range(10):
    luna_subset_path = luna_path + 'subset{}/'.format(i)
    files = glob.glob(luna_subset_path + "*.mhd")
    file_list.extend(files)

# Resampled volumes stored as <uid>.mhd.npy.
resamps = glob.glob(resamp_path + '*.mhd.npy')

print('# of files: ', len(file_list))
print('# of resamps:', len(resamps))

# of files:  888
# of resamps: 888


In [7]:
# Load the nodule annotations, attach the matching .mhd path to every row
# (get_filename yields None when no path matches) and drop incomplete rows.
df_node = pd.read_csv(luna_path + "annotations.csv")
print('Number of annotations:', len(df_node))
df_node = df_node.assign(file=df_node["seriesuid"].apply(get_filename)).dropna()
print('Len df_node:', len(df_node))

Number of annotations: 1186
Len df_node: 1186


In [8]:
# Annotations ordered from the largest to the smallest nodule diameter; preview the top rows.
dfs = df_node.sort_values(by='diameter_mm', ascending=False)
dfs.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,diameter_mm,file
765,1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644...,67.827256,85.379925,-109.746724,32.27003,/media/mike/tris/data/bowl17/luna/subset1/1.3....
34,1.3.6.1.4.1.14519.5.2.1.6279.6001.112740418331...,47.671057,37.64252,-99.890394,30.610406,/media/mike/tris/data/bowl17/luna/subset5/1.3....
1160,1.3.6.1.4.1.14519.5.2.1.6279.6001.943403138251...,-46.949664,72.636454,-95.644521,27.442423,/media/mike/tris/data/bowl17/luna/subset2/1.3....
998,1.3.6.1.4.1.14519.5.2.1.6279.6001.481278873893...,-103.132511,-5.774673,-206.35547,27.075443,/media/mike/tris/data/bowl17/luna/subset3/1.3....
1002,1.3.6.1.4.1.14519.5.2.1.6279.6001.487268565754...,119.208776,11.450374,-165.039862,26.837081,/media/mike/tris/data/bowl17/luna/subset8/1.3....


In [10]:
# Extract, in batches of K annotation rows, the cube around each annotation
# plus random negative cubes, and save one pair of .npy files per batch.
K = 100  # annotation rows per saved batch
N = 100 #len(dfs)
full_shape = (48, 48, 48)  # matches the default edge length of get_multi_volume_from_row
volume_dir = luna_path + 'volumes_48/'
os.makedirs(volume_dir, exist_ok=True)  # np.save does not create missing directories
for j in trange(0,3):
    tumor_volumes = []
    neg_volumes = []
    legend = []
    for i in trange(j*K, (j+1)*K):
        try:
            vv, negs = get_multi_volume_from_row(df_node.iloc[i], luna_path, resamp_path)
        except NotImplementedError as err:
    #         print('{:03}:File not found'.format(i))
            print(err)
            continue
        if vv.shape == full_shape:
            tumor_volumes.append(vv)
            # Apply the same shape check to the negatives: a single truncated
            # cube would make the list ragged and np.save could not store it
            # as a regular array.
            neg_volumes += [neg for neg in negs if neg.shape == full_shape]
        else:
            print('borked')
    np.save(volume_dir + 'tumor_c_volumes{:02}'.format(j), tumor_volumes)
    np.save(volume_dir + 'neg_volumes{:02}'.format(j), neg_volumes)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:01<01:54,  1.16s/it][A
  3%|▎         | 3/100 [00:02<01:32,  1.05it/s][A
  4%|▍         | 4/100 [00:03<01:44,  1.09s/it][A
  5%|▌         | 5/100 [00:03<01:15,  1.26it/s][A
  8%|▊         | 8/100 [00:04<01:01,  1.50it/s][A
  9%|▉         | 9/100 [00:06<01:24,  1.08it/s][A
 10%|█         | 10/100 [00:07<01:40,  1.12s/it][A
 11%|█         | 11/100 [00:09<01:47,  1.21s/it][A
 12%|█▏        | 12/100 [00:10<01:38,  1.12s/it][A
 13%|█▎        | 13/100 [00:11<01:40,  1.16s/it][A
 15%|█▌        | 15/100 [00:12<01:23,  1.02it/s][A
 16%|█▌        | 16/100 [00:14<01:41,  1.20s/it][A
 17%|█▋        | 17/100 [00:15<01:42,  1.24s/it][A
 18%|█▊        | 18/100 [00:16<01:39,  1.22s/it][A
 21%|██        | 21/100 [00:16<01:08,  1.16it/s][A
 24%|██▍       | 24/100 [00:17<00:53,  1.41it/s][A
 25%|██▌       | 25/100 [00:19<01:06,  1.13it/s][A
 26%|██▌       | 26/100 [00:19<01:02,  1.

borked


[A
 69%|██████▉   | 69/100 [00:51<00:32,  1.05s/it][A
 70%|███████   | 70/100 [00:53<00:44,  1.48s/it][A
 71%|███████   | 71/100 [00:55<00:41,  1.43s/it][A
 72%|███████▏  | 72/100 [00:56<00:39,  1.42s/it][A
 73%|███████▎  | 73/100 [00:57<00:36,  1.35s/it][A
 74%|███████▍  | 74/100 [00:58<00:25,  1.01it/s][A
 75%|███████▌  | 75/100 [00:59<00:25,  1.00s/it][A
 76%|███████▌  | 76/100 [00:59<00:18,  1.32it/s][A
 77%|███████▋  | 77/100 [01:00<00:22,  1.02it/s][A
 78%|███████▊  | 78/100 [01:01<00:21,  1.02it/s][A
 79%|███████▉  | 79/100 [01:01<00:15,  1.36it/s][A
 80%|████████  | 80/100 [01:02<00:10,  1.83it/s][A
 81%|████████  | 81/100 [01:02<00:07,  2.38it/s][A
 82%|████████▏ | 82/100 [01:02<00:05,  3.03it/s][A
 83%|████████▎ | 83/100 [01:02<00:04,  3.76it/s][A
 84%|████████▍ | 84/100 [01:03<00:07,  2.14it/s][A
 85%|████████▌ | 85/100 [01:03<00:05,  2.69it/s][A
 86%|████████▌ | 86/100 [01:04<00:09,  1.47it/s][A
 87%|████████▋ | 87/100 [01:06<00:11,  1.17it/s][A
 88%|███

borked


[A
 74%|███████▍  | 74/100 [01:03<00:24,  1.08it/s][A
 75%|███████▌  | 75/100 [01:05<00:25,  1.01s/it][A
 76%|███████▌  | 76/100 [01:05<00:17,  1.36it/s][A
 78%|███████▊  | 78/100 [01:05<00:12,  1.82it/s][A
 79%|███████▉  | 79/100 [01:05<00:08,  2.37it/s][A
 80%|████████  | 80/100 [01:05<00:06,  3.01it/s][A
 81%|████████  | 81/100 [01:06<00:10,  1.87it/s][A
 82%|████████▏ | 82/100 [01:07<00:12,  1.48it/s][A
 83%|████████▎ | 83/100 [01:09<00:16,  1.05it/s][A
 84%|████████▍ | 84/100 [01:09<00:11,  1.38it/s][A
 85%|████████▌ | 85/100 [01:10<00:12,  1.22it/s][A
 86%|████████▌ | 86/100 [01:11<00:13,  1.03it/s][A
 87%|████████▋ | 87/100 [01:13<00:14,  1.14s/it][A
 88%|████████▊ | 88/100 [01:13<00:10,  1.17it/s][A
 89%|████████▉ | 89/100 [01:13<00:07,  1.55it/s][A
 90%|█████████ | 90/100 [01:13<00:04,  2.01it/s][A
 91%|█████████ | 91/100 [01:14<00:05,  1.52it/s][A
 92%|█████████▏| 92/100 [01:15<00:03,  2.01it/s][A
 94%|█████████▍| 94/100 [01:15<00:02,  2.66it/s][A
 96%|███

In [None]:
# Convert tumor_volumes to an ndarray and print its shape.
# NOTE(review): rebinds `tumor_volumes` to a converted copy of itself, so the
# result depends on whatever an earlier cell left in that name.
tumor_volumes = np.array(tumor_volumes)
print(tumor_volumes.shape)
    
    

In [None]:


# Collect randomly shifted cubes around the annotations of `dfs`, K rows per
# batch, and save each batch to its own .npy file.
# NOTE(review): the batch index starts at 1, so rows 0..K-1 are not processed
# by this cell - confirm that is intended.
K = 100
for j in trange(1, 3):
    neg_volumes = [] # memory cleanup
    tumor_volumes = []
    for i in trange(j * K, (j + 1) * K):
        try:
            vv = get_tumor_randseries_from_row(dfs.iloc[i], luna_path, resamp_path)
            # vv is already filtered to full-size cubes by the helper.
            tumor_volumes.extend(vv)
        except Exception as err:
            # Best effort: report the failing row's error and carry on with the next row.
            print(err)
    np.save(luna_path + 'volumes_48/' + 'tumor_p_volumes{:02}'.format(j), tumor_volumes)

In [None]:
# Number of entries currently held in tumor_volumes.
len(tumor_volumes)

In [None]:
# Show the 2-D slice at index 24 (first axis) of entry 20 in tumor_volumes.
plt.imshow(tumor_volumes[20][24])

In [None]:
# Reload the saved batch 01 of shifted tumor cubes from a hard-coded absolute
# path (rebinds `tumor_volumes`) and display its type and length.
tumor_volumes = np.load('/media/mike/tris/data/bowl17/luna/volumes_48/tumor_p_volumes01.npy')
type(tumor_volumes), len(tumor_volumes)

In [None]:
# Shape of tumor_volumes; requires it to be an ndarray (a plain list has no .shape).
tumor_volumes.shape

In [None]:
# Set the batch index to 0; the save call that would use it is left disabled.
j = 0
# np.save(luna_path + 'volumes_48/' + 'tumor_p_volumes{:02}'.format(j), tumor_volumes)

In [None]:
# Rebind tumor_volumes to an empty list, discarding the previous contents.
tumor_volumes = []

In [None]:
# Display the resampled-image directory prefix currently in use.
resamp_path

In [None]:
# Number of entries and the shape of the first one.
# NOTE(review): raises IndexError while tumor_volumes is empty, so a cell that
# refills it must run first.
len(tumor_volumes), tumor_volumes[0].shape

In [None]:
# dtype of the first entry of tumor_volumes.
tumor_volumes[0].dtype

In [None]:
# Convert tumor_volumes to a single int16 ndarray (rebinds the name).
tumor_volumes = np.array(tumor_volumes, dtype='int16')

In [None]:
# Shape of tumor_volumes; requires it to be an ndarray (a plain list has no .shape).
tumor_volumes.shape