In [1]:
%matplotlib inline
# magic function to set up inline plotting

from __future__ import print_function, division

import numpy as np # General number processing
import pandas as pd # This will handle our CSV file I/O and sorting
import dicom
import os # operating system library
import glob # for grabbing file paths
import scipy.ndimage # for resampling
import matplotlib.pyplot as plt # plotting
import ipyvolume # 3d plotting


import SimpleITK as sitk

In [9]:
# IO helper functions
def gen_get_filename(file_list):
    """Build a UID -> path lookup function bound to ``file_list``.

    A closure is used so the list of paths does not have to live in a global:
    the returned function keeps a reference to ``file_list`` and searches it
    every time it is called.

    Parameters
    ----------
    file_list : iterable of str
        Candidate file paths.

    Returns
    -------
    callable
        ``get_filename(uid)``; see its docstring.
    """
    def get_filename(uid):
        """Return the first entry of ``file_list`` containing ``uid``, or 'not_found'."""
        candidates = (path for path in file_list if uid in path)
        # Substring match; the string sentinel is returned when nothing matches.
        return next(candidates, 'not_found')
    return get_filename

# mhd_file_list

def get_filename(uid, file_list=None):
    """Look up the path whose text contains ``uid``.

    Parameters
    ----------
    uid : str
        Identifier to search for (substring match against each path).
    file_list : iterable of str
        Candidate paths. Required, despite being a keyword argument.

    Returns
    -------
    str or None
        The first matching path, or ``None`` when no path contains ``uid``.

    Raises
    ------
    ValueError
        If ``file_list`` is not supplied.
    """
    if file_list is None:
        raise ValueError('You must specify a list of file paths as a keyword argument')
    return next((path for path in file_list if uid in path), None)


In [10]:
# Configure the paths used to load data, then index the LUNA subset files.
# NOTE(review): every path below is an absolute path under /media/<USER>/...;
# the notebook only runs on a machine with that layout.
USER='mike'
# Folder whose entries are later listed into `patients`.
INPUT_FOLDER = '/media/{}/tera/data/databowl/kgsamples/'.format(USER)
# Root of the LUNA data; annotations.csv is read from here.
luna_path =  '/media/{}/tera/data/databowl/luna/'.format(USER)
# Sub-folder holding the .mhd files that are indexed below.
luna_subset_path = '/media/{}/tera/data/databowl/luna/subset0/'.format(USER)
output_path = luna_path + 'output/'
# All .mhd paths directly inside the subset folder (empty list if none exist).
mhd_file_list=glob.glob(luna_subset_path+"*.mhd")

# Build a UID -> path lookup bound to mhd_file_list (returns 'not_found' on a miss).
get_filename_mhd = gen_get_filename(mhd_file_list)

In [11]:
# Folder of saved numpy volumes; files are matched by the '*.mhd.npy' suffix.
resamp_path = '/media/{}/tera/data/databowl/resampled_images/'.format(USER)
resamps = glob.glob(resamp_path + '*.mhd.npy')
# Number of matching files found.
print(len(resamps))

89


In [12]:
# Directory entries of INPUT_FOLDER as a sorted list.
patients = sorted(os.listdir(INPUT_FOLDER))

In [13]:
# Load the LUNA16 annotations and attach, for each series UID, the path of the
# matching .mhd file from mhd_file_list.
df_node = pd.read_csv(luna_path+"annotations.csv")
# get_filename returns None when no path contains the UID.
df_node["file"] = df_node["seriesuid"].apply(get_filename, file_list=mhd_file_list)
# Drop rows containing any missing value; this removes annotations whose file
# was not found in the subset.
df_node = df_node.dropna()

In [14]:


#####################
#
# Helper function to read the origin stored in an image file
#
def get_origin(filepath):
    """Read the image at ``filepath`` with SimpleITK and return its origin.

    Parameters
    ----------
    filepath : str
        Path of an image file readable by ``sitk.ReadImage``.

    Returns
    -------
    numpy.ndarray
        The value of ``GetOrigin()`` converted to an array.
    """
    return np.array(sitk.ReadImage(filepath).GetOrigin())

def coord_to_ary_idx(coord, origin, spacing=None):
    """Convert an (x, y, z) coordinate into integer offsets from ``origin``.

    The offset is ``(coord - origin) / spacing`` per axis, truncated toward
    zero with ``int``.

    Parameters
    ----------
    coord : sequence of 3 numbers
        Position as (x, y, z).
    origin : sequence of 3 numbers
        Image origin as (x, y, z), e.g. the value returned by ``get_origin``.
    spacing : sequence of 3 numbers or scalar, optional
        Voxel size along (x, y, z). ``None`` (the default) means unit spacing,
        which reproduces the original behaviour of this helper.
        NOTE(review): unit spacing is only right if the volume was resampled
        to 1 unit per voxel -- confirm for the arrays used with this function.
        Axis direction (orientation) is not taken into account.

    Returns
    -------
    list of int
        ``[x, y, z]`` offsets. Note the order: the volumes in this notebook
        are indexed ``[z, y, x]``, so the slice index is element 2.

    Raises
    ------
    ValueError
        If ``coord - origin`` does not have exactly three components.
    """
    offset = np.asarray(coord) - np.asarray(origin)
    if offset.shape != (3,):
        raise ValueError(
            'coord and origin must describe a single (x, y, z) point, got shape {}'.format(offset.shape)
        )
    if spacing is not None:
        offset = offset / np.asarray(spacing, dtype=float)
    return [int(component) for component in offset]

def get_fiducial_slice(coord, edgelen=48):
    """Return the bounds of a cube of side ~``edgelen`` centred on ``coord``.

    Parameters
    ----------
    coord : iterable of 3 numbers
        Cube centre as (x, y, z); each component is converted with ``int``.
    edgelen : int
        Edge length; the half-width used is ``edgelen // 2``.

    Returns
    -------
    tuple
        ``(x_lo, x_hi, y_lo, y_hi, z_lo, z_hi)``. Bounds are not clipped and
        may be negative.
    """
    cx, cy, cz = (int(c) for c in coord)
    half = edgelen // 2
    lo = (cx - half, cy - half, cz - half)
    hi = (cx + half, cy + half, cz + half)
    # Trace output lists the upper bound before the lower bound for each axis.
    print(hi[0], lo[0], hi[1], lo[1], hi[2], lo[2])
    return (lo[0], hi[0], lo[1], hi[1], lo[2], hi[2])
    
def draw_fiducial_cube(ary_shape, coord, edgelen=48, dtype='int16'):
    """Return a mask that is 1 inside a cube around ``coord`` and 0 elsewhere.

    Parameters
    ----------
    ary_shape : tuple of int
        Shape of the mask to create; axes are treated as (z, y, x).
    coord : iterable of 3 numbers
        Cube centre as (x, y, z) array indices.
    edgelen : int
        Cube edge length, forwarded to ``get_fiducial_slice``.
    dtype : numpy dtype or str
        dtype of the returned mask.

    Returns
    -------
    numpy.ndarray
        Array of shape ``ary_shape`` with ones in the part of the cube that
        lies inside the volume and zeros everywhere else.
    """
    x0, x1, y0, y1, z0, z1 = get_fiducial_slice(coord, edgelen=edgelen)

    # numpy reads a negative slice bound as "count from the end of the axis",
    # so a cube hanging over the low edge of the volume used to produce a
    # wrong (usually empty) mask. Clamp every bound at 0; bounds past the high
    # edge are already truncated by slicing.
    x0, x1 = max(x0, 0), max(x1, 0)
    y0, y1 = max(y0, 0), max(y1, 0)
    z0, z1 = max(z0, 0), max(z1, 0)

    mask = np.zeros(ary_shape, dtype=dtype)
    mask[z0:z1, y0:y1, x0:x1] = 1
    return mask
 

        
# def strip_uid(path):
#     fname = os.path.basename(path)
#     return fname.strip('.mhd.npy')
# #


In [15]:
# Order the annotations from largest to smallest diameter so the biggest
# nodules come first, then preview the top rows.
dfs = df_node.sort_values(by='diameter_mm', ascending=False)
dfs.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,diameter_mm,file
1011,1.3.6.1.4.1.14519.5.2.1.6279.6001.511347030803...,60.775061,74.12397,-214.782347,25.233202,/media/mike/tera/data/databowl/luna/subset0/1....
1141,1.3.6.1.4.1.14519.5.2.1.6279.6001.905371958588...,109.116637,48.589511,-120.892058,21.583112,/media/mike/tera/data/databowl/luna/subset0/1....
1084,1.3.6.1.4.1.14519.5.2.1.6279.6001.752756872840...,56.393154,67.680087,-64.674453,19.653877,/media/mike/tera/data/databowl/luna/subset0/1....
420,1.3.6.1.4.1.14519.5.2.1.6279.6001.202811684116...,-83.158083,-21.678997,-97.004376,18.783233,/media/mike/tera/data/databowl/luna/subset0/1....
336,1.3.6.1.4.1.14519.5.2.1.6279.6001.187451715205...,94.635117,-17.372059,-204.396127,17.753232,/media/mike/tera/data/databowl/luna/subset0/1....


In [None]:
# Number of annotation rows left after rows with missing values were dropped.
print(len(df_node))


In [None]:
# Not needed?
# uids = [strip_uid(path) for path in resamps]
# sum([uid in uids for uid in dfs['seriesuid'].values]) # check if the data is there
# sum([dfs['seriesuid'].values[0] in path for path in resamps])

In [None]:
# NOTE(review): despite the name, this picks positional row 3 of `dfs` (the
# fourth row of the descending-diameter sort), not the first -- confirm this
# is the intended case.
biggest = dfs.iloc[3]

In [None]:
# mask = np.ones(pix_resampled.shape)
# mz, my, mx = pix_resampled.shape
# print(mz, my, mx)
# mask[140:] = 0
# mask[:110] = 0
# mask[:,:50] = 0
# mask[:,my-50:] = 0
# mask[:,:,mx-50:] = 0
# mask[:,:,:50] = 0
# mask2 = np.less(pix_resampled,1267)
# mask3 = np.greater(pix_resampled,400)

# ipyvolume.quickvolshow(pix_resampled*mask*mask3, width=1000, height=1000, level=[.26, .3, .85], level_width=[.5, .1, .05])

In [None]:
# Load the saved numpy volume for the selected series; the file name is the
# series UID plus the '.mhd.npy' suffix inside resamp_path.
ary = np.load(resamp_path + biggest['seriesuid'] + '.mhd.npy')
# Quick sanity summary: shape, minimum, maximum and mean of the volume.
print(ary.shape, np.amin(ary), np.amax(ary), np.mean(ary))

In [None]:
# Display the selected annotation row.
biggest

In [None]:
# Annotated position of the selected nodule as an (x, y, z) tuple, taken from
# the coordX / coordY / coordZ columns.
nodeXYZ = biggest['coordX'], biggest['coordY'], biggest['coordZ']
nodeXYZ

In [None]:
# Origin stored in the series' .mhd file; used below to turn nodeXYZ into
# array offsets.
origin = get_origin(luna_subset_path + biggest['seriesuid'] + '.mhd')
origin

In [None]:
# ary0 = ary - np.amin(ary) + 1

In [None]:
# ary2 = ary * (ary < 4000)

In [None]:
# mask = ary < -2000
# mask = mask * 1800
# ary2 = ary + mask
# mask.shape, np.mean(mask)

In [None]:
# ary2.shape, np.mean(ary2)

In [None]:
# Histogram of every voxel value in the loaded volume (80 bins).
plt.hist(ary.flatten(), bins=80, color='c')
plt.xlabel("Pseudo-Hounsfield Units (HU)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# `ary2` is only ever assigned in commented-out cells, so referencing it
# raised NameError on a fresh kernel. Show the loaded volume `ary` instead.
# NOTE(review): if one of the commented-out `ary2` variants was intended,
# restore that definition and switch this back.
img = np.array(ary[180])  # copy, so blanking rows below does not modify `ary`
img[:20] = 0  # zero the first 20 rows of the slice as an orientation marker
plt.imshow(img)
plt.colorbar()

In [None]:
# `ary2` is never defined by an active cell (NameError on a fresh kernel);
# plot the intensity profile from the loaded volume `ary` instead:
# first-axis index 50, second-axis index 100.
plt.plot(ary[50][100])

In [None]:
# Integer [x, y, z] offsets of the nodule relative to the image origin.
absidx = coord_to_ary_idx(nodeXYZ, origin)
absidx

In [None]:
# Earlier experiments, kept for reference:
# fid = draw_fiducial_cube(ary.shape, nodeXYZ, 20)
# nporig = draw_fiducial_cube(ary.shape, (25,25,25))
# Mask with the same shape as the volume, marking a cube of edge length 30
# centred on the nodule's array offsets.
tumor = draw_fiducial_cube(ary.shape, absidx, 30)

In [None]:
# Value added to every voxel inside the nodule cube so it stands out.
TUMOR_BOOST=5000
# `ary0` was only assigned in a commented-out cell, so this cell raised
# NameError on a fresh kernel. Its definition is restored here: shift the
# volume so that its minimum value becomes 1.
ary0 = ary - np.amin(ary) + 1
vol = np.array(ary0, dtype=np.int16)
# NOTE(review): int16 tops out at 32767; assumes shifted values + TUMOR_BOOST
# stay below that -- confirm against the value range printed above.
vol += TUMOR_BOOST*tumor
# vol = np.array(vol, dtype=np.int16)
print(type(vol))
print(vol.shape, vol.dtype)

In [None]:
# Show the slice of the boosted volume at the nodule's z offset (absidx is
# ordered [x, y, z], so element 2 is used as the first-axis index).
plt.figure(figsize=(8,8))
plt.imshow(vol[absidx[2]])

In [None]:
# Always fails (unless Python runs with -O), so "Run All" stops at this cell.
# NOTE(review): looks like a deliberate stop before the 3D rendering cells --
# confirm, and remove it if the notebook should run to the end.
assert 0

In [None]:
# 3D render of the negated boosted volume, with the first and last 50 entries
# along the first axis cropped away.
# NOTE(review): the level / opacity / level_width values look hand-tuned for
# this volume -- confirm before reusing them on other data.
ipyvolume.quickvolshow(-vol[50:-50], width=1000, height=1000, level=[.12, .41, .57], opacity=[.01, .02, .01], level_width=[.05, .1, .05])

In [None]:
# Inspect a single voxel at the corner of slice 50 (index [50, 0, 0]).
ary[50][0][0]

In [None]:
# Always fails (unless Python runs with -O), so "Run All" stops at this cell.
# NOTE(review): looks like a deliberate stop before the remaining rendering
# cells -- confirm, and remove it if the notebook should run to the end.
assert 0

In [None]:
# 3D render of the unmodified loaded volume.
# NOTE(review): the level / opacity / level_width values look hand-tuned for
# this volume -- confirm before reusing them on other data.
ipyvolume.quickvolshow(ary, width=1000, height=1000, level=[.12, .41, .57], opacity=[.01, .02, .01], level_width=[.5, .1, .05])

In [None]:
# Keep only the first 180 entries along the first axis; everything from
# index 180 onward is multiplied by zero.
mask = np.ones(ary.shape)
mask[180:] = 0
# `ary0` was only assigned in a commented-out cell, so this cell raised
# NameError on a fresh kernel. Its definition is restored here: shift the
# volume so that its minimum value becomes 1.
ary0 = ary - np.amin(ary) + 1
ipyvolume.quickvolshow(ary0*mask, width=1000, height=1000, level=[.12, .41, .57], opacity=[.01, .02, .01], level_width=[.5, .1, .05])

In [None]:
# Value range (minimum, maximum) of the loaded volume.
np.amin(ary), np.amax(ary)