# Preprocessing xVertSeg



## Imports and Constants, etc.

In [None]:
import datetime
import importlib
import keras
from keras.layers import (Dense, SimpleRNN, Input, Conv1D, 
                          LSTM, GRU, AveragePooling3D, MaxPooling3D, GlobalMaxPooling3D,
                          Conv3D, UpSampling3D, BatchNormalization, Concatenate, Add)
from keras.models import Model
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import projd
import random
import re
import scipy
import shutil
import SimpleITK # xvertseg MetaImage files
import sys
from sklearn.model_selection import train_test_split
import uuid

import matplotlib.pyplot as plt # data viz
import seaborn as sns # data viz

import imageio # display animated volumes
from IPython.display import Image # display animated volumes

from IPython.display import SVG # visualize model
from keras.utils.vis_utils import model_to_dot # visualize model

# for importing local code
src_dir = str(Path(projd.cwd_token_dir('notebooks')) / 'src') # $PROJECT_ROOT/src
if src_dir not in sys.path:
    sys.path.append(src_dir)

import util
import preprocessing
import datagen
import modelutil
import xvertseg


# Training hyper-parameters shared by the project notebooks.
SEED = 0
EPOCHS = 100
BATCH_SIZE = 1
PATCH_SHAPE = (32, 32, 32)

MODEL_NAME = 'model_10'

# Root of all data; every other path below is derived from it.
DATA_DIR = Path('/data2').expanduser()
# UVMMC
NORMAL_SCANS_DIR = DATA_DIR / 'uvmmc/nifti_normals'
PROJECT_DATA_DIR = DATA_DIR / 'uvm_deep_learning_project'
PP_IMG_DIR = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed' # preprocessed scans dir
PP_MD_PATH = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed_metadata.pkl'
# xVertSeg
XVERTSEG_DIR = DATA_DIR / 'xVertSeg.v1'
PP_XVERTSEG_DIR = PROJECT_DATA_DIR / 'xVertSeg.v1' / 'preprocessed' # preprocessed scans dir
PP_XVERTSEG_MD_PATH = PROJECT_DATA_DIR / 'xVertSeg.v1' / 'preprocessed_metadata.pkl'


MODELS_DIR = PROJECT_DATA_DIR / 'models'
LOG_DIR = PROJECT_DATA_DIR / 'log'
TENSORBOARD_LOG_DIR = PROJECT_DATA_DIR / 'tensorboard' / MODEL_NAME
TMP_DIR = DATA_DIR / 'tmp'

# Make sure every directory used below exists.  exist_ok=True replaces the
# previous "check exists(), then mkdir()" pair, which could fail if the
# directory appeared between the two calls.  A path that exists but is a
# regular file now raises FileExistsError instead of being silently skipped.
for d in [DATA_DIR, NORMAL_SCANS_DIR, PROJECT_DATA_DIR, PP_IMG_DIR, MODELS_DIR, LOG_DIR, 
          TENSORBOARD_LOG_DIR, TMP_DIR, PP_MD_PATH.parent, PP_XVERTSEG_DIR, PP_XVERTSEG_MD_PATH.parent]:
    d.mkdir(parents=True, exist_ok=True)
        
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot styling.
sns.set()

# Load the IPython autoreload extension in mode 2, so edits to the local
# modules imported above are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Read Data

In [None]:
# Collect the per-scan info table for the raw xVertSeg data.  It is indexed
# with .loc below; columns used in this notebook: 'image_mhd', 'mask_mhd', 'id'.
infos = xvertseg.get_xvertseg_infos(XVERTSEG_DIR)

In [None]:
# Load the image volume for row 0, plus the accompanying object that exposes
# GetSpacing() (used further down to read the voxel spacing).
img, itk = xvertseg.load_xvertseg_img(infos.loc[0, 'image_mhd'])

In [None]:
# Load the mask volume for row 0 with the same loader used for the image.
mask, mitk = xvertseg.load_xvertseg_img(infos.loc[0, 'mask_mhd'])

## Normalization

In [None]:
def normalize_xvertseg_image(image):
    '''
    Rescale an xvertseg image to the range [0, 1].

    image: an xvertseg xyz oriented image array.  The voxel values look like
      hounsfield units shifted by +1000 so that they are non-negative.

    Values are mapped linearly so that 0 -> 0.0 and 2000 -> 1.0; anything
    outside that window is clipped to the nearest end of the range.

    return: a new, normalized array.  The input array is not modified.
    '''
    # Window bounds.  Air: -1000, Water: 0, Bone: 200, 700, 3000 hounsfield units.
    # https://en.wikipedia.org/wiki/Hounsfield_scale
    lower = 0000.0
    upper = 2000.0
    scaled = (image - lower) / (upper - lower)
    return np.clip(scaled, 0., 1.)


def plot_image_historgrams():
    '''
    Plot a 256-bin histogram of the voxel values of every xVertSeg image
    listed under XVERTSEG_DIR, one figure per scan.

    The helpers live in the xvertseg module, which this notebook imports as a
    module only, so they are called through that module here.  The previous
    unqualified calls raised NameError.
    '''
    infos = xvertseg.get_xvertseg_infos(XVERTSEG_DIR)
    # Walk the actual index labels rather than assuming they are 0..n-1.
    for idx in infos.index:
        img_zyx, _ = xvertseg.load_xvertseg_img(infos.loc[idx, 'image_mhd'])
        # swap first and last axes (z and x); this does not change the histogram.
        img = np.swapaxes(img_zyx, 0, 2)
        plt.hist(img.ravel(), 256)
        plt.title('image histogram for id ' + str(infos.loc[idx, 'id']))
        plt.show()
        


In [None]:
# Plot a histogram of the raw voxel values of the image loaded earlier (img).
# Do they look like hounsfield units?  Yes, but shifted.
# The title id comes from the preprocessed metadata table read here.
# infos = xvertseg.get_xvertseg_infos(PP_XVERTSEG_MD_PATH)
infos = xvertseg.read_xvertseg_metadata(PP_XVERTSEG_MD_PATH)
plt.hist(img.ravel())
plt.title('infos' + str(infos.loc[0, 'id']))
plt.show()


In [None]:
# Check the other images.
# This takes a few minutes.
# Note: this calls the xvertseg module's plot_image_historgrams, not the
# notebook-local function of the same name defined in an earlier cell.
xvertseg.plot_image_historgrams()


In [None]:
# Plot a histogram of the same image after normalize_xvertseg_image, which
# scales and clips the voxel values into [0, 1].
plt.hist(normalize_xvertseg_image(img).ravel(), bins=256)
plt.title('infos' + str(infos.loc[0, 'id']))
plt.show()


## Resample Image



In [None]:
# https://github.com/juliandewit/kaggle_ndsb2017/blob/master/step1_preprocess_luna16.py


def get_preprocessed_xvertseg_image_path(id, preprocessed_dir):
    '''
    Build the path of the preprocessed image file for a scan.

    id: integer scan id; zero-padded to at least 3 digits in the file name.
    preprocessed_dir: directory holding the preprocessed .npy files.

    return: the path as a string, e.g. <preprocessed_dir>/image006.npy
    '''
    filename = f'image{id:03}.npy'
    return str(Path(preprocessed_dir) / filename)


def get_preprocessed_xvertseg_binary_mask_path(id, preprocessed_dir):
    '''
    Build the path of the preprocessed binary mask file for a scan.

    id: integer scan id; zero-padded to at least 3 digits in the file name.
    preprocessed_dir: directory holding the preprocessed .npy files.

    return: the path as a string, e.g. <preprocessed_dir>/binmask006.npy
    '''
    filename = f'binmask{id:03}.npy'
    return str(Path(preprocessed_dir) / filename)


def get_preprocessed_xvertseg_categorical_mask_path(id, preprocessed_dir):
    '''
    Build the path of the preprocessed categorical mask file for a scan.

    id: integer scan id; zero-padded to at least 3 digits in the file name.
    preprocessed_dir: directory holding the preprocessed .npy files.

    return: the path as a string, e.g. <preprocessed_dir>/catmask006.npy
    '''
    filename = f'catmask{id:03}.npy'
    return str(Path(preprocessed_dir) / filename)


def resample_xvertseg_image(img, spacing, target_spacing, metadata_only=False):
    '''
    Resample an image with preprocessing.resample_image, printing the shape
    and spacing before and after.

    img: the image volume to resample.
    spacing: the current voxel spacing of img.
    target_spacing: the desired voxel spacing.
    metadata_only: forwarded unchanged to preprocessing.resample_image.

    return: (resampled image, resampled spacing) as returned by
      preprocessing.resample_image.
    '''
    # Report the inputs.
    print('img shape:', img.shape)
    print('img spacing:', spacing)
    print('target spacing:', target_spacing)

    # resample image
    result = preprocessing.resample_image(
        img, spacing, target_spacing, metadata_only=metadata_only)
    new_img, new_spacing = result

    # Report the outcome.
    print('resampled image spacing:', new_spacing)
    print('resampled image shape:', new_img.shape)
    return new_img, new_spacing



In [None]:
# Resample the image of row 0 to 1 x 1 x 1 target spacing.
infos = xvertseg.get_xvertseg_infos(XVERTSEG_DIR)
img, itk = xvertseg.load_xvertseg_img(infos.loc[0, 'image_mhd'])
spacing = np.array(itk.GetSpacing())    # spacing of voxels in world coor. (mm)
# target_spacing is reused by the mask-resampling cells further down.
target_spacing = (1., 1., 1.)
resampled_img, resampled_spacing = resample_xvertseg_image(img, spacing, target_spacing=target_spacing)

In [None]:
# Show the spacing and shape actually obtained from resampling.
print('resampled_spacing:', resampled_spacing)
print('resampled_img.shape:', resampled_img.shape)

In [None]:
# Visual check of the resampled image using the project's animate_crop helper.
util.animate_crop(resampled_img, step=50)

## Resample Mask

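Masks encode 6 classes in their voxel values: the background plus the five label values 200, 210, 220, 230 and 240.  To be resampled, a mask is split into one binary mask per label; each binary mask is resized and re-binarized, and the results are then recombined.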

In [None]:
# https://github.com/juliandewit/kaggle_ndsb2017/blob/master/step1_preprocess_luna16.py

# Voxel values that mark the labelled (non-background) classes in a mask.
XVERTSEG_MASK_VALS = (200, 210, 220, 230, 240)

def xvertseg_mask_layers_gen(mask, vals=XVERTSEG_MASK_VALS):
    '''
    Lazily yield one binary layer per mask value.

    mask: a categorical mask array.
    vals: the mask values to extract, in the order they are yielded.

    yields: (layer, val) where layer is a float array shaped like mask that is
      1.0 where mask == val and 0.0 elsewhere.

    Layers are produced one at a time so that they never all sit in memory
    together.
    '''
    for val in vals:
        # Boolean match converted straight to a fresh 0.0/1.0 float array.
        yield (mask == val).astype(np.float64, order='C'), val

In [None]:
# Load the mask of row 0 and read its voxel spacing.
mimg, mitk = xvertseg.load_xvertseg_img(infos.loc[0, 'mask_mhd'])
spacing = np.array(mitk.GetSpacing())    # spacing of voxels in world coor. (mm)
# For every label value: histogram the binary layer, resample it to
# target_spacing (defined in an earlier cell), histogram the result, and
# compare the unique values/counts before and after resampling.
for layer, val in xvertseg_mask_layers_gen(mimg):
    print('val:', val)
    plt.hist(layer.ravel(), bins=256)
    plt.show()
    print('resampling...')
    # No re-binarization here: this shows the raw values resampling produces.
    rlayer, rspacing = preprocessing.resample_image(layer, spacing, target_spacing)
    print('resampled spacing:', rspacing)
    plt.hist(rlayer.ravel(), bins=256)
    plt.show()
    lu, lcounts = np.unique(layer.ravel(), return_counts=True)
    print('layer unique vals:', lu)
    print('layer unique counts:', lcounts)
    rlu, rlcounts = np.unique(rlayer.ravel(), return_counts=True)
    print('rlayer unique vals:', rlu)
    print('rlayer unique counts:', rlcounts)
    
    # display(util.animate_crop(layer))

In [None]:
def binarize_mask(mask, p=0.5):
    '''
    Threshold an array in place so that it only holds 0s and 1s.

    mask: a numpy array whose values will be thresholded by p.  >p -> 1.  <=p -> 0.
      The array is modified in place.
    p: the threshold.

    return: the same array object that was passed in.
    '''
    # Evaluate both comparisons against the ORIGINAL values before writing
    # anything.  Comparing after the first assignment would re-test the freshly
    # written 1s, so any p >= 1 would zero the whole mask.
    above = mask > p
    at_or_below = mask <= p
    mask[above] = 1
    mask[at_or_below] = 0
    return mask

    
def resample_xvetseg_mask_layer(layer, spacing, target_spacing, metadata_only=False, p=0.5):
    '''
    Resample a single binary mask layer and re-binarize the result.

    layer: the binary mask layer to resample.
    spacing: the current voxel spacing of layer.
    target_spacing: the desired voxel spacing.
    metadata_only: forwarded unchanged to preprocessing.resample_image.
    p: Binarization threshold.  Everything greater than this threshold is set to 1.
      Everything less than or equal to p is set to 0.

    return: (re-binarized resampled layer, resampled spacing).
    '''
    print('resampling...')
    resized, resized_spacing = preprocessing.resample_image(
        layer, spacing, target_spacing, metadata_only=metadata_only)
    print('resampled spacing:', resized_spacing)

    # Clean the resampled values back to 0/1.
    return binarize_mask(resized, p=p), resized_spacing


In [None]:
# resampling makes the masks a little bit (0.0003% more of image is black) darker.
# For each label layer, compare the share of 1.0 voxels before and after
# resampling + re-binarization.
for layer, val in xvertseg_mask_layers_gen(mimg):
    print('layer.shape:', layer.shape)
    print('layer val:', val)
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    num_voxels = np.prod(layer.shape)
    print('num_voxels:', num_voxels)
    print('0.0%:', (1 - np.sum(layer) / num_voxels) * 100)
    pct1 = np.sum(layer) / num_voxels * 100
    print('1.0%:', pct1)
    print('spacing:', spacing)
    rlayer, rspacing = resample_xvetseg_mask_layer(layer, spacing, target_spacing)
    print('rlayer.shape:', rlayer.shape)
    rlu, rlcounts = np.unique(rlayer.ravel(), return_counts=True)
    resampled_num_voxels = np.prod(rlayer.shape)
    print('resampled_num_voxels:', resampled_num_voxels)
    print('0.0%:', (1 - np.sum(rlayer) / resampled_num_voxels) * 100)
    rpct1 = (np.sum(rlayer) / resampled_num_voxels) * 100
    print('1.0%:', rpct1)
    # a gross measure of accuracy for the resampling and rebinarization process.
    print('Number of 1.0/true voxels more (or less) than expected from the original 1.0%:')
    # pct1 and rpct1 are percentages, so divide by 100 to get a voxel count
    # (the previous expression was 100 times too large).
    print((rpct1 - pct1) / 100 * resampled_num_voxels)
    print('rlayer unique vals:', rlu)
    print('rlayer unique counts:', rlcounts)
    display(util.animate_crop(rlayer, axis=0, step=10))


In [None]:
def resample_xvertseg_mask(img, spacing, target_spacing, metadata_only=False, bin_thresh=0.5):
    '''
    xVertSeg image masks have 6 classes embedded in their values.
    To be resampled, they need to be split into binary masks, resized, re-binarized, and recombined.

    img: a 3d volume that is an image mask.
    spacing: the size of a voxel in some units.  E.g. [0.3, 0.3, 0.9]
    target_spacing: the size of a voxel after resampling, in some units.  E.g. [1.0, 1.0, 1.0]
    metadata_only: accepted for interface compatibility but currently not forwarded to the
      per-layer resampling.  NOTE(review): confirm whether it should be.
    bin_thresh: binarization threshold.  Used to clean up mask after resampling.

    returns: (resampled binary mask, resampled categorical mask, resampled spacing).  The
      resampled spacing may differ from target_spacing because volumes have integer dimensions.

    raises: Exception if two layers come back with different resampled spacings.
    '''
    resampled_spacing = None
    resampled_binary_mask = None
    resampled_categorical_mask = None
    # split the image into layers, one layer for each category (except background category).
    # The layers are taken from the img argument; reading the notebook global `mimg` here
    # made the function ignore the mask it was given.
    for layer, val in xvertseg_mask_layers_gen(img):
        print('resampling mask layer for val:', val)
        # rlayer is a binary mask, resampled from layer.
        rlayer, rspacing = resample_xvetseg_mask_layer(layer, spacing, target_spacing, p=bin_thresh)

        if resampled_spacing is None:
            # First layer: remember its spacing and allocate the accumulators.
            resampled_spacing = rspacing
            print('resampled_spacing:', resampled_spacing)
            resampled_binary_mask = np.zeros(rlayer.shape)
            resampled_categorical_mask = np.zeros(rlayer.shape)

        if np.any(resampled_spacing != rspacing):
            raise Exception('Resampled spacing did not match previous resampled spacing!', resampled_spacing, rspacing)

        # where rlayer and running mask both have data, ignore rlayer data (someone got there first).
        rlayer[(resampled_binary_mask > 0) & (rlayer > 0)] = 0
        resampled_binary_mask = np.add(resampled_binary_mask, rlayer)
        # Write the class value back wherever this layer claimed a voxel.
        resampled_categorical_mask = np.add(resampled_categorical_mask, rlayer * val)

    resampled_binary_mask = binarize_mask(resampled_binary_mask, p=bin_thresh)

    return resampled_binary_mask, resampled_categorical_mask, resampled_spacing

In [None]:
# Resample the mask loaded earlier into binary and categorical masks.
rbmask, rcmask, rspacing = resample_xvertseg_mask(mimg, spacing, target_spacing)

In [None]:
# Visual check of the resampled binary mask.
util.animate_crop(rbmask, axis=0, step=10)

In [None]:
# Visual check of the resampled categorical mask.
util.animate_crop(rcmask, axis=0, step=10)

## Preprocess Images and Masks

This takes *forever*!

Each xVertSeg scan has its image resampled, normalized and saved.  Any mask is resampled and saved as a categorical mask, like the original.

In [None]:
# Uncomment to preprocess xVertSeg.
# delete_existing: True to remove existing preprocessed images.
# metadata_only: True to not generate any images and only read the original image.  Faster.
# bin_thresh: binary threshold for re-binarizing resampled binary masks.
#   Only updates metadata, not images.
# infos = xvertseg.preprocess_xvertseg(XVERTSEG_DIR, PP_XVERTSEG_DIR, PP_XVERTSEG_MD_PATH, start=0, metadata_only=False, 
#                    delete_existing=True, bin_thresh=0.5)


In [None]:
# Read the preprocessed-scan metadata table from PP_XVERTSEG_MD_PATH and preview it.
infos = xvertseg.read_xvertseg_metadata(PP_XVERTSEG_MD_PATH)
infos.head()

In [None]:
# Compare the voxel count of image006 before and after resampling.
# resampled size for image006.
r = 296, 296, 167
# original size
o = 1024, 1024, 100
# np.prod replaces np.product, which was removed in NumPy 2.0.
rv = np.prod(r)
ov = np.prod(o)
prop = rv / ov
print('original shape:', o)
print('resampled shape:', r)
print('original volume:', ov)
print('resampled volume:', rv)
print('resampled proportion of orginal volume:', prop)


In [None]:
# Load and animate preprocessed image 015.  The path is built from
# PP_XVERTSEG_DIR instead of repeating the absolute directory, so it follows
# any change to DATA_DIR; with the current constants it is the same path.
img = preprocessing.get_preprocessed_image(
    str(PP_XVERTSEG_DIR / 'image015.npy'))
util.animate_crop(img, axis=0, step=10)

In [None]:
# Load and animate the preprocessed mask for scan 015.  The path is built
# from PP_XVERTSEG_DIR instead of repeating the absolute directory; with the
# current constants it is the same path.
# NOTE(review): the path helpers in this notebook name mask files
# 'binmaskNNN.npy' / 'catmaskNNN.npy'; confirm that 'mask015.npy' still exists.
util.animate_crop(preprocessing.get_preprocessed_image(
    str(PP_XVERTSEG_DIR / 'mask015.npy')), axis=0, step=10)

In [None]:
# Re-read the preprocessed-scan metadata table.
infos = xvertseg.read_xvertseg_metadata(PP_XVERTSEG_MD_PATH)

In [None]:
# Preview the first rows of the metadata table.
infos.head()