# Preprocessing Again

Preprocess the UVMMC NIfTI data and store it on disk.  This saves a lot of time otherwise spent reading and preprocessing the images, and it speeds up training (a lot).  Preprocessing resamples every scan so that each voxel is 1mm^3 (1mm x 1mm x 1mm); the dimensions of every scan change accordingly.  Every voxel value is clipped to the range -1000 to 1000 Hounsfield units and then normalized to lie between 0 and 1.  The preprocessed scans are much smaller and load much faster from disk.  Metadata about the scans, like their new voxel dimensions or the path of the preprocessed image, is stored in a pickle file at a location that you can read and write.  This metadata is used by the later data-loading steps.

## Imports and Constants, etc.

In [None]:
import datetime
import importlib
import keras
from keras.layers import (Dense, SimpleRNN, Input, Conv1D, 
                          LSTM, GRU, AveragePooling3D, Conv3D, 
                          UpSampling3D, BatchNormalization)
from keras.models import Model
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import projd
import random
import re
import scipy
import shutil
import sys
from sklearn.model_selection import train_test_split
import uuid

import matplotlib.pyplot as plt # data viz
import seaborn as sns # data viz

import imageio # display animated volumes
from IPython.display import Image # display animated volumes

from IPython.display import SVG # visualize model
from keras.utils.vis_utils import model_to_dot # visualize model

# for importing local code
# Put the project's `src` directory on sys.path so that local modules such as
# `util` can be imported from this notebook.
# NOTE(review): presumably projd.cwd_token_dir('notebooks') returns the directory
# that contains the `notebooks` folder (the project root) — confirm against projd.
src_dir = str(Path(projd.cwd_token_dir('notebooks')) / 'src') # $PROJECT_ROOT/src
if src_dir not in sys.path:
    sys.path.append(src_dir)

import util
# Re-execute the module so that edits to util are picked up when this cell is re-run.
importlib.reload(util)


# Model/training settings.
# NOTE(review): none of the code visible in this preprocessing section reads
# SEED, EPOCHS, BATCH_SIZE, PATCH_SHAPE or MODEL_NAME — presumably they are kept
# for consistency with the training notebooks; confirm before removing.
SEED = 0
EPOCHS = 10
BATCH_SIZE = 1
PATCH_SHAPE = (32, 32, 32)

MODEL_NAME = 'model_01'

# Filesystem layout: raw scans live under DATA_DIR/uvmmc, everything this
# project produces lives under PROJECT_DATA_DIR.
DATA_DIR = Path('/data2').expanduser()
NORMAL_SCANS_DIR = DATA_DIR / 'uvmmc/nifti_normals'
FRACTURE_SCANS_DIR = DATA_DIR / 'uvmmc/nifti_fractures'
PROJECT_DATA_DIR = DATA_DIR / 'uvm_deep_learning_project'
PP_IMG_DIR = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed' # preprocessed scans dir
PP_MD_PATH = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed_metadata.pkl'

MODELS_DIR = PROJECT_DATA_DIR / 'models'
LOG_DIR = PROJECT_DATA_DIR / 'log'
TENSORBOARD_LOG_DIR = PROJECT_DATA_DIR / 'tensorboard'
TMP_DIR = DATA_DIR / 'tmp'

# Make sure every directory used below exists.  `exist_ok=True` replaces the
# separate exists()-then-mkdir() check, so there is no window in which another
# process can create the directory between the check and the creation.  A path
# that exists but is not a directory now raises FileExistsError instead of
# being silently skipped.
# NOTE(review): FRACTURE_SCANS_DIR is not in this list although NORMAL_SCANS_DIR
# is — confirm whether that is intentional.
for d in [DATA_DIR, NORMAL_SCANS_DIR, PROJECT_DATA_DIR, PP_IMG_DIR, MODELS_DIR, LOG_DIR,
          TENSORBOARD_LOG_DIR, TMP_DIR, PP_MD_PATH.parent]:
    d.mkdir(parents=True, exist_ok=True)
        
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plotting theme.
sns.set()


## Data Preprocessing

1. Read in the original images.
2. Resample the image so that the voxel size is 1mm x 1mm x 1mm.
3. Clip Hounsfield unit values and normalize them to be between 0 and 1 (as Julian de Wit recommends for Kaggle).

Some processing code is from https://github.com/juliandewit/kaggle_ndsb2017/blob/master/step1_preprocess_luna16.py.


In [None]:


def get_image(path):
    '''
    Read an image from the filesystem with nibabel and return its voxel array.

    path: location of the image file, handed to ``nib.load``.

    returns: the image data as an array.
    '''
    # ``get_data()`` is deprecated in nibabel (and raises in nibabel >= 5).
    # nibabel documents ``np.asanyarray(img.dataobj)`` as the replacement that
    # keeps the same return value as the old ``get_data()`` call.
    return np.asanyarray(nib.load(path).dataobj)
   
    
def get_preprocessed_image(path):
    '''
    Load a preprocessed scan from disk.

    path: location of a file readable by ``np.load``.

    returns: whatever ``np.load`` yields for that file.
    '''
    volume = np.load(path)
    return volume
    

def resample_image(image, spacing, new_spacing):
    '''
    Resize a volume so that its voxels have (approximately) a requested size.

    image: a 3d volume
    spacing: the size of a voxel in some units.  E.g. [0.3, 0.3, 0.9]
    new_spacing: the size of a voxel after resampling, in some units.  E.g. [1.0, 1.0, 1.0]

    returns: resampled image and new spacing adjusted because images have integer dimensions.
    '''
    # Import the submodule explicitly: a bare ``import scipy`` is not guaranteed
    # to expose ``scipy.ndimage``, and the ``scipy.ndimage.interpolation``
    # namespace used previously is deprecated in favour of ``scipy.ndimage``.
    from scipy import ndimage

    spacing = np.array(spacing)
    new_spacing = np.array(new_spacing)
    old_shape = np.array(image.shape)

    # calculate resize factor required to change image to new shape
    spacing_resize_factor = spacing / new_spacing
    new_real_shape = old_shape * spacing_resize_factor
    # Image dimensions must be whole numbers.  Keep at least one voxel per axis
    # so a very thin volume cannot be rounded down to an empty (size 0) axis.
    new_shape = np.maximum(np.round(new_real_shape), 1)
    real_resize_factor = new_shape / old_shape

    # adjusted spacing to account for integer dimensions of resized image.
    adjusted_spacing = spacing / real_resize_factor

    new_image = ndimage.zoom(image, real_resize_factor)
    return new_image, adjusted_spacing


def normalize_nifti_image(image):
    '''
    Rescale voxel values so that -1000 hounsfield units maps to 0 and 1000
    hounsfield units maps to 1, saturating everything outside that window.

    image: array of voxel values in hounsfield units.

    returns: a new array with values between 0 and 1; the input is not modified.
    '''
    # Window limits.  Air: -1000, Water: 0 hounsfield units.
    # Bone: 200, 700, 3000.  https://en.wikipedia.org/wiki/Hounsfield_scale
    lower_hu = -1000.0
    upper_hu = 1000.0
    window_width = upper_hu - lower_hu

    scaled = (image - lower_hu) / window_width
    # Saturate in place on the freshly created array.
    np.clip(scaled, 0., 1., out=scaled)
    return scaled


def get_preprocessed_image_path(scan_id, preprocessed_dir):
    '''
    Build the file path under which a scan's preprocessed array is stored.

    scan_id: identifier of the scan; becomes the file name stem.
    preprocessed_dir: directory that holds the preprocessed arrays.

    returns: the path ``<preprocessed_dir>/<scan_id>.npy`` as a string.
    '''
    file_name = f'{scan_id}.npy'
    target = Path(preprocessed_dir) / file_name
    return str(target)


def preprocess_nifti_scans(normals_dir, fractures_dir, dest_dir,
                           metadata_path, delete_existing=False):
    '''
    Resample and normalize every scan, save each result as a .npy file in
    dest_dir, and pickle a table describing the results to metadata_path.

    normals_dir: directory searched for the scans labelled 'normal'.
    fractures_dir: directory searched for the scans labelled 'fracture'.
    dest_dir: directory that receives the preprocessed arrays; created if missing.
    metadata_path: file the metadata table is pickled to.
    delete_existing: if True, remove dest_dir and its contents before starting.

    returns: the metadata table.  Besides the columns delivered by
    get_data_infos (of which 'path', 'id' and 'pixdim0'..'pixdim2' are read
    here) it has 'class', 'pp_path', 'pp_pixdim0'..'pp_pixdim2' (voxel size
    after resampling) and 'pp_dim0'..'pp_dim2' (image shape after resampling).
    '''
    # NOTE(review): get_nifti_files and get_data_infos are not defined in this
    # section — presumably helpers from elsewhere in the project; confirm they
    # are in scope when this runs.

    # Accept plain strings as well as Path objects.
    dest_dir = Path(dest_dir)

    # pathlib spells this is_dir(); Path objects have no isdir attribute.
    if delete_existing and dest_dir.is_dir():
        print('Removing existing dest dir:', dest_dir)
        shutil.rmtree(dest_dir)
    if not dest_dir.exists():
        print('Making preprocessed images destination:', dest_dir)
        dest_dir.mkdir(parents=True)

    # get all scan infos
    normal_infos = get_data_infos(get_nifti_files(normals_dir))
    fracture_infos = get_data_infos(get_nifti_files(fractures_dir))

    # add class label
    normal_infos['class'] = 'normal'
    fracture_infos['class'] = 'fracture'

    # process and save each image.
    infos = pd.concat([normal_infos, fracture_infos]).reset_index(drop=True) # index from 0:len(infos)

    # Standardize voxel size to 1mm^3 to reduce image size.
    target_spacing = (1.0, 1.0, 1.0)

    for i in infos.index:
        print('image index:', i)
        info = infos.loc[i, :]
        img_path = str(info['path'])
        print('image path:', img_path)
        scan_id = info['id']
        print('image id:', scan_id)
        img = get_image(img_path)
        print('image shape:', img.shape)

        spacing = (info['pixdim0'], info['pixdim1'], info['pixdim2'])
        print('image spacing:', spacing)
        print('new spacing:', target_spacing)
        resampled_img, resampled_spacing = resample_image(img, spacing, target_spacing)
        print('resampled image spacing:', resampled_spacing)
        print('resampled image shape:', resampled_img.shape)

        # The normalization function in this file is normalize_nifti_image.
        normalized_img = normalize_nifti_image(resampled_img)
        print('Normalized image shape:', normalized_img.shape)

        # save processed image
        path = get_preprocessed_image_path(scan_id, dest_dir)
        print(f'Saving preprocessed image to {path}.')
        np.save(path, normalized_img)

        # track image metadata
        infos.loc[i, 'pp_path'] = str(path)
        for axis in range(3):
            # voxel dimensions
            infos.loc[i, f'pp_pixdim{axis}'] = resampled_spacing[axis]
        for axis in range(3):
            # image dimensions
            infos.loc[i, f'pp_dim{axis}'] = resampled_img.shape[axis]

    # save metadata
    write_preprocessed_nifti_metadata(infos, path=metadata_path)
    return infos
    
        
def write_preprocessed_nifti_metadata(infos, path):
    '''
    Pickle the metadata object and store it in a file.

    infos: the object to serialize.
    path: file to write; an existing file is overwritten.
    '''
    print('saving preproccessed metadata to', path)
    with open(path, 'wb') as sink:
        payload = pickle.dumps(infos)
        sink.write(payload)
    
    
def read_preprocessed_nifti_metadata(path):
    '''
    Load the pickled metadata object from a file.

    path: file to read.

    returns: the unpickled object.
    '''
    print('reading preproccessed metadata from', path)
    with open(path, 'rb') as source:
        raw_bytes = source.read()
    return pickle.loads(raw_bytes)



### Preprocess Images and Save to Disk

In [None]:
# Uncomment to preprocess images
# infos = preprocess_nifti_scans(NORMAL_SCANS_DIR, FRACTURE_SCANS_DIR, dest_dir=PP_IMG_DIR, metadata_path=PP_MD_PATH)

### Testing and Validating Functions

In [None]:
# Test getting a raw image
# NOTE(review): `infos` must already exist in the session, e.g. from running the
# commented-out preprocessing call or from reading the pickled metadata.
data_infos = infos
img_info = data_infos.iloc[0]  # metadata row of the first scan
img = get_image(img_info['path'])  # 'path' is the source file of the scan, not the preprocessed copy

In [None]:
# Visual check of the raw volume loaded in the previous cell.
# NOTE(review): `animate_crop` is not defined in this section — presumably a
# project helper; confirm where it comes from.
animate_crop(img, axis=2, step=5)

In [None]:
# Sanity check: after resizing the voxels, the resampled image should have
# roughly the shape we expect.

# Original voxel size along each axis, taken from the scan's metadata row.
img_spacing = tuple(img_info[column] for column in ('pixdim0', 'pixdim1', 'pixdim2'))
print('Shape and spacing before resampleing\t', img.shape, img_spacing)

target_img_spacing = (1., 1., 1.)
print('Target spacing:', target_img_spacing)

resampled_img, resampled_spacing = resample_image(
    img, img_spacing, target_img_spacing)
print("Shape after resampling\t", resampled_img.shape, resampled_spacing)

animate_crop(resampled_img)


In [None]:
# Test reading metadata, which contains the preprocessed image spacings and file paths
# The reader defined in this notebook is read_preprocessed_nifti_metadata.
infos = read_preprocessed_nifti_metadata(PP_MD_PATH)
# One (pixdim0, pixdim1, pixdim2) tuple per scan: the voxel size after resampling.
pp_spacings = list(zip(infos['pp_pixdim0'], infos['pp_pixdim1'], infos['pp_pixdim2']))
pp_paths = list(infos['pp_path'])

In [None]:
# Show the resampled voxel sizes of the first five scans.
pp_spacings[:5]

In [None]:
# Show the preprocessed file paths of the first five scans.
pp_paths[:5]

In [None]:
# Test that preprocessed images look reasonable when visualized
# Preview at most the first three scans; slicing the index (instead of a fixed
# range(3)) avoids a KeyError when the metadata holds fewer than three rows.
for i in infos.index[:3]:
    img = get_preprocessed_image(infos.loc[i, 'pp_path'])
    scan_id = infos.loc[i, 'id']
    print(f'image {i} scan id {scan_id} shape {img.shape}')
    display(animate_crop(img))