# Reading xVertSeg

First contact with the xVertSeg data in code: read the files and build a metadata table describing the xVertSeg scans.


## Imports and Constants, etc.

In [None]:
import datetime
import importlib
import keras
from keras.layers import (Dense, SimpleRNN, Input, Conv1D, 
                          LSTM, GRU, AveragePooling3D, MaxPooling3D, GlobalMaxPooling3D,
                          Conv3D, UpSampling3D, BatchNormalization, Concatenate, Add)
from keras.models import Model
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import projd
import random
import re
import scipy
import shutil
import SimpleITK
import sys
from sklearn.model_selection import train_test_split
import uuid

import matplotlib.pyplot as plt # data viz
import seaborn as sns # data viz

import imageio # display animated volumes
from IPython.display import Image # display animated volumes

from IPython.display import SVG # visualize model
from keras.utils.vis_utils import model_to_dot # visualize model

# for importing local code
src_dir = str(Path(projd.cwd_token_dir('notebooks')) / 'src') # $PROJECT_ROOT/src
if src_dir not in sys.path:
    sys.path.append(src_dir)

import util
importlib.reload(util)
import preprocessing
importlib.reload(preprocessing)
import datagen
importlib.reload(datagen)
import modelutil
importlib.reload(modelutil)

# Run-wide settings.  NOTE(review): none of SEED, EPOCHS, BATCH_SIZE or PATCH_SHAPE is read by
# the cells visible in this notebook -- presumably carried over from a modelling notebook; confirm.
SEED = 0
EPOCHS = 100
BATCH_SIZE = 1
PATCH_SHAPE = (32, 32, 32)

# Also used as the name of the tensorboard log subdirectory (see TENSORBOARD_LOG_DIR below).
MODEL_NAME = 'model_09'

# Machine-specific data root.  The path contains no '~', so expanduser() returns it unchanged.
DATA_DIR = Path('/data2').expanduser()
NORMAL_SCANS_DIR = DATA_DIR / 'uvmmc/nifti_normals'
PROJECT_DATA_DIR = DATA_DIR / 'uvm_deep_learning_project'
PP_IMG_DIR = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed' # preprocessed scans dir
PP_MD_PATH = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed_metadata.pkl'

# xVertSeg counterparts of PP_IMG_DIR and PP_MD_PATH, kept under the project data dir.
PP_XVERTSEG_IMG_DIR = PROJECT_DATA_DIR / 'xVertSeg.v1' / 'preprocessed' # preprocessed scans dir
PP_XVERTSEG_PATH = PROJECT_DATA_DIR / 'xVertSeg.v1' / 'preprocessed_metadata.pkl'


# Output locations shared across models, except the tensorboard dir which is per MODEL_NAME.
MODELS_DIR = PROJECT_DATA_DIR / 'models'
LOG_DIR = PROJECT_DATA_DIR / 'log'
TENSORBOARD_LOG_DIR = PROJECT_DATA_DIR / 'tensorboard' / MODEL_NAME
TMP_DIR = DATA_DIR / 'tmp'

# Make sure every directory this notebook reads from or writes to exists.  For the two
# metadata pickle paths it is the parent directory that is created, not the file.
# exist_ok=True lets mkdir() itself tolerate an already-existing directory, so there is no
# gap between a separate exists() check and the creation.
for d in [DATA_DIR, NORMAL_SCANS_DIR, PROJECT_DATA_DIR, PP_IMG_DIR, MODELS_DIR, LOG_DIR, 
          TENSORBOARD_LOG_DIR, TMP_DIR, PP_MD_PATH.parent, PP_XVERTSEG_IMG_DIR, PP_XVERTSEG_PATH.parent]:
    d.mkdir(parents=True, exist_ok=True)
        
%matplotlib inline
sns.set()

%load_ext autoreload
%autoreload 2

## Read Data

In [None]:
# Root directory of the xVertSeg.v1 dataset; passed to get_xvertseg_infos below.
XVERTSEG_DIR = DATA_DIR / 'xVertSeg.v1'

def get_mhd_raw_id(d):
    '''
    List the MetaImage header files in a directory with their raw files and scan ids.

    d: a Path, a directory containing paired MetaImage format files
       (e.g. image016.mhd and image016.raw).
    returns a triple of parallel lists, ordered by header path: the mhd files (as str),
    the raw files (as Path), and the xvertseg scan ids (as int).  The id is the 3 digits
    that end the filename before the .mhd extension.
    raises ValueError if a .mhd filename does not end in 3 digits.
    '''
    mhds, raws, ids = [], [], []
    # glob() yields files in arbitrary order; sort so that results are reproducible.
    for mhd_path in sorted(d.glob('*.mhd')):
        match = re.search(r'(\d\d\d)\.mhd$', mhd_path.name)
        if match is None:
            raise ValueError('no 3-digit scan id in MetaImage filename: {}'.format(mhd_path))
        mhds.append(str(mhd_path))
        # The globbed path already starts with d, so only swap the extension.  Joining it
        # onto d again would duplicate the directory prefix whenever d is a relative path.
        raws.append(mhd_path.with_suffix('.raw'))
        ids.append(int(match.group(1)))
    return mhds, raws, ids


def get_xvertseg_infos(xvertseg_dir):
    '''
    Build a dataframe with columns: id, image_mhd, image_raw, mask_mhd, mask_raw, dataset, and labeled.
    id is the number embedded in the xvertseg filenames.  xvertseg is split into 2 datasets, data1 and data2.
    data1 is labeled, meaning it has segmentation masks.  data2 only has images, so its rows
    have missing values in the mask_mhd and mask_raw columns.
    
    There are 15 labeled images and 10 unlabeled images.
    
    xvertseg_dir: the xVertSeg.v1 dir, as a Path.  It must contain Data1/images, Data1/masks
        and Data2/images.
    return: dataframe with one row per scan, sorted by id.  dataset is 'data1' or 'data2' and
        labeled is True for data1 rows and False for data2 rows.
    '''
    # filename examples
    # image016.mhd
    # image016.raw
    # mask001.mhd
    # mask001.raw
    
    # Data1 has 15 images and masks (labeled data)
    # Data2 has 10 test images with no mask.  Unlabeled data.
    data1_dir = xvertseg_dir / 'Data1'
    data2_dir = xvertseg_dir / 'Data2'
    
    img1_mhds, img1_raws, img1_ids = get_mhd_raw_id(data1_dir / 'images')
    img1_df = pd.DataFrame({'id': img1_ids, 'image_mhd': img1_mhds, 'image_raw': img1_raws})
    mask1_mhds, mask1_raws, mask1_ids = get_mhd_raw_id(data1_dir / 'masks')
    mask1_df = pd.DataFrame({'id': mask1_ids, 'mask_mhd': mask1_mhds, 'mask_raw': mask1_raws})
    
    # Inner join: only ids that have both an image and a mask count as labeled data.
    labeled_df = img1_df.merge(mask1_df, on='id')
    labeled_df['dataset'] = 'data1'
    labeled_df['labeled'] = True
    
    img2_mhds, img2_raws, img2_ids = get_mhd_raw_id(data2_dir / 'images')
    unlabeled_df = pd.DataFrame({'id': img2_ids, 'image_mhd': img2_mhds, 'image_raw': img2_raws})
    unlabeled_df['dataset'] = 'data2'
    unlabeled_df['labeled'] = False
    
    # The unlabeled frame has no mask columns; concat fills them with missing values.
    df = pd.concat([labeled_df, unlabeled_df]).sort_values('id').reset_index(drop=True)
    return df




In [None]:
# Index every scan found under XVERTSEG_DIR: one dataframe row per scan, sorted by id.
df = get_xvertseg_infos(XVERTSEG_DIR)

## Visualize Data


In [None]:
def load_xvertseg_img(path):
    '''
    Read an xvertseg image or mask with SimpleITK.

    path: str or Path, the file to read (the notebook passes .mhd header files).
    returns a pair (img, itk): img is the array SimpleITK.GetArrayFromImage builds from the
    image, and itk is the SimpleITK image object that was read.
    '''
    # https://github.com/juliandewit/kaggle_ndsb2017/blob/master/step1_preprocess_luna16.py
    # str() is a no-op for str arguments and lets pathlib.Path arguments work as well.
    itk = SimpleITK.ReadImage(str(path))
    img = SimpleITK.GetArrayFromImage(itk)
    return img, itk




### Look at an image and mask

In [None]:
# Load the image of the first dataframe row (the lowest scan id, since df is sorted by id).
img, itk = load_xvertseg_img(df.loc[0, 'image_mhd'])

In [None]:
# Animate a cropped region of the image volume.
# NOTE(review): crop looks like fractional (start, stop) bounds for each of the 3 axes and
# step like a slice stride -- confirm against util.animate_crop.
util.animate_crop(img, crop=(0.0, 1, 0.5, 0.8, 0.3, 0.6), step=20)

In [None]:
# Load the segmentation mask of the same row.  Only data1 rows have a mask_mhd value, so
# this relies on row 0 being a data1 scan.
mask, mitk = load_xvertseg_img(df.loc[0, 'mask_mhd'])

In [None]:
# Animate the mask with the same crop and step as the image animation so the two line up.
util.animate_crop(mask, crop=(0.0, 1, 0.5, 0.8, 0.3, 0.6), step=20)

### Look at mask

The mask has 6 unique values: 0, 200, 210, 220, 230, and 240.  I think these correspond to the background (0) and the five vertebrae L1, L2, ..., L5.


In [None]:
# Show the distinct voxel values that occur in the mask.
np.unique(mask.ravel())

In [None]:
# Histogram of the mask voxel values, using matplotlib's default binning.
plt.hist(mask.ravel())
plt.show()
# Histogram of the image voxel values.
# looks like a typical ct scan in hounsfield units...or does it?  No -1000 values?  Looks like the units are hounsfield + 1000.
plt.hist(img.ravel(), bins=50)
plt.show()