# xVertSeg Data Generation

For training and evaluation of a binary image segmentation algorithm, we want input of size (m, n, n, n, 1) and output of size (m, n, n, n, 1): m examples, each a 3D volume of side n with 1 channel.  For semantic image segmentation, we want output of size (m, n, n, n, 6) for the one-hot encoded vector of output categories.  xVertSeg also has 2 categorical outcomes.

It would be nice to configure the datagen to sample multiple patches from an image.  That might speed training.

## Imports and Constants, etc.

In [None]:
import datetime
import importlib
import keras
from keras.layers import (Dense, SimpleRNN, Input, Conv1D, 
                          LSTM, GRU, AveragePooling3D, MaxPooling3D, GlobalMaxPooling3D,
                          Conv3D, UpSampling3D, BatchNormalization, Concatenate, Add,
                          GaussianNoise, Dropout
                         )
from keras.models import Model
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import projd
import random
import re
import scipy
import shutil
import SimpleITK # xvertseg MetaImage files
import sys
from sklearn.model_selection import train_test_split
import uuid

import matplotlib.pyplot as plt # data viz
import seaborn as sns # data viz

import imageio # display animated volumes
from IPython.display import Image # display animated volumes

from IPython.display import SVG # visualize model
from keras.utils.vis_utils import model_to_dot # visualize model

# Local project code lives in $PROJECT_ROOT/src; put that directory on
# sys.path (once only) so the project-module imports can be resolved.
# NOTE(review): presumably projd.cwd_token_dir('notebooks') locates the
# project root from the current working directory -- confirm in projd.
project_root = Path(projd.cwd_token_dir('notebooks'))
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

import util
import preprocessing
import datagen
import modelutil
import xvertseg
import augmentation


# MODEL_NAME = 'model_15'
SEED = 25 # random seed
EPOCHS = 100
BATCH_SIZE = 4
PATCH_SHAPE = (32, 32, 32)
# PATCH_SHAPE = (64, 64, 64)
PATCH_SHAPE = (128, 128, 128) # good for visualization.
VALIDATION_SPLIT = 0.25

DATA_DIR = Path('/data2').expanduser()
# DATA_DIR = Path('~/data/2018').expanduser()
# UVMMC
NORMAL_SCANS_DIR = DATA_DIR / 'uvmmc/nifti_normals'
PROJECT_DATA_DIR = DATA_DIR / 'uvm_deep_learning_project'
PP_IMG_DIR = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed' # preprocessed scans dir
PP_MD_PATH = PROJECT_DATA_DIR / 'uvmmc' / 'preprocessed_metadata.pkl'
# xVertSeg
XVERTSEG_DIR = DATA_DIR / 'xVertSeg.v1'
PP_XVERTSEG_DIR = PROJECT_DATA_DIR / 'xVertSeg.v1' / 'preprocessed' # preprocessed scans dir
PP_XVERTSEG_MD_PATH = PROJECT_DATA_DIR / 'xVertSeg.v1' / 'preprocessed_metadata.pkl'


MODELS_DIR = PROJECT_DATA_DIR / 'models'
LOG_DIR = PROJECT_DATA_DIR / 'log'
TENSORBOARD_LOG_DIR = PROJECT_DATA_DIR / 'tensorboard'
TMP_DIR = DATA_DIR / 'tmp'

for d in [DATA_DIR, NORMAL_SCANS_DIR, PROJECT_DATA_DIR, PP_IMG_DIR, MODELS_DIR, LOG_DIR, 
          TENSORBOARD_LOG_DIR, TMP_DIR, PP_MD_PATH.parent, PP_XVERTSEG_DIR, PP_XVERTSEG_MD_PATH.parent]:
    if not d.exists():
        d.mkdir(parents=True)
        
%matplotlib inline
sns.set()

# I love u autoreload!
%load_ext autoreload
%autoreload 2

In [None]:
# Alternative infos_func backed by an in-memory DataFrame (kept for reference):
# df = pd.DataFrame({
#     'id': range(1, 26),
#     'pp_image_path': [Path(f'image{i:03}.mhd') for i in range(1, 26)],
#     'pp_mask_path': [Path(f'mask{i:03}.mhd') for i in range(1, 16)] + ([np.nan] * 10),
# })
# infos_func = lambda: df


def infos_func():
    """Return the xVertSeg metadata read from PP_XVERTSEG_MD_PATH."""
    return xvertseg.read_xvertseg_metadata(PP_XVERTSEG_MD_PATH)


# NOTE: validation_split is hard-coded to 0.2 here; the VALIDATION_SPLIT
# constant (0.25) is not used by this call.
datagens = xvertseg.get_xvertseg_datagens(
    infos_func,
    seed=SEED,
    validation_split=0.2,
    test_split=0.134,
)
train, val, test = datagens

# Training datagen: single-image batches of augmented PATCH_SHAPE crops.
train.config(
    batch_size=1,
    crop_shape=PATCH_SHAPE,
    flip=0.5,
    transpose=True,
    gray_std=0.5,
    gray_disco=True,
)
# no crop shape and consequently, no transpose.
# train.config(crop_shape=None, flip=0.5, transpose=False, gray_std=0.05, gray_disco=True)


In [None]:
# Show the length reported by the training datagen in its current configuration.
# NOTE(review): presumably the number of batches per epoch -- confirm in datagen.
len(train)


In [None]:
# Walk through every training batch and animate the first image of the batch
# together with its mask, to eyeball the effect of the augmentation settings.
# The gray_std=0.5 and gray_disco=True looks so cool with the transpose=True!
print('train size:', len(train))
for x, y in (train[batch_idx] for batch_idx in range(len(train))):
    print('y dtype', y.dtype)
    print(x.shape)
    # First example of the batch, channel 0, for both image and mask.
    image_volume = x[0, :, :, :, 0]
    mask_volume = y[0, :, :, :, 0]
    display(util.animate_crop(image_volume, step=20))
    display(util.animate_crop(mask_volume, step=20))
    

## Calculate Proportion of Ones in Binary Mask

This is used to guide the weighting of classes in the loss function for binary segmentation.

In [None]:
# Tally mask voxels by value (0, 1, anything else) over one pass of the
# training datagen with flip, transpose and gray-level noise switched off,
# then report the resulting class proportions and ratios.
train.config(crop_shape=PATCH_SHAPE, transpose=None, flip=None, gray_std=None)
zeros = ones = others = 0
for batch_idx in range(len(train)):
    x, y = train[batch_idx]
    is_zero = y == 0
    is_one = y == 1
    zeros += np.sum(is_zero)
    ones += np.sum(is_one)
    # Neither 0 nor 1: should stay at zero for a clean binary mask.
    others += np.sum(~(is_zero | is_one))

total = zeros + ones + others
print('zeros, ones, others:', zeros, ones, others)
print('proportion of zeros:', zeros / total)
print('proportion of ones:', ones / total)
print('proportion of others:', others / total)
print('zeros-to-ones ratio:', zeros / ones)
print('ones-to-zeros ratio:', ones / zeros)


Whole image.  10 training samples, seed of 25
```
zeros, ones, others: 558443731 3017212 0
proportion of zeros: 0.9946261408961442
proportion of ones: 0.005373859103855778
proportion of others: 0.0
zeros-to-ones ratio: 185.08601019749358
ones-to-zeros ratio: 0.005402893492236194
```

Ran it a couple of times before setting the seed, and also saw zeros-to-ones ratios of 162 and 194.

Random crop for each image produces varied results.
```
zeros-to-ones ratio: 187.70820285786272
zeros-to-ones ratio: 39.10471985144927
zeros-to-ones ratio: 52.257486489780995
zeros-to-ones ratio: 149.27099843792544
zeros-to-ones ratio: 24.490191788547627
zeros-to-ones ratio: 212.60277042167448
```
It makes sense that the mean zeros-to-ones ratio of random crops would be lower than the whole-image ratio: pixels near the edge (which are typically 0/black) are less likely to fall inside a crop than central pixels, because of the way cropping and uniform sampling interact.


## Test `num_samples` Feature

Use `num_samples` to increase epoch size.  Currently images are randomized and samples can be random crops, but multiple samples for the same image come out one after the other (to avoid image reloading).  Not ideal from a stochastic perspective.

In [None]:
# df = pd.DataFrame({
#     'id': range(1, 26),
#     'pp_image_path': [Path(f'image{i:03}.mhd') for i in range(1, 26)],
#     'pp_mask_path': [Path(f'mask{i:03}.mhd') for i in range(1, 16)] + ([np.nan] * 10),
# })
# infos_func = lambda: df

# no crop shape and consequently, no transpose.
# train.config(crop_shape=None, flip=0.5, transpose=False, gray_std=0.05, gray_disco=True)


In [None]:
# Small 32^3 crops, batches of 3, two samples per image, augmentation off;
# reindex() is called on the result of config() after the settings change.
train.config(
    batch_size=3,
    shuffle=True,
    length=None,
    crop_shape=(32, 32, 32),
    num_samples=2,
    gray_std=None,
    transpose=None,
    flip=None,
).reindex()

# The gray_std=0.5 and gray_disco=True looks so cool with the transpose=True!
print('train size:', len(train))
print('batch size:', train.batch_size)
print('num_samples:', train.num_samples)
for batch_idx in range(len(train)):
    x, y = train[batch_idx]
    print(x.shape)
    # Animate every example in the batch (axis 0), image first, then mask.
    for example_idx in range(x.shape[0]):
        image_volume = x[example_idx, :, :, :, 0]
        mask_volume = y[example_idx, :, :, :, 0]
        display(util.animate_crop(image_volume, step=20))
        display(util.animate_crop(mask_volume, step=20))
    

### Test shuffled_idxs

In [None]:
# Default length (length=None): print the result of two consecutive
# _shuffle() calls so they can be compared by eye.
train.config(length=None, batch_size=1, shuffle=True, num_samples=1)
for _attempt in range(2):
    print(train._shuffle())


In [None]:
# Test random info indices for length 15 sequence
# (two consecutive _shuffle() results are printed for comparison).
train.config(length=15, batch_size=1, shuffle=True, num_samples=1)
for _attempt in range(2):
    print(train._shuffle())


In [None]:
# Length < len(infos)
# (two consecutive _shuffle() results are printed for comparison).
train.config(length=5, batch_size=1, shuffle=True, num_samples=1)
for _attempt in range(2):
    print(train._shuffle())


In [None]:
# test that number of indices is the product of batch_size and length
# (here 5 * 4; two consecutive _shuffle() results are printed to check).
train.config(length=5, batch_size=4, shuffle=True, num_samples=1)
for _attempt in range(2):
    print(train._shuffle())


In [None]:
# Shuffle = False iterates through infos in order
# (two consecutive _shuffle() results are printed; they should match).
train.config(length=15, batch_size=1, shuffle=False, num_samples=1)
for _attempt in range(2):
    print(train._shuffle())


### Test `length` Datagens

In [None]:
# Fix the datagen length at 15 single-image batches, reindex, then animate
# the first image and mask of every batch.
train.config(
    length=15,
    batch_size=1,
    shuffle=True,
    num_samples=1,
).reindex()
print('train size:', len(train))
for x, y in (train[batch_idx] for batch_idx in range(len(train))):
    print('y dtype', y.dtype)
    print(x.shape)
    image_volume = x[0, :, :, :, 0]
    mask_volume = y[0, :, :, :, 0]
    display(util.animate_crop(image_volume, step=20))
    display(util.animate_crop(mask_volume, step=20))
    