In [1]:
import os
import numpy as np
from skimage import io
from matplotlib import pyplot as plt
from scipy.misc import imresize
from scipy import ndimage

%matplotlib inline

import random
import csv
import re
import pickle
import json
from tqdm import tqdm
from natsort import natsorted

DATADIR = os.path.expanduser('~/ml/kaggle/ultrasound-nerve-segmentation/data')
PROCDIR = os.path.expanduser('~/ml/kaggle/ultrasound-nerve-segmentation/processed')
MODELDIR = os.path.expanduser('~/ml/kaggle/ultrasound-nerve-segmentation/models')

In [2]:
with open(os.path.join(PROCDIR, 'train_files_map.pkl'), 'rb') as f:
    train_files_map = pickle.load(f)
with open(os.path.join(PROCDIR, 'test_files_map.pkl'), 'rb') as f:
    test_files_map = pickle.load(f)
    
keys_sorted = natsorted(list(train_files_map.keys()))

## presence dataset

In [5]:
orig_img_rows, orig_img_cols = 420, 580
img_rows, img_cols = 96, 128
img_channels = 1

# 90/10 train/val split, by subject
data_images_train = []
data_presence_train = []
data_images_val = []
data_presence_val = []

subjects = set([f['subject'] for f in train_files_map.values()])
subjects_val = np.random.choice(list(subjects), int(len(subjects) * 0.1), replace=False)

for i in tqdm(range(len(keys_sorted))):
    key = keys_sorted[i]
    file_info = train_files_map[key]
    subject, img, folder, img_file, mask_file = (file_info['subject'], 
                                                 file_info['img'], 
                                                 file_info['folder'], 
                                                 file_info['img_file'], 
                                                 file_info['mask_file'])
    img_filepath = os.path.join(folder, img_file)
    mask_filepath = os.path.join(folder, mask_file)
    image = io.imread(img_filepath)
    mask = io.imread(mask_filepath)
    presence = np.sum(mask) > 0
    image_resized = imresize(image, size=(img_rows, img_cols), interp='bilinear') / 255.0
    if subject in subjects_val:
        data_images_val.append(image_resized)
        data_presence_val.append(presence)
    else:
        data_images_train.append(image_resized)
        data_presence_train.append(presence)

data_images_train = np.expand_dims(np.array(data_images_train, dtype=np.float32), axis=3)
data_presence_train = np.array(data_presence_train, dtype=np.bool)
data_images_val = np.expand_dims(np.array(data_images_val, dtype=np.float32), axis=3)
data_presence_val = np.array(data_presence_val, dtype=np.bool)

print('image shapes:', data_images_train.shape, data_images_val.shape)
print('presence shapes:', data_presence_train.shape, data_presence_val.shape)

  strip = decompress(strip)
100%|██████████| 5635/5635 [01:33<00:00, 60.51it/s]


image shapes: (5155, 96, 128, 1) (480, 96, 128, 1)
presence shapes: (5155,) (480,)


In [6]:
multiplier = 5

data_images_train_augment = []
data_presence_train_augment = []
for i in tqdm(range(data_images_train.shape[0])):
    data_images_train_augment.append(data_images_train[i])
    data_presence_train_augment.append(data_presence_train[i])
    for j in range(multiplier):
        y_shift = random.randint(int(data_images_train.shape[1] * -0.125), 
                                 int(data_images_train.shape[1] * 0.125))
        x_shift = random.randint(int(data_images_train.shape[2] * -0.125), 
                                 int(data_images_train.shape[2] * 0.125))
        image_aug = ndimage.interpolation.shift(data_images_train[i], (y_shift, x_shift, 0), order=0)
        presence = data_presence_train[i]
        data_images_train_augment.append(image_aug)
        data_presence_train_augment.append(presence)

index_shuffled = np.arange(len(data_images_train_augment))
np.random.shuffle(index_shuffled)
data_images_train = np.array(data_images_train_augment, dtype=np.float32)[index_shuffled]
data_presence_train = np.array(data_presence_train_augment, dtype=np.bool)[index_shuffled]

print('image shapes:', data_images_train.shape, data_images_val.shape)
print('presence shapes:', data_presence_train.shape, data_presence_val.shape)

100%|██████████| 5155/5155 [00:04<00:00, 1063.81it/s]


image shapes: (30930, 96, 128, 1) (480, 96, 128, 1)
presence shapes: (30930,) (480,)


In [7]:
with open(os.path.join(PROCDIR, 'data_train_val_presence_18a.pkl'), 'wb') as f:
    pickle.dump((data_images_train, data_presence_train, data_images_val, data_presence_val), f, protocol=4)

## segmentation dataset

In [4]:
orig_img_rows, orig_img_cols = 420, 580
img_rows, img_cols = 96, 128
img_channels = 1

# 90/10 train/val split, by subject
data_images_train = []
data_masks_train = []
data_images_val = []
data_masks_val = []
data_images_pos_train = []
data_masks_pos_train = []
data_images_pos_val = []
data_masks_pos_val = []

subjects = set([f['subject'] for f in train_files_map.values()])
subjects_val = np.random.choice(list(subjects), int(len(subjects) * 0.1), replace=False)

for i in tqdm(range(len(keys_sorted))):
    key = keys_sorted[i]
    file_info = train_files_map[key]
    subject, img, folder, img_file, mask_file = (file_info['subject'], 
                                                 file_info['img'], 
                                                 file_info['folder'], 
                                                 file_info['img_file'], 
                                                 file_info['mask_file'])
    img_filepath = os.path.join(folder, img_file)
    mask_filepath = os.path.join(folder, mask_file)
    image = io.imread(img_filepath)
    mask = io.imread(mask_filepath)
    presence = np.sum(mask) > 0
    image_resized = imresize(image, size=(img_rows, img_cols), interp='bilinear') / 255.0
    mask_resized = imresize(mask, size=(img_rows, img_cols), interp='bilinear').astype(np.bool)
    if subject in subjects_val:
        data_images_val.append(image_resized)
        data_masks_val.append(mask_resized)
        if presence:
            data_images_pos_val.append(image_resized)
            data_masks_pos_val.append(mask_resized)
    else:
        data_images_train.append(image_resized)
        data_masks_train.append(mask_resized)
        if presence:
            data_images_pos_train.append(image_resized)
            data_masks_pos_train.append(mask_resized)

data_images_train = np.expand_dims(np.array(data_images_train, dtype=np.float32), axis=3)
data_masks_train = np.array(data_masks_train, dtype=np.bool)
data_images_val = np.expand_dims(np.array(data_images_val, dtype=np.float32), axis=3)
data_masks_val = np.array(data_masks_val, dtype=np.bool)
data_images_pos_train = np.expand_dims(np.array(data_images_pos_train, dtype=np.float32), axis=3)
data_masks_pos_train = np.array(data_masks_pos_train, dtype=np.bool)
data_images_pos_val = np.expand_dims(np.array(data_images_pos_val, dtype=np.float32), axis=3)
data_masks_pos_val = np.array(data_masks_pos_val, dtype=np.bool)

print('image shapes:', 
      data_images_train.shape, data_images_val.shape, 
      data_images_pos_train.shape, data_images_pos_val.shape)
print('mask shapes:', 
      data_masks_train.shape, data_masks_val.shape, 
      data_masks_pos_train.shape, data_masks_pos_val.shape)

  strip = decompress(strip)
100%|██████████| 5635/5635 [01:44<00:00, 53.77it/s]


image shapes: (5156, 96, 128, 1) (479, 96, 128, 1) (2154, 96, 128, 1) (169, 96, 128, 1)
mask shapes: (5156, 96, 128) (479, 96, 128) (2154, 96, 128) (169, 96, 128)


In [5]:
multiplier = 5

data_images_train_augment = []
data_masks_train_augment = []
data_images_pos_train_augment = []
data_masks_pos_train_augment = []
for i in tqdm(range(data_images_train.shape[0])):
    data_images_train_augment.append(data_images_train[i])
    data_masks_train_augment.append(data_masks_train[i])
    for j in range(multiplier):
        y_shift = random.randint(int(data_images_train.shape[1] * -0.125), 
                                 int(data_images_train.shape[1] * 0.125))
        x_shift = random.randint(int(data_images_train.shape[2] * -0.125), 
                                 int(data_images_train.shape[2] * 0.125))
        image_aug = ndimage.interpolation.shift(data_images_train[i], (y_shift, x_shift, 0), order=0)
        mask_aug = ndimage.interpolation.shift(data_masks_train[i], (y_shift, x_shift), order=0)
        data_images_train_augment.append(image_aug)
        data_masks_train_augment.append(mask_aug)
for i in tqdm(range(data_images_pos_train.shape[0])):
    data_images_pos_train_augment.append(data_images_pos_train[i])
    data_masks_pos_train_augment.append(data_masks_pos_train[i])
    for j in range(multiplier):
        y_shift = random.randint(int(data_images_pos_train.shape[1] * -0.125), 
                                 int(data_images_pos_train.shape[1] * 0.125))
        x_shift = random.randint(int(data_images_pos_train.shape[2] * -0.125), 
                                 int(data_images_pos_train.shape[2] * 0.125))
        image_pos_aug = ndimage.interpolation.shift(data_images_pos_train[i], (y_shift, x_shift, 0), order=0)
        mask_pos_aug = ndimage.interpolation.shift(data_masks_pos_train[i], (y_shift, x_shift), order=0)
        data_images_pos_train_augment.append(image_pos_aug)
        data_masks_pos_train_augment.append(mask_pos_aug)

index_shuffled = np.arange(len(data_images_train_augment))
np.random.shuffle(index_shuffled)
data_images_train = np.array(data_images_train_augment, dtype=np.float32)[index_shuffled]
data_masks_train = np.array(data_masks_train_augment, dtype=np.bool)[index_shuffled]
index_shuffled = np.arange(len(data_images_pos_train_augment))
np.random.shuffle(index_shuffled)
data_images_pos_train = np.array(data_images_pos_train_augment, dtype=np.float32)[index_shuffled]
data_masks_pos_train = np.array(data_masks_pos_train_augment, dtype=np.bool)[index_shuffled]

print('image shapes:', 
      data_images_train.shape, data_images_val.shape,
      data_images_pos_train.shape, data_images_pos_val.shape)
print('mask shapes:', 
      data_masks_train.shape, data_masks_val.shape,
      data_masks_pos_train.shape, data_masks_pos_val.shape)

100%|██████████| 5156/5156 [00:08<00:00, 579.19it/s]
100%|██████████| 2154/2154 [00:03<00:00, 580.17it/s]


image shapes: (30936, 96, 128, 1) (479, 96, 128, 1) (12924, 96, 128, 1) (169, 96, 128, 1)
mask shapes: (30936, 96, 128) (479, 96, 128) (12924, 96, 128) (169, 96, 128)


In [6]:
with open(os.path.join(PROCDIR, 'data_train_val_segment_18b.pkl'), 'wb') as f:
    pickle.dump((data_images_train, data_masks_train, 
                 data_images_val, data_masks_val,
                 data_images_pos_train, data_masks_pos_train,
                 data_images_pos_val, data_masks_pos_val), f, protocol=4)

## mask dataset

In [55]:
batch_size = 64

# 90/10 train/val split, by subject
data_masks_train = []
data_masks_val = []

subjects = set([f['subject'] for f in train_files_map.values()])
subjects_val = np.random.choice(list(subjects), int(len(subjects) * 0.1), replace=False)

for file_info in tqdm(train_files_map.values()):
    subject, img, folder, img_file, mask_file = (file_info['subject'], 
                                                 file_info['img'], 
                                                 file_info['folder'], 
                                                 file_info['img_file'], 
                                                 file_info['mask_file'])
    mask_filepath = os.path.join(folder, mask_file)
    mask = io.imread(mask_filepath)
    
    if np.sum(mask) == 0:
        continue
        
    mask_resized = (imresize(mask, size=(img_rows, img_cols), interp='bilinear') / 255.0).astype(np.bool)
    if subject in subjects_val:
        data_masks_val.append(mask_resized)
    else:
        data_masks_train.append(mask_resized)

# divisible by batch_size for train on batch
data_masks_train.extend(data_masks_train[:(batch_size - len(data_masks_train) % batch_size)])
data_masks_val.extend(data_masks_val[:(batch_size - len(data_masks_val) % batch_size)])

index_shuffled = np.arange(len(data_masks_train))
np.random.shuffle(index_shuffled)
data_masks_train = np.array(data_masks_train, dtype=np.bool)[index_shuffled]
index_shuffled = np.arange(len(data_masks_val))
np.random.shuffle(index_shuffled)
data_masks_val = np.array(data_masks_val, dtype=np.bool)[index_shuffled]

print('mask shapes:', data_masks_train.shape, data_masks_val.shape)

  strip = decompress(strip)
100%|██████████| 5635/5635 [00:06<00:00, 814.37it/s]

mask shapes: (2112, 96, 128) (256, 96, 128)





In [56]:
with open(os.path.join(PROCDIR, 'data_train_val_vae_18c.pkl'), 'wb') as f:
    pickle.dump((data_masks_train, data_masks_val), f, protocol=4)