In [1]:
import os
import numpy as np
from skimage import io
from matplotlib import pyplot as plt
from scipy.misc import imresize
from scipy import ndimage

%matplotlib inline

import random
import csv
import re
import pickle
import json
from tqdm import tqdm

# Filesystem locations; '~' is expanded to the current user's home directory.
# NOTE(review): DATADIR presumably holds the raw competition files — it is not
# read anywhere in the visible cells, confirm against the preprocessing notebook.
DATADIR = os.path.expanduser('~/data/kaggle/ultrasound-nerve-segmentation')
# Directory for processed artifacts: the pickled file maps are read from here
# and the patch dataset is written back here.
PROCDIR = os.path.expanduser('~/ml/kaggle/ultrasound-nerve-segmentation/processed')
# NOTE(review): presumably where trained models are stored — unused in the
# visible cells, confirm.
MODELDIR = os.path.expanduser('~/ml/kaggle/ultrasound-nerve-segmentation/models')

In [2]:
def _read_pickle(filename):
    """Unpickle and return the object stored in ``PROCDIR/filename``.

    pickle can execute arbitrary code while loading, so this must only be
    pointed at files produced by this project.
    """
    with open(os.path.join(PROCDIR, filename), 'rb') as handle:
        return pickle.load(handle)


# Per-file metadata records for the train and test sets.
train_files_map = _read_pickle('train_files_map.pkl')
test_files_map = _read_pickle('test_files_map.pkl')

## create dataset

In [3]:
# Image geometry: raw frame size, size after downscaling, and patch size.
orig_img_rows, orig_img_cols = 420, 580
resized_img_rows, resized_img_cols = 128, 192
patch_rows, patch_cols = 64, 64
img_channels = 1

# Fraction of subjects held out for validation, and the seed that makes the
# hold-out selection repeatable across kernel restarts.
VAL_FRACTION = 0.1
VAL_SPLIT_SEED = 0

# 90/10 train/val split, by subject (all images of a subject land on the same
# side, so no subject is shared between train and val).
data_patches_train = []
data_masks_train = []
data_patches_val = []
data_masks_val = []

subjects = set([f['subject'] for f in train_files_map.values()])
# Sort before sampling: set iteration order is not guaranteed to be the same
# between runs, so seeding alone would not pin down which subjects get picked.
subjects_val = np.random.RandomState(VAL_SPLIT_SEED).choice(
    sorted(subjects), int(len(subjects) * VAL_FRACTION), replace=False)
# Plain-Python set for O(1) membership tests instead of scanning the array.
subjects_val_lookup = set(subjects_val.tolist())

# Patch windows overlap by half a patch in each direction. They are the same
# for every image, so compute them once. Windows that would run past the image
# border are clipped; the corresponding patches stay zero-padded on the
# bottom/right.
row_stride = patch_rows // 2
col_stride = patch_cols // 2
patch_windows = [
    (row_start, min(row_start + patch_rows, resized_img_rows),
     col_start, min(col_start + patch_cols, resized_img_cols))
    for row_start in (i * row_stride for i in range(resized_img_rows // row_stride))
    for col_start in (j * col_stride for j in range(resized_img_cols // col_stride))
]

for file_info in tqdm(train_files_map.values()):
    subject, img, folder, img_file, mask_file = (file_info['subject'],
                                                 file_info['img'],
                                                 file_info['folder'],
                                                 file_info['img_file'],
                                                 file_info['mask_file'])
    img_filepath = os.path.join(folder, img_file)
    mask_filepath = os.path.join(folder, mask_file)
    image = io.imread(img_filepath) / 255.0
    mask = io.imread(mask_filepath) / 255.0
    # NOTE(review): scipy.misc.imresize is documented to rescale its input to
    # the full 0-255 uint8 range, so the /255.0 above most likely does NOT leave
    # the patches in [0, 1] — confirm before relying on the value range.
    # imresize was also removed in SciPy 1.3, so this cell needs an older SciPy.
    image_resized = imresize(image, size=(resized_img_rows, resized_img_cols), interp='bilinear')
    # Any non-zero interpolated value counts as mask.
    mask_resized = imresize(mask, size=(resized_img_rows, resized_img_cols), interp='bilinear').astype(bool)

    # The split is per subject, so pick the destination lists once per image.
    if subject in subjects_val_lookup:
        patches_out, masks_out = data_patches_val, data_masks_val
    else:
        patches_out, masks_out = data_patches_train, data_masks_train

    for row_start, row_end, col_start, col_end in patch_windows:
        # Allocate directly in the final dtypes; the stacked arrays below are
        # float32 / bool anyway, and float64 buffers would multiply peak memory.
        image_patch = np.zeros((patch_rows, patch_cols), dtype=np.float32)
        mask_patch = np.zeros((patch_rows, patch_cols), dtype=bool)
        image_patch[:row_end - row_start, :col_end - col_start] = image_resized[row_start:row_end, col_start:col_end]
        mask_patch[:row_end - row_start, :col_end - col_start] = mask_resized[row_start:row_end, col_start:col_end]
        patches_out.append(image_patch)
        masks_out.append(mask_patch)

# Stack into arrays. reshape(-1, ...) gives the same result as adding a trailing
# channel axis, but also works when a split ends up empty.
data_patches_train = np.array(data_patches_train, dtype=np.float32).reshape(-1, patch_rows, patch_cols, img_channels)
data_masks_train = np.array(data_masks_train, dtype=bool).reshape(-1, patch_rows, patch_cols)
data_patches_val = np.array(data_patches_val, dtype=np.float32).reshape(-1, patch_rows, patch_cols, img_channels)
data_masks_val = np.array(data_masks_val, dtype=bool).reshape(-1, patch_rows, patch_cols)

print('image shapes:', data_patches_train.shape, data_patches_val.shape)
print('mask shapes:', data_masks_train.shape, data_masks_val.shape)

  strip = decompress(strip)
100%|██████████| 5635/5635 [02:14<00:00, 41.75it/s]


image shapes: (123720, 64, 64, 1) (11520, 64, 64, 1)
mask shapes: (123720, 64, 64) (11520, 64, 64)


## save

In [4]:
# Persist the four arrays as one tuple, in the order
# (train patches, train masks, val patches, val masks).
train_val_arrays = (data_patches_train, data_masks_train,
                    data_patches_val, data_masks_val)
dataset_path = os.path.join(PROCDIR, 'data_train_val_12.pkl')
with open(dataset_path, 'wb') as out_file:
    # Protocol 4 supports objects larger than 4 GiB.
    pickle.dump(train_val_arrays, out_file, protocol=4)