In [1]:
import numpy as np
import os
from scipy import ndimage
from six.moves import cPickle as pickle
import random
from skimage import io, color

In [2]:
# Pickle the benign and malignant subfolders listed in train_folders and test_folders.
# Edit 'train_folders' and 'test_folders' so they point at your processed image data.

# load_file rejects any image whose grayscale shape is not (height, width) and
# rescales pixel values with (value - pixel_depth / 2) / pixel_depth.
width, height = 40, 40
pixel_depth = 255.0

train_folders = [
    '/home/ubuntu/code/train_folder7/benign',
    '/home/ubuntu/code/train_folder7/malignant',
]
test_folders = [
    '/home/ubuntu/code/test_folder7/benign',
    '/home/ubuntu/code/test_folder7/malignant',
]

def load_file(folder):
    """Load every readable image in ``folder`` into one float32 array.

    Each file is read, converted to grayscale with ``color.rgb2gray`` and
    rescaled with ``(gray - pixel_depth / 2) / pixel_depth`` (roughly
    [-0.5, 0.5] for 8-bit input -- assumes the images are 8-bit; confirm
    against your data). Files that raise ``IOError`` while being read are
    reported and skipped.

    Args:
        folder: Path of a directory containing the image files of one class.

    Returns:
        A float32 array of shape ``(num_loaded, height, width)`` where
        ``num_loaded`` is the number of files that were read successfully.

    Raises:
        Exception: If a decoded image does not have shape ``(height, width)``.
    """
    # List the directory once so the preallocated array and the loop below
    # are guaranteed to see the same set of files.
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), height, width),
                         dtype=np.float32)
    image_index = 0
    print(folder)
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            # skimage.io.imread is used instead of scipy.ndimage.imread, which
            # newer SciPy releases no longer provide.
            pixels = io.imread(image_file).astype(float)
            image_data = (color.rgb2gray(pixels) - pixel_depth / 2) / pixel_depth
            if image_data.shape != (height, width):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[image_index, :, :] = image_data
            image_index += 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    # Drop the unused tail rows left behind by skipped files.
    num_images = image_index
    dataset = dataset[0:num_images, :, :]

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
        
def maybe_pickle(data_folders, force=False):
    """Pickle the image array of each folder next to it as ``<folder>.pickle``.

    A folder whose pickle file already exists is skipped unless ``force`` is
    true. A failure while writing a pickle is reported and does not stop the
    remaining folders from being processed.

    Args:
        data_folders: Iterable of folder paths, one per class.
        force: When true, rebuild the pickle even if it already exists.

    Returns:
        List of pickle file paths, one per folder, in the order given
        (including files that were skipped or failed to save).
    """
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            print('%s already present - Skipping pickling.' % set_filename)
            continue

        print('Pickling %s.' % set_filename)
        # load_file accepts only the folder; the extra argument previously
        # passed here was an undefined name and raised NameError.
        dataset = load_file(folder)
        try:
            with open(set_filename, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', set_filename, ':', e)

    return dataset_names

# Build (or reuse) one pickle per class folder; each variable holds the list
# of pickle file paths returned by maybe_pickle.
train_datasets = maybe_pickle(train_folders)
test_datasets = maybe_pickle(test_folders)

/home/ubuntu/code/train_folder7/benign.pickle already present - Skipping pickling.
/home/ubuntu/code/train_folder7/malignant.pickle already present - Skipping pickling.
/home/ubuntu/code/test_folder7/benign.pickle already present - Skipping pickling.
/home/ubuntu/code/test_folder7/malignant.pickle already present - Skipping pickling.


In [3]:
# check the number of images stored in each folder's pickle file
def disp_number_images(data_folders):
    """Print the number of images stored in each folder's pickle file.

    For every entry in ``data_folders`` the file ``<folder>.pickle`` is
    unpickled and its length printed. The first file that cannot be read is
    reported and ends the function, so later folders are not inspected.

    NOTE: pickle.load executes arbitrary code from the file; only use this on
    pickles you created yourself.
    """
    for folder in data_folders:
        pickle_filename = '%s.pickle' % ''.join(folder)
        try:
            with open(pickle_filename, 'rb') as stream:
                images = pickle.load(stream)
        except Exception as err:
            print('Unable to read data from', pickle_filename, ':', err)
            return
        else:
            print('Number of images in ', folder, ' : ', len(images))
    
# Report the per-class image counts for the training and test pickles; the
# split sizes chosen later in the notebook are based on these numbers.
disp_number_images(train_folders)
disp_number_images(test_folders)

('Number of images in ', '/home/ubuntu/code/train_folder7/benign', ' : ', 10080)
('Number of images in ', '/home/ubuntu/code/train_folder7/malignant', ' : ', 10080)
('Number of images in ', '/home/ubuntu/code/test_folder7/benign', ' : ', 1440)
('Number of images in ', '/home/ubuntu/code/test_folder7/malignant', ' : ', 1440)


In [4]:
# Generate three data/label pairs: train_dataset/train_labels, valid_dataset/valid_labels, test_dataset/test_labels.
# Set train_size, valid_size, and test_size based on the image counts printed by the cell above:
# test_size = (num of images in test/benign) + (num of images in test/malignant)
# valid_size = test_size
# train_size = (num of images in train/benign) + (num of images in train/malignant) - valid_size

# Per-class counts are taken from the output of disp_number_images:
# 1440 images in each test class, 10080 in each train class.
test_size = 1440 + 1440
valid_size = test_size
train_size = 10080 + 10080 - valid_size

def make_arrays(nb_rows, height, width):
    """Allocate uninitialized image and label arrays for ``nb_rows`` samples.

    Returns:
        ``(dataset, labels)`` where ``dataset`` is a float32 array of shape
        ``(nb_rows, height, width)`` and ``labels`` is an int32 array of
        length ``nb_rows``; ``(None, None)`` when ``nb_rows`` is falsy.
        The array contents are not initialized.
    """
    if not nb_rows:
        return None, None
    images = np.empty((nb_rows, height, width), dtype=np.float32)
    targets = np.empty(nb_rows, dtype=np.int32)
    return images, targets

def merge_datasets(pickle_files, train_size, valid_size=0):
    """Combine per-class pickles into labeled validation and training arrays.

    Each pickle file is one class; its position in ``pickle_files`` is used as
    the integer label. Every class array is shuffled in place with
    ``np.random.shuffle``, then its first ``valid_size // num_classes`` rows go
    to the validation set and the following ``train_size // num_classes`` rows
    go to the training set.

    Args:
        pickle_files: Sequence of pickle file paths, one per class.
        train_size: Total number of training rows to allocate.
        valid_size: Total number of validation rows to allocate; 0 means no
            validation split is produced.

    Returns:
        ``(valid_dataset, valid_labels, train_dataset, train_labels)``; the
        two validation entries are ``None`` when ``valid_size`` is 0.

    Raises:
        Exception: Any error while reading or copying a class is printed and
            re-raised.
    """
    class_count = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, height, width)
    train_dataset, train_labels = make_arrays(train_size, height, width)
    per_class_valid = valid_size // class_count
    per_class_train = train_size // class_count
    per_class_total = per_class_valid + per_class_train

    for label, pickle_file in enumerate(pickle_files):
        print(label, pickle_file)
        # Destination rows for this class in the merged arrays.
        valid_rows = slice(label * per_class_valid, (label + 1) * per_class_valid)
        train_rows = slice(label * per_class_train, (label + 1) * per_class_train)
        try:
            with open(pickle_file, 'rb') as stream:
                samples = pickle.load(stream)
            np.random.shuffle(samples)

            # valid_dataset is None when no validation split was requested
            # (e.g. when merging the test set).
            if valid_dataset is not None:
                valid_dataset[valid_rows, :, :] = samples[:per_class_valid, :, :]
                valid_labels[valid_rows] = label

            train_dataset[train_rows, :, :] = samples[per_class_valid:per_class_total, :, :]
            train_labels[train_rows] = label
        except Exception as err:
            print('Unable to process data from', pickle_file, ':', err)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

# Split the training pickles into validation and training sets.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
# No valid_size is passed for the test pickles, so the two validation slots
# come back as None and are discarded.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

# Sanity-check the shapes of the three data/label pairs.
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

(0, '/home/ubuntu/code/train_folder7/benign.pickle')
(1, '/home/ubuntu/code/train_folder7/malignant.pickle')
(0, '/home/ubuntu/code/test_folder7/benign.pickle')
(1, '/home/ubuntu/code/test_folder7/malignant.pickle')
('Training:', (17280, 40, 40), (17280,))
('Validation:', (2880, 40, 40), (2880,))
('Testing:', (2880, 40, 40), (2880,))


In [5]:
# shuffle the data
def randomize(dataset, labels):
    """Return ``dataset`` and ``labels`` reordered by one shared random permutation.

    The permutation is drawn with ``np.random.permutation`` over the number of
    labels, so row ``i`` of the returned dataset still corresponds to entry
    ``i`` of the returned labels. The inputs are not modified.
    """
    order = np.random.permutation(labels.shape[0])
    return dataset[order, :, :], labels[order]
# merge_datasets writes the classes in contiguous blocks, so shuffle each
# split to interleave them. No seed is set in this notebook, so the resulting
# order differs between runs.
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

In [6]:
# save the pickle file with three 'data/label' pairs
pickle_file = 'breast.pickle'

try:
    # The `with` block closes the file even when pickle.dump raises, so a
    # failed save no longer leaks an open handle.
    with open(pickle_file, 'wb') as f:
        # One dictionary holding all three data/label pairs.
        save = {
            'train_dataset': train_dataset,
            'train_labels': train_labels,
            'valid_dataset': valid_dataset,
            'valid_labels': valid_labels,
            'test_dataset': test_dataset,
            'test_labels': test_labels,
        }
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise