Image-processing code for the Kaggle Intel cervix-image data.  
Portions are adapted from the 1_notmnist notebook of the Udacity Deep Learning tutorial, which was used as starter code.

Data Wrangle Section:  
Steps:  
1) Load Data - Split training into training and validation (load up test data)  
2) Create label subset  
3) Create master pickle file  
4) Reload to verify images (end to end validation)  

In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
###from scipy import ndimage
import cv2
###from sklearn.linear_model import LogisticRegression
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

#
# Sanity check: render one raw training image inline in the notebook
#
from IPython.display import Image
trainDataPathTest='/Users/anthonydaniell/Desktop/FilesToSync/Research/Kaggle/IntelCervix/train/Type_1/'
imageName='998.jpg'
# The Image object must stay the last expression so the notebook displays it.
Image(os.path.join(trainDataPathTest, imageName))

Now let's load the data in a more manageable format. Depending on your computer setup, you might not be able to fit it all in memory, so we'll load each class into a separate dataset, store them on disk and curate them independently. Later we'll merge them into a single dataset of manageable size.
We'll convert the entire dataset into a 4D array (image index, x, y, channel) of floating point values. The original tutorial normalized these to have approximately zero mean and standard deviation ~0.5 to make training easier down the road; that normalization step is currently commented out in the code below, so pixel values stay in the 0-255 range.
A few images might not be readable; we'll just skip them.

In [None]:
# Every image is resized to image_size x image_size pixels.
image_size = 256
# Number of levels per pixel.
pixel_depth = 255.0
# Channels per image: Blue, Green, Red.
image_channels = 3
# Class folders handed to maybe_pickle(); each one is pickled separately.
trainDataPath = [
    '/Users/anthonydaniell/Desktop/FilesToSync/Research/Kaggle/IntelCervix/train/Type_1',
]

def load_images(folder, min_num_images):
  """Load the data for a single cervix type.

  Every file in `folder` whose name ends in '.jpg' is read with OpenCV as a
  3-channel colour image (OpenCV returns the channels in Blue, Green, Red
  order), resized to image_size x image_size and stored, unnormalized, in
  one float32 array. Files that OpenCV cannot read are reported and skipped.

  Args:
    folder: directory holding the images of one class.
    min_num_images: minimum number of images that must load successfully.

  Returns:
    np.ndarray of shape (num_loaded, image_size, image_size, image_channels)
    and dtype float32.

  Raises:
    Exception: if an image is smaller than image_size in either dimension or
      has fewer than image_channels channels, or if fewer than
      min_num_images images were loaded.
  """
  # Only extract .jpg files
  image_files = [name for name in os.listdir(folder) if name.endswith('.jpg')]
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size, image_channels),
                       dtype=np.float32)
  print('load_images: folder = ', folder)
  print('load_images: image_files = ', image_files)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      # IMREAD_COLOR (== 1) always decodes to a 3-channel BGR image.
      image_data_in = cv2.imread(image_file, cv2.IMREAD_COLOR)
      # cv2.imread signals an unreadable/corrupt file by returning None
      # instead of raising, so turn that into the IOError handled below.
      if image_data_in is None:
        raise IOError('cv2.imread returned no image data')
      height, width, channels = image_data_in.shape
      # Images below the expected minimums abort the whole load: this
      # Exception is deliberately not caught by the IOError handler.
      if height < image_size or width < image_size or channels < image_channels:
        raise Exception('Unexpected image shape: %s' % str(image_data_in.shape))
      dataset[num_images, :, :] = cv2.resize(image_data_in, (image_size, image_size))
      num_images = num_images + 1
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

  # Drop the unused tail left behind by skipped files.
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))

  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  """Pickle each class folder's images to '<folder>.pickle'.

  A folder whose pickle already exists is left untouched unless `force` is
  true. Otherwise the folder is loaded with load_images() and the array is
  dumped next to it. A failure while writing is only reported; the file
  name is still included in the result.

  Args:
    data_folders: iterable of class folder paths.
    min_num_images_per_class: forwarded to load_images() as the minimum
      number of images each folder must yield.
    force: when true, re-create pickles that already exist.

  Returns:
    List of pickle file names, one per entry of data_folders, in order.
  """
  print('maybe_pickle: data_folders = ', data_folders)
  pickled_paths = []
  for class_dir in data_folders:
    target = class_dir + '.pickle'
    pickled_paths.append(target)

    if os.path.exists(target) and not force:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % target)
      continue

    print('Pickling %s.' % target)
    images = load_images(class_dir, min_num_images_per_class)
    try:
      with open(target, 'wb') as out:
        pickle.dump(images, out, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
      print('Unable to save data to', target, ':', e)

  return pickled_paths

# Pickle every training class folder, requiring at least 5 loadable images
# per class. force=True re-creates each pickle even if it already exists.
train_datasets = maybe_pickle(trainDataPath, 5, force=True)
# NOTE(review): the test-set pickling below is disabled, and test_folders is
# not defined anywhere in the visible source - confirm before re-enabling.
###test_datasets = maybe_pickle(test_folders, 500)

In [None]:
#
# Display an example of the data
#
import matplotlib.pyplot as plt
# The images were read with OpenCV, which stores channels as Blue, Green, Red;
# matplotlib interprets them as Red, Green, Blue, so colours show reversed.
### Might be useful for discrimination
cur_data = train_datasets[0]
with open(cur_data, 'rb') as f:
    dataset = pickle.load(f)
# Index 4 is the fifth image; the pickling step required at least 5 per class.
img = dataset[4,:,:]
# imshow clips float RGB data to [0, 1], but the stored pixels are float32
# in the 0-255 range, so convert to uint8 for display only.
plt.imshow(img.astype(np.uint8))

Merge and prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, so you can tune train_size as needed. The labels will be stored in a separate array of integers, one class index per image (0 through number of classes - 1, in the order of the pickle files).
We also create a validation dataset for hyperparameter tuning.

In [None]:
def make_arrays(nb_rows, img_size, channels=None):
  """Allocate uninitialised image and label arrays for nb_rows samples.

  Args:
    nb_rows: number of samples; a falsy value (0 or None) means no arrays.
    img_size: width and height of each square image.
    channels: optional number of colour channels. When None the image array
      is 3-D, (nb_rows, img_size, img_size); otherwise it is 4-D,
      (nb_rows, img_size, img_size, channels).

  Returns:
    (dataset, labels): a float32 image array and an int32 label array, or
    (None, None) when nb_rows is falsy. The contents are uninitialised.
  """
  if not nb_rows:
    return None, None
  shape = (nb_rows, img_size, img_size)
  if channels is not None:
    shape = shape + (channels,)
  dataset = np.ndarray(shape, dtype=np.float32)
  labels = np.ndarray(nb_rows, dtype=np.int32)
  return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
  """Merge per-class pickles into validation and training arrays.

  Each pickle is loaded, shuffled along its first axis, and split: the first
  valid_size // num_classes images go to the validation set, the next
  train_size // num_classes images to the training set. The label of an
  image is the index of its pickle file in pickle_files.

  If a size is not divisible by the number of classes, the trailing rows of
  the corresponding arrays are never written and stay uninitialised.

  Args:
    pickle_files: list of per-class pickle file names.
    train_size: total number of training samples to allocate.
    valid_size: total number of validation samples to allocate (0 for none).

  Returns:
    (valid_dataset, valid_labels, train_dataset, train_labels); a pair is
    (None, None) when its requested size is 0.

  Raises:
    ZeroDivisionError: if pickle_files is empty.
    ValueError: if a class holds fewer images than the per-class validation
      plus training share.
    Exception: anything raised while reading a pickle is reported and
      re-raised.
  """
  num_classes = len(pickle_files)
  # The pickled images carry a channel axis, so the merged arrays need one too.
  valid_dataset, valid_labels = make_arrays(valid_size, image_size, image_channels)
  train_dataset, train_labels = make_arrays(train_size, image_size, image_channels)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes

  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class + tsize_per_class
  for label, pickle_file in enumerate(pickle_files):
    try:
      # NOTE: pickle.load executes arbitrary code; only use trusted pickles.
      with open(pickle_file, 'rb') as f:
        class_images = pickle.load(f)
      if len(class_images) < end_l:
        # Fail loudly instead of relying on (or silently passing) broadcasting.
        raise ValueError('Only %d images available, %d required per class' %
                         (len(class_images), end_l))
      # Shuffle so the validation/training split is random.
      np.random.shuffle(class_images)
      if valid_dataset is not None:
        valid_dataset[start_v:end_v] = class_images[:vsize_per_class]
        valid_labels[start_v:end_v] = label
        start_v += vsize_per_class
        end_v += vsize_per_class

      if train_dataset is not None:
        train_dataset[start_t:end_t] = class_images[vsize_per_class:end_l]
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise

  return valid_dataset, valid_labels, train_dataset, train_labels
            
            
# Total number of samples requested for each split; merge_datasets divides
# each total evenly across the classes.
# NOTE(review): these totals look inherited from the notMNIST starter code -
# confirm each class folder actually holds enough images for them.
train_size = 200000
valid_size = 10000
test_size = 10000

# Split the pickled training classes into validation and training sets.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
# NOTE(review): test_datasets is not assigned anywhere in the visible source
# (its maybe_pickle call is commented out) - define it before running this.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

In [None]:
# Bundle all six arrays into a single master pickle file.
# NOTE(review): data_root is not defined anywhere in the visible source -
# confirm it is set before this cell runs.
pickle_file = os.path.join(data_root, 'kaggleIntel.pickle')

try:
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  # The with-block closes the file even if pickle.dump raises.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

# Report the on-disk size of the file that was just written.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

CNN section