In [4]:
import json
import os
import sys

# make the parent directory importable before any non-stdlib import below
sys.path.append("../")

import import_ipynb
import cv2
import numpy as np
import progressbar
from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from config import dogs_vs_cats_config as config
from pyimage.io.hdf5py import HDF5DatasetWriter
from pyimage.preprocessing.preprocessors import AspectAwarePreprocessor

In [None]:
def prep():
  """Build stratified train / validation / test splits of the image paths.

  Lists every image under ``config.IMAGES_PATH``, takes the part of each
  file name before the first "." as the class name, integer-encodes the
  class names with ``LabelEncoder`` and then carves out
  ``config.NUM_TEST_IMAGES`` test samples followed by
  ``config.NUM_VAL_IMAGES`` validation samples. Both splits are stratified
  by label and use a fixed ``random_state`` of 42.

  The six resulting sequences are also stored in the module-level names
  ``trainPaths``, ``trainLabels``, ``valPaths``, ``valLabels``,
  ``testPaths`` and ``testLabels``, because ``build_dataset`` refers to
  them by those names.

  Returns:
    dict: ``{"train": (paths, labels), "val": (paths, labels),
    "test": (paths, labels)}``.
  """
  global trainPaths, trainLabels, valPaths, valLabels, testPaths, testLabels

  # grab the paths to the images
  imagePaths = list(paths.list_images(config.IMAGES_PATH))

  # image filename is of the format train/{class}.{#IMAGE}.jpg
  classNames = [os.path.basename(p).split(".")[0] for p in imagePaths]
  encodedLabels = LabelEncoder().fit_transform(classNames)

  # perform stratified sampling from the training set to build the testing
  # split from the training data
  (remainingPaths, testPaths, remainingLabels, testLabels) = train_test_split(
      imagePaths, encodedLabels, test_size=config.NUM_TEST_IMAGES,
      stratify=encodedLabels, random_state=42)

  # perform another stratified sampling, this time on what is left, to
  # build the validation data
  (trainPaths, valPaths, trainLabels, valLabels) = train_test_split(
      remainingPaths, remainingLabels, test_size=config.NUM_VAL_IMAGES,
      stratify=remainingLabels, random_state=42)

  return {
      "train": (trainPaths, trainLabels),
      "val": (valPaths, valLabels),
      "test": (testPaths, testLabels),
  }

In [None]:
def build_dataset(splits=None):
  """Write each data split to an HDF5 file and save the training RGB means.

  For every split the images are read with ``cv2.imread``, passed through
  ``AspectAwarePreprocessor(256, 256)`` and added, with their label, to an
  ``HDF5DatasetWriter`` whose dimensions are ``(num_images, 256, 256, 3)``.
  While the training split is processed the per-image channel means are
  collected; their averages are finally written as JSON (keys "R", "G",
  "B") to ``config.DATASET_MEAN``.

  Args:
    splits: optional mapping with the keys "train", "val" and "test", each
      holding an ``(imagePaths, labels)`` pair. When omitted, the
      notebook-level variables ``trainPaths``/``trainLabels``,
      ``valPaths``/``valLabels`` and ``testPaths``/``testLabels`` are used,
      so they must already exist.

  Raises:
    IOError: if an image file cannot be read.
  """
  if splits is None:
    # fall back to the split variables defined at notebook level
    splits = {
        "train": (trainPaths, trainLabels),
        "val": (valPaths, valLabels),
        "test": (testPaths, testLabels),
    }

  (trainX, trainY) = splits["train"]
  (valX, valY) = splits["val"]
  (testX, testY) = splits["test"]

  # construct a list pairing the training, validation, and testing
  # image paths along with their corresponding labels and output HDF5
  # files
  datasets = [
      ("train", trainX, trainY, config.TRAIN_HDF5),
      ("val", valX, valY, config.VAL_HDF5),
      ("test", testX, testY, config.TEST_HDF5)
  ]

  # init the image preprocessor and list of RGB channel averages
  aap = AspectAwarePreprocessor(256, 256)
  (R, G, B) = ([], [], [])

  # build the HDF5 dataset; the loop variable is deliberately not called
  # `paths` so the imported `imutils.paths` module is not shadowed
  for (dType, imagePaths, labels, outputPath) in datasets:
    # create HDF5 writer
    print("INFO building {}...".format(outputPath))
    writer = HDF5DatasetWriter((len(imagePaths), 256, 256, 3), outputPath)

    try:
      # init the progressbar
      widgets = ["Building Dataset: ", progressbar.Percentage(), " ",
                 progressbar.Bar(), " ", progressbar.ETA()]
      pbar = progressbar.ProgressBar(maxval=len(imagePaths),
                                     widgets=widgets).start()

      # loop over image paths
      for (i, (path, label)) in enumerate(zip(imagePaths, labels)):
        image = cv2.imread(path)
        if image is None:
          # cv2.imread signals an unreadable file by returning None
          raise IOError("could not read image: {}".format(path))
        image = aap.preprocess(image)

        # if we are building the training dataset, compute mean of
        # each channel in image, then update respective lists
        if dType == "train":
          # OpenCV images are in BGR channel order
          (b, g, r) = cv2.mean(image)[:3]
          R.append(r)
          G.append(g)
          B.append(b)

        # add image and label # to HDF5 dataset
        writer.add([image], [label])
        pbar.update(i)

      pbar.finish()
    finally:
      # always close the writer, even when an image fails part-way through
      writer.close()

  # The final step is to serialize our RGB averages to disk
  print("INFO serializing means...")
  D = {"R": float(np.mean(R)), "G": float(np.mean(G)), "B": float(np.mean(B))}
  with open(config.DATASET_MEAN, "w") as f:
    f.write(json.dumps(D))