In [1]:
label_to_int = {'Boots': 1, 'Sandals': 2, 'Shoes': 3, 'Slippers': 4}

In [2]:
import tensorflow as tf
import os
from PIL import Image
import numpy as np

def get_all_images(path, resize):
    img_list = []
    for filename in os.listdir(path):
        file_path = path + '/' + filename
        if os.path.isdir(file_path):
            img_list += get_all_images(file_path, resize)
        elif filename.lower().endswith('.jpg'):
            with Image.open(file_path) as img:
                #transform images before saving
                    #grayscale
                img = img.convert('LA')
                    #resize
                img = img.resize(resize)
                img_list.append(np.array(img))
          
    return img_list
    
#convert back to img
#arr2im = Image.fromarray(im2arr)
from random import sample

def tt_split(img_array, lbl_array, test_ratio):
    l = img_array.shape[0] #length of data 
    f = int(l*test_ratio) #number of elements you need (test elems)
    test_indices = sample(range(l),f)
    train_indices = [i for i in range(l) if i not in test_indices]
    
    if (bool(set(test_indices) & set(train_indices))):
        print('WARNING: incorrect split! Test and Train overlap!')
        

    test_data_img = img_array[test_indices]
    train_data_img = img_array[train_indices]
    test_data_lbl = lbl_array[test_indices]
    train_data_lbl = lbl_array[train_indices]
    
    return train_data_img, test_data_img, train_data_lbl, test_data_lbl
    
    

def read_shoe_data(directory, dtype=tf.uint8, test_ratio=0.2, resize=(28, 28)):
    #resize is (width, height)

    img_list = []
    label_list = []
    for filename in os.listdir(directory):
        path = directory + '/' + filename
        if os.path.isdir(path):
            label = filename
            label_path = path
            #add all images in all subfolders to list
            img_list_for_label = get_all_images(label_path, resize)
            img_list += img_list_for_label
            label_list += ([label_to_int[label]] * len(img_list_for_label))
            
            print('current total img_list has len:', len(img_list))
            print('current total label_list has len:', len(label_list))
    img_array = np.array(img_list)
    label_array = np.array(label_list)
            
    return tt_split(img_array, label_array, test_ratio)
            
        
train_data_img, test_data_img, train_data_lbl, test_data_lbl = read_shoe_data('data/ut-zap50k-images')

  from ._conv import register_converters as _register_converters


current total img_list has len: 12832
current total label_list has len: 12832
current total img_list has len: 18573
current total label_list has len: 18573
current total img_list has len: 19856
current total label_list has len: 19856
current total img_list has len: 50025
current total label_list has len: 50025


In [3]:
print('train_data_img len:', train_data_img.shape)
print('test_data_img len:', test_data_img.shape)

train_data_img len: (40020, 28, 28, 2)
test_data_img len: (10005, 28, 28, 2)


In [4]:
img = Image.fromarray(train_data_img[0])
img.show()

img = Image.fromarray(test_data_img[0])
img.show()

In [5]:
class ImageLabelHolder():
    def __init__(self, image_array, label_array):
        self.num_examples = len(image_array)
        self.images = image_array
        self.labels = label_array
        

In [6]:
class Dataset():
    def __init__(self, train_img_lbls, test_img_lbls):
        self.train = train_img_lbls
        self.test = test_img_lbls

In [7]:
dataset = Dataset(ImageLabelHolder(train_data_img, train_data_lbl), ImageLabelHolder(test_data_img, test_data_lbl))

In [11]:
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Converts MNIST data to TFRecords file format with Example protos."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys

import tensorflow as tf

from tensorflow.contrib.learn.python.learn.datasets import mnist

FLAGS = None

dirname = 'shoetmp/data'


def _int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def convert_to(data_set, name):
  """Converts a dataset to tfrecords."""
  images = data_set.images
  labels = data_set.labels
  num_examples = data_set.num_examples

  if images.shape[0] != num_examples:
    raise ValueError('Images size %d does not match label size %d.' %
                     (images.shape[0], num_examples))
  rows = images.shape[1]
  cols = images.shape[2]
  depth = images.shape[3]

  filename = os.path.join(FLAGS.directory, name + '.tfrecords')
  print('Writing', filename)
  with tf.python_io.TFRecordWriter(filename) as writer:
    for index in range(num_examples):
      image_raw = images[index].tostring()
      example = tf.train.Example(
          features=tf.train.Features(
              feature={
                  'height': _int64_feature(rows),
                  'width': _int64_feature(cols),
                  'depth': _int64_feature(depth),
                  'label': _int64_feature(int(labels[index])),
                  'image_raw': _bytes_feature(image_raw)
              }))
      writer.write(example.SerializeToString())


def old_main(unused_argv):
  print(type(FLAGS))
  print(FLAGS.directory)
  # Get the data.

  #this implicitly makes the relevant directory
  data_sets = mnist.read_data_sets(FLAGS.directory,
                                   dtype=tf.uint8,
                                   reshape=False,
                                   validation_size=FLAGS.validation_size)
    
  print(type(data_sets.train.images))

  img_train = data_sets.train.images
    
  print(img_train.shape)

  # Convert to Examples and write the result to TFRecords.
  convert_to(data_sets.train, 'train')
  convert_to(data_sets.validation, 'validation')
  convert_to(data_sets.test, 'test')


    
    

def main(unused_argv):
  # Get the data.

  #read in images from files and build a numpy array after resizing to 28x28 and gray-scaling.
    
  #need a training and testing examples. 

  #may as welll include labels
    

  data_sets = dataset

  if not os.path.exists(dirname):
      os.makedirs(dirname)

  # Convert to Examples and write the result to TFRecords.
  convert_to(data_sets.train, 'train')
  #convert_to(data_sets.validation, 'validation')
  convert_to(data_sets.test, 'test')


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--directory',
      type=str,
      default=dirname,
      help='Directory to download data files and write the converted result'
  )
  parser.add_argument(
      '--validation_size',
      type=int,
      default=5000,
      help="""\
      Number of examples to separate from the training data for the validation
      set.\
      """
  )
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

Writing shoetmp/data/train.tfrecords
Writing shoetmp/data/test.tfrecords


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
