In [1]:
import tensorflow as tf
from PIL import Image
import numpy as np
import os
import sys
import cv2

In [3]:
# Dataset locations, relative to the notebook's working directory.
# (test_path is not referenced by the cells visible here.)
train_path, test_path = "./data/train/", "./data/test/"

# Integer class ids attached to each image by get_files().
LABEL_CAT, LABEL_DOG = 0, 1

# Batch size; not referenced by the cells visible here.
BATCH_SIZE = 50

In [5]:
def get_files(file_dir):
    '''
    Collect the cat/dog image paths in a directory together with their labels.

    A file is classified by the part of its name before the first '.':
    'cat' -> LABEL_CAT, 'dog' -> LABEL_DOG. Any other entry (hidden files,
    notes, ...) is skipped instead of being counted as a dog.

    Args:
        file_dir: file directory (with or without a trailing separator)
    Returns:
        list of images and labels, shuffled together so that
        image_list[i] and label_list[i] still describe the same file.
        Labels are plain Python ints.
    '''
    cats, dogs = [], []
    skipped = 0
    # Sort first: os.listdir order is arbitrary, and a stable input order is
    # what makes the shuffle below reproducible under np.random.seed().
    for entry in sorted(os.listdir(file_dir)):
        prefix = entry.split('.')[0]
        path = os.path.join(file_dir, entry)
        if prefix == 'cat':
            cats.append(path)
        elif prefix == 'dog':
            dogs.append(path)
        else:
            skipped += 1
    print('There are %d cats\nThere are %d dogs' %(len(cats), len(dogs)))
    if skipped:
        print('Skipped %d file(s) that are neither cat nor dog images' % skipped)

    image_list = cats + dogs
    label_list = [LABEL_CAT] * len(cats) + [LABEL_DOG] * len(dogs)

    # Shuffle paths and labels with one shared permutation; this keeps the
    # labels as ints instead of round-tripping them through a string array.
    order = np.random.permutation(len(image_list))
    image_list = [image_list[i] for i in order]
    label_list = [int(label_list[i]) for i in order]

    return image_list, label_list
 
def int64_feature(value):
    """Wrapper for inserting int64 features into Example proto.

    Args:
        value: a single int, or a list/tuple of ints.
    Returns:
        A tf.train.Feature whose int64_list holds the value(s).
    """
    # A scalar is wrapped into a one-element list; sequences pass through.
    if not isinstance(value, (list, tuple)):
        value = [value]
    # The return must sit outside the `if`, otherwise list input yields None.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
 
def bytes_feature(value):
    """Wrapper for inserting a single bytes value into Example proto.

    The value is placed in a one-element BytesList, which is then wrapped
    in a tf.train.Feature.
    """
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

## Save the data in TFRecord format
def convert_to_tfrecord(images, labels, save_dir, name):
    '''convert all images and labels to one tfrecord file.

    Each image is read with OpenCV, resized to 208x208, converted from BGR
    to RGB channel order and stored as raw bytes under the 'image_raw' key,
    next to its integer 'label'. Images that cannot be read are reported
    and skipped.

    Args:
        images: list of image directories, string type
        labels: list of labels, int type (same length as images)
        save_dir: the directory to save tfrecord file, e.g.: '/home/folder1/'
        name: the name of tfrecord file, string type, e.g.: 'train'
    Return:
        no return
    Raises:
        ValueError: if images and labels do not have the same length.
    Note:
        converting needs some time, be patient...
    '''
    n_images = len(images)
    n_samples = len(labels)
    if n_images != n_samples:
        # Use len(): `images` is documented as a list and has no `.shape`.
        raise ValueError('Images size %d does not match label size %d.' %(n_images, n_samples))

    filename = os.path.join(save_dir, name + '.tfrecords')

    # wait some time here, transforming need some time based on the size of your data.
    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
    skipped = 0
    try:
        for i, (path, label) in enumerate(zip(images, labels)):
            try:
                sys.stdout.write("\r>>Coverting image %d / %d " % (i+1, n_samples))
                sys.stdout.flush()
                image = cv2.imread(path)
                # cv2.imread reports failure by returning None, not by raising.
                if image is None:
                    print('Could not read:', path)
                    print('Skip it!\n')
                    skipped += 1
                    continue
                image = cv2.resize(image, (208, 208))
                # OpenCV loads images as BGR; store them as RGB.
                rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                example = tf.train.Example(features=tf.train.Features(feature={
                                            'label': int64_feature(int(label)),
                                            'image_raw': bytes_feature(rgb_image.tobytes())}))
                writer.write(example.SerializeToString())
            except IOError as e:
                print('Could not read:', path)
                print('error: %s' %e)
                print('Skip it!\n')
                skipped += 1
    finally:
        # Always close the writer so the file handle is released.
        writer.close()
    sys.stdout.write("\n")
    sys.stdout.flush()
    if skipped:
        print('Skipped %d unreadable image(s).' % skipped)
    print('Transform done!')

# Build the shuffled (path, label) lists from the training directory, then
# serialize them into 'tfrecoard/CatVsDog.tfrecords'.
# NOTE(review): 'tfrecoard/' looks like a misspelling of 'tfrecord/'; it is a
# runtime path, so it is left untouched here - confirm the directory name on
# disk (presumably it must already exist before this cell runs).
train_images, train_labels = get_files(train_path)
convert_to_tfrecord(train_images, train_labels, "tfrecoard/", "CatVsDog")


There are 12500 cats
There are 12500 dogs

Transform start......
>>Coverting image 25000 / 25000 
Transform done!
