In [1]:
import os

import numpy as np

#### Load images 

In [2]:
import scipy.misc

In [3]:
def load_training_img(index, file_path, file_prefix):
    """Read one training image from disk and drop its last channel.

    The file name is assembled by plain string concatenation:
    ``file_path + file_prefix + <index padded to 6 digits> + '.png'``,
    so ``file_path`` must already end with a path separator.

    Args:
        index: Image number; converted with ``str`` and zero-padded on the
            left to six characters.
        file_path: Leading part of the path (directory, including the
            trailing separator).
        file_prefix: Text placed between the directory and the number.

    Returns:
        The array produced by ``scipy.misc.imread`` with the final entry
        along its third axis removed (presumably the alpha channel of an
        RGBA image -- TODO confirm).
    """
    # NOTE(review): scipy.misc.imread was deprecated in SciPy 1.0 and removed
    # in SciPy 1.2, so this only runs against an older SciPy installation.
    image_name = str(index).zfill(6) + '.png'
    full_path = file_path + file_prefix + image_name
    pixels = scipy.misc.imread(full_path)
    return pixels[:, :, :-1]


def load_images(lower, upper, file_path, file_prefix):
    """Load the images numbered ``lower`` .. ``upper - 1`` into one array.

    Args:
        lower: First image index (inclusive).
        upper: Last image index (exclusive).
        file_path: Passed through to ``load_training_img``.
        file_prefix: Passed through to ``load_training_img``.

    Returns:
        ``np.asarray`` of the per-image arrays, stacked in index order along
        a new leading axis.
    """
    loaded = []
    for img_index in range(lower, upper):
        loaded.append(load_training_img(img_index, file_path, file_prefix))
    return np.asarray(loaded)

def flatten(imgs):
    """Collapse every axis after the first into a single feature axis.

    Args:
        imgs: NumPy array whose first axis indexes the examples.

    Returns:
        ``imgs`` reshaped to ``(imgs.shape[0], n_features)`` where
        ``n_features`` is the product of the remaining dimensions. A 1-D
        input of length ``n`` becomes shape ``(n, 1)``.
    """
    # dtype=int keeps the product an integer even when the trailing shape is
    # empty; the default result for an empty product is the float 1.0, which
    # reshape rejects.
    n_features = np.prod(imgs.shape[1:], dtype=int)
    return imgs.reshape((imgs.shape[0], n_features))

# def normalize_and_flatten(imgs):
#     return (imgs / 255.0).reshape(imgs.shape[0], imgs.shape[1]*imgs.shape[2]*imgs.shape[3])

In [4]:
def load_labeling_data(filename, lower, upper, mask=-1):
    """Parse label rows ``lower:upper`` of a comma-separated text file.

    Each selected line is split on ``','``; the piece after the final comma
    is discarded (the lines presumably end with a trailing comma -- TODO
    confirm against the label file) and the remaining pieces are parsed as
    floats.

    Args:
        filename: Path of the label file.
        lower: Index of the first line to use (slice start).
        upper: Index one past the last line to use (slice stop).
        mask: ``-1`` (default) keeps all 5*5 columns. Any other value ``m``
            keeps only columns ``5*m`` up to ``5*(m+1)``, i.e. one group of
            five columns.

    Returns:
        Float array of shape ``(upper - lower, 25)``, or
        ``(upper - lower, 5)`` when ``mask`` is not ``-1``.

    Raises:
        ValueError: If a field cannot be parsed as a float, or the parsed
            data cannot be reshaped to the expected shape (for example when
            the file has fewer lines than requested).
    """
    group_size = 5
    n_groups = 5
    n_rows = upper - lower

    # The context manager closes the handle even if reading raises.
    with open(filename) as label_file:
        lines = label_file.readlines()[lower:upper]

    rows = [[float(field) for field in line.split(',')[:-1]] for line in lines]
    if rows:
        data = np.asarray(rows)
    else:
        # np.asarray([]) is 1-D, which would break the column slice below.
        data = np.empty((0, group_size * n_groups))

    if mask != -1:
        data = data[:, group_size * mask:group_size * (mask + 1)]
        return data.reshape(n_rows, group_size)
    return data.reshape(n_rows, group_size * n_groups)



In [5]:
# Location of the raw data. The default is the original local directory; set
# the FACTORY_ROBOT_DATA environment variable to point somewhere else.
# Joining with '' guarantees a trailing separator, which the loaders need
# because they build file names by string concatenation.
data_path = os.path.join(
    os.environ.get('FACTORY_ROBOT_DATA', '/Users/an/factory-robot-data/'), '')
img_prefix = 'capture'
n_examples = 50000

# One flattened pixel row per image, and one label row per image.
images = flatten(load_images(0, n_examples, data_path, img_prefix))
labels = load_labeling_data(data_path + 'labels.dat', 0, n_examples)

assert images.shape[0] == labels.shape[0] == n_examples, \
    'images and labels must both contain n_examples rows'

In [7]:
# Sizes of the three consecutive splits; the test split takes whatever
# remains after the training and validation examples.
n_train = 40000
n_validation = 5000
n_test = n_examples - (n_train + n_validation)

# Row widths of the two HDF5 datasets created below.
# NOTE(review): 64*64*3 is presumably height * width * colour channels of one
# flattened image, and 5*5 the full label row -- confirm against the data.
n_image_features = 64 * 64 * 3
n_label_features = 5 * 5

# label_categories = ['human', 
#                     'pickup', 
#                     'terminal', 
#                     'fire', 
#                     'walls']
# n_label_categories = len(label_categories)
# n_label_features_per_category = 5

# label_category_data_train = [[]] * n_label_categories
# label_category_data_validation = [[]] * n_label_categories
# label_category_data_test = [[]] * n_label_categories
# for i in xrange(n_label_categories):
#     label_category_data_train[i] = labels[:n_train, 
#                                           i*n_label_features_per_category:
#                                           (i+1)*n_label_features_per_category]
#     label_category_data_validation[i] = labels[n_train:n_train+n_validation, i*5:(i+1)*5]
#     label_category_data_test[i] = labels[n_train+n_validation:, i*5:(i+1)*5]

In [8]:
import h5py

# Output container for the dataset. Mode 'w' creates the file and, per the
# h5py documentation, truncates it if it already exists.
f = h5py.File('factory_dataset.hdf5', 'w')

In [9]:
# Cut the image rows at the two split boundaries; the three pieces are
# images[:n_train], images[n_train:n_train+n_validation] and the remainder.
train_images, validation_images, test_images = np.split(
    images, [n_train, n_train + n_validation])

In [10]:
# Fixed-size 2-D dataset holding one row of uint8 values per example.
image_features = f.create_dataset(
    'image_features',
    shape=(n_examples, n_image_features),
    dtype='uint8')

In [11]:
# Fixed-size 2-D dataset holding one row of float32 label values per example.
label_features = f.create_dataset(
    'label_features',
    shape=(n_examples, n_label_features),
    dtype='float32')

In [12]:
# Write every flattened image row into the HDF5 dataset in one assignment.
image_features[...] = images

In [13]:
# Write every label row into the HDF5 dataset; the dataset was created with
# dtype float32, so the values are stored at that precision.
label_features[...] = labels

In [14]:
# Name the two axes of each dataset: rows are 'batch' in both, columns are
# 'feature' for the images and 'index' for the labels.
for dataset, axis_labels in ((image_features, ('batch', 'feature')),
                             (label_features, ('batch', 'index'))):
    for axis, axis_label in enumerate(axis_labels):
        dataset.dims[axis].label = axis_label

In [15]:
from fuel.datasets.hdf5 import H5PYDataset

# Each split covers the same half-open row range in both datasets:
# train first, then validation, then test up to n_examples.
split_dict = {
    split_name: {'image_features': bounds, 'label_features': bounds}
    for split_name, bounds in (
        ('train', (0, n_train)),
        ('validation', (n_train, n_train + n_validation)),
        ('test', (n_train + n_validation, n_examples)))
}
# Store the split description as the file-level 'split' attribute.
f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

In [16]:
# Push any buffered writes to disk, then release the HDF5 file handle.
f.flush()
f.close()