In [None]:
#library to work with vectors and matrix
import numpy as np
import h5py
import cv2
import os, inspect

#garbage collector
import gc
#system
import sys
#work with xml file
import xml.etree.cElementTree as ET

#plot figures, charts, etc
import matplotlib.pyplot as plt 

from tqdm import tqdm

#preprocessing method
from preprocessing import rgb_preprocessing, depth_preprocessing

In [None]:
def createFileHDF5(pathFile, nameFile):
    """Open a new HDF5 file and lay out the empty dataset skeleton.

    The file is opened with mode 'w' at os.path.join(pathFile, nameFile) and
    receives three top-level groups ('train', 'dev', 'test'); each of them
    gets an 'input' and an 'output' sub group.

    Returns the h5py.File handle, still open -- closing it is up to the caller.
    """
    h5file = h5py.File(os.path.join(pathFile, nameFile), 'w')
    for splitName in ('train', 'dev', 'test'):
        splitGroup = h5file.create_group(splitName)
        for streamName in ('input', 'output'):
            splitGroup.create_group(streamName)
    # the handle is deliberately not closed here: it is handed back to the caller
    return h5file

In [None]:
def loadTag(pathDataInput, pathDataOutput, holdout, windows, shuffle=False):
    dataImages = []
    if sum(holdout) ==100:
        onlyfilesInput = [f for f in os.listdir(pathDataInput) if os.path.isdir(os.path.join(pathDataInput, f))]
        onlyfilesOutput = [f for f in os.listdir(pathDataOutput) if os.path.isdir(os.path.join(pathDataOutput, f))]
        if windows>=1:
            for pathInput, pathOutput in zip(onlyfilesInput,onlyfilesOutput):
                inImg = os.listdir(os.path.join(pathDataInput,pathInput))
                outImg = os.listdir(os.path.join(pathDataOutput,pathOutput))
                zipped = list(zip(inImg,outImg))
                np.random.shuffle(zipped)
                imgs = [(pathInput+"/"+inImg[img], pathOutput+"/"+outImg[img]) for img in range(0,len(inImg),windows)]
                dataImages.extend(np.squeeze(imgs))
        else:
            raise Exception('{0} should be more than 1.'.format(windows))
    else:
        raise Exception('{0} should sum 1.'.format(str(holdout)))
    if shuffle:
            np.random.shuffle(dataImages)
    train = int(np.floor(len(dataImages)*(holdout[0]/100)))
    dev = int(np.floor(len(dataImages)*(holdout[1]/100)))
    dataImages = np.array(dataImages)
    return [dataImages[:train], dataImages[train:train+dev], dataImages[train+dev:]],pathDataInput, pathDataOutput

In [None]:
def readImgRGB(pathRGB, imgName, shape):
    """Load one colour image and run it through rgb_preprocessing.

    Parameters
    ----------
    pathRGB : str
        Base directory of the RGB data.
    imgName : str
        Image path relative to pathRGB.
    shape : sequence
        Target shape; shape[0] and shape[1] are forwarded to rgb_preprocessing.

    Returns
    -------
    Whatever rgb_preprocessing returns for the RGB-ordered image.

    Raises
    ------
    IOError
        If OpenCV could not read the file.
    """
    addr = os.path.join(pathRGB, imgName)
    img = cv2.imread(addr)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an obscure error inside cvtColor
        raise IOError("could not read image: {0}".format(addr))
    # OpenCV loads images as BGR; convert to RGB before preprocessing
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return rgb_preprocessing(img, shape[0], shape[1])

In [None]:
def readImgDepth(pathDepth, imgName, shape):
    """Load one depth map stored as XML and run it through depth_preprocessing.

    The XML is expected to contain, below the root, an element named after the
    file (without extension) holding 'height', 'width' and a whitespace
    separated 'data' list of integers.

    Parameters
    ----------
    pathDepth : str
        Base directory of the depth data.
    imgName : str
        XML path relative to pathDepth.
    shape : sequence
        Target shape; shape[0] and shape[1] are forwarded to depth_preprocessing.

    Returns
    -------
    numpy.ndarray
        The result of depth_preprocessing wrapped in np.array.

    Raises
    ------
    ValueError
        If one of the expected XML tags is missing or empty, or the data does
        not fit height x width.
    """
    tree = ET.parse(os.path.join(pathDepth, imgName))
    # the element holding the matrix is named after the file itself; basename
    # copes with names that have no directory part or several of them
    filename, _ = os.path.splitext(os.path.basename(imgName))

    fields = {}
    for tag in ('data', 'height', 'width'):
        node = tree.find('%s/%s' % (filename, tag))  # look up the tag inside the xml
        if node is None or node.text is None:
            raise ValueError("tag '{0}/{1}' not found in {2}".format(filename, tag, imgName))
        fields[tag] = node.text

    height = int(fields['height'])
    width = int(fields['width'])
    # convert all tokens in one go instead of one np.int16 call per value
    depthData = np.array(fields['data'].split(), dtype=np.int16).reshape((height, width))
    depthData = depth_preprocessing(depthData, shape[0], shape[1])
    return np.array(depthData)

In [None]:
def createDataHDF5(filehdf5, tags, inPath, outPath, shapeImg):
    """Fill the 'imgs' dataset of every <split>/<stream> group of the HDF5 file.

    Parameters
    ----------
    filehdf5 : open HDF5 file
        File whose top-level groups are walked; only 'train', 'dev' and 'test'
        (and inside them 'input' and 'output') are filled.
    tags : sequence
        [train, dev, test]; each entry is a sequence of (input name, output name)
        pairs.
    inPath : str
        Base directory handed to readImgRGB for the 'input' stream.
    outPath : str
        Base directory handed to readImgDepth for the 'output' stream.
    shapeImg : sequence
        Shape of one stored sample; also forwarded to the readers.

    For each filled group a dataset named 'imgs' of shape
    (number of pairs,) + shapeImg is created: float32 for 'input',
    float64 for 'output'.
    """
    # position of each split inside tags
    splitIndex = {"train": 0, "dev": 1, "test": 2}
    # stream name -> (column of the pair, base directory, reader, stored dtype);
    # built here so the reader functions are looked up at call time
    streams = {
        "input": (0, inPath, readImgRGB, np.float32),
        "output": (1, outPath, readImgDepth, np.float64),
    }
    sampleShape = tuple(shapeImg)

    for group in filehdf5:
        print("set up group {0} in h5py file.".format(group))
        if group not in splitIndex:
            continue
        samples = tags[splitIndex[group]]
        for subgroup in filehdf5[group]:
            if subgroup not in streams:
                continue
            column, basePath, reader, dtype = streams[subgroup]
            print("=>set up {0} data from {1}".format(subgroup, group))
            uriHdf5 = filehdf5["{0}/{1}".format(group, subgroup)]
            dataset = uriHdf5.create_dataset("imgs", (len(samples),) + sampleShape, dtype)
            # enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can
            # only show a total/ETA when it wraps the sized sequence itself
            for i, sample in enumerate(tqdm(samples)):
                dataset[i, ...] = reader(basePath, sample[column], shapeImg)
    print("succesfully completed")

In [None]:
def main(pathFile, nameFile, pathDataInput, pathDataOutput, holdout, windows, shuffle, shapeImg):
    """Build the HDF5 dataset file from the RGB and depth directories.

    pathFile:String => path where the h5py file is created
    nameFile:String => name of the h5py file
    pathDataInput:String => path of the rgb directory (input)
    pathDataOutput:String => path of the depth directory (output)
    holdout:Int array => how the dataset is divided [train, dev, test]
    windows:Int => sampling step inside each stream of the dataset (rgb, depth)
    shuffle:Bool => if true the samples are shuffled, otherwise their order is kept
    shapeImg:shape after preprocessing => (height, width, channels)
    """
    # collect the samples first: loadTag validates holdout/windows, and a bad
    # argument should be reported before the output file is created
    tags, inPath, outPath = loadTag(pathDataInput, pathDataOutput, holdout, windows, shuffle)
    filehdf5 = createFileHDF5(pathFile, nameFile)
    try:
        createDataHDF5(filehdf5, tags, inPath, outPath, shapeImg)
    finally:
        # release the handle even when filling fails, so the file is not left
        # open in the running kernel
        filehdf5.close()
#currentPath = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 
#pathFile = os.path.join(currentPath, "preprocess data")
#nameFile = "CGAN.h5"   
#pathDataInput = os.path.join(currentPath, "dataset/rgb")
#pathDataOutput = os.path.join(currentPath, "dataset/depth")
#holdout = [80,15,5]
#windows = 10
#shuffle = True
#shapeImg = (224,224,3)
#main(pathFile, nameFile, pathDataInput, pathDataOutput, holdout, windows, shuffle, shapeImg)