# Dataset-Generator.

In [1]:
import os
from datagenerator import DataGenerator
import pickle
import random
import datetime

Using TensorFlow backend.


# Setting the parameters.

In [2]:
# Resolve where the raw dataset lives. An optional "datasetpath.txt" in the
# working directory overrides the default relative location.
if os.path.exists("datasetpath.txt"):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open("datasetpath.txt", "r") as dataset_path_file:
        # All newline characters are dropped so a trailing line break in the
        # file does not end up in the path.
        dataset_path = dataset_path_file.read().replace("\n", "")
else:
    dataset_path = "../data"

# Parameter set for generating voxelgrid datasets.
# "dataset_size_train" / "dataset_size_test" are the sample counts requested
# from the generator; "random_seed" seeds the train/test split.
dataset_parameters_voxelgrids = {
    "input_type": "voxelgrid",
    "output_targets": ["height", "weight"],
    "random_seed": 666,
    "voxelgrid_target_shape": (32, 32, 32),
    "voxel_size_meters": 0.1,
    "voxelgrid_random_rotation": True,
    "dataset_size_train": 6000,
    "dataset_size_test": 1000,
}

# Parameter set for generating pointcloud datasets.
# "dataset_size_train" / "dataset_size_test" are the sample counts requested
# from the generator; "random_seed" seeds the train/test split.
dataset_parameters_pointclouds = {
    "input_type": "pointcloud",
    "output_targets": ["height", "weight"],
    "random_seed": 666,
    "pointcloud_target_size": 30000,
    "pointcloud_random_rotation": True,
    "dataset_size_train": 3000,
    "dataset_size_test": 500,
}

# Creating the data-generator.
Builds a `DataGenerator` from one parameter dictionary; settings that are missing from the dictionary are passed as `None`.

In [3]:
def create_datagenerator(dataset_parameters):
    """Build a DataGenerator from a parameter dictionary and print its statistics.

    Args:
        dataset_parameters: Dictionary of settings. "input_type" and
            "output_targets" are required (a missing key raises KeyError).
            The voxelgrid and pointcloud settings are optional and are
            forwarded as None when absent.

    Returns:
        The newly created DataGenerator, rooted at the module-level
        ``dataset_path``.
    """
    print("Creating data-generator...")

    # Mandatory settings, looked up strictly.
    generator_kwargs = {
        "dataset_path": dataset_path,
        "input_type": dataset_parameters["input_type"],
        "output_targets": dataset_parameters["output_targets"],
    }

    # Input-type specific settings; each parameter dictionary only carries
    # the ones relevant to it, so the others fall back to None.
    optional_keys = (
        "voxelgrid_target_shape",
        "voxel_size_meters",
        "voxelgrid_random_rotation",
        "pointcloud_target_size",
        "pointcloud_random_rotation",
    )
    for optional_key in optional_keys:
        generator_kwargs[optional_key] = dataset_parameters.get(optional_key)

    generator = DataGenerator(**generator_kwargs)
    generator.print_statistics()
    return generator

# Analysis.

In [4]:
def analyze(datagenerator):
    """Run all analysis routines of the data-generator, in a fixed order.

    Args:
        datagenerator: Object providing ``analyze_files``, ``analyze_targets``,
            ``analyze_pointclouds`` and ``analyze_voxelgrids``.
    """
    analysis_steps = (
        "analyze_files",
        "analyze_targets",
        "analyze_pointclouds",
        "analyze_voxelgrids",
    )
    for step_name in analysis_steps:
        # Look the method up right before calling it, so earlier steps have
        # already run if a later one is unavailable.
        getattr(datagenerator, step_name)()

# Do the train-test-split and generate.

In [5]:
def split_and_generate(datagenerator, dataset_parameters):
    """Split the QR-codes 80/20 into train and test and generate both datasets.

    Args:
        datagenerator: Object exposing a ``qrcodes`` sequence and a
            ``generate(size=..., qrcodes_to_use=..., verbose=...)`` method
            whose result can be advanced with ``next``.
        dataset_parameters: Dictionary providing "random_seed",
            "dataset_size_train" and "dataset_size_test".

    Returns:
        Tuple ``(dataset_train, dataset_test)``: the first item produced by
        the generator for the training and the testing QR-codes.
    """

    # Seed the module-level RNG so the shuffle below is reproducible.
    # NOTE(review): generate() may also draw from this global RNG - confirm
    # before switching to a private random.Random instance.
    random.seed(dataset_parameters["random_seed"])

    # Shuffle a copy; the generator's own QR-code list stays untouched.
    shuffled_qrcodes = datagenerator.qrcodes[:]
    random.shuffle(shuffled_qrcodes)

    # The first 80 % go to training, the rest to testing; both parts sorted.
    cut = int(0.8 * len(shuffled_qrcodes))
    qrcodes_train = sorted(shuffled_qrcodes[:cut])
    qrcodes_test = sorted(shuffled_qrcodes[cut:])
    print("")

    # Report which QR-codes ended up in which partition.
    partitions = (
        ("QR-Codes for training:", qrcodes_train),
        ("QR-Codes for testing:", qrcodes_test),
    )
    for headline, codes in partitions:
        print(headline, " ".join(codes))
        print("")

    print("Generating training data...")
    train_batches = datagenerator.generate(
        size=dataset_parameters["dataset_size_train"],
        qrcodes_to_use=qrcodes_train,
        verbose=True,
    )
    dataset_train = next(train_batches)

    print("Generating testing data...")
    test_batches = datagenerator.generate(
        size=dataset_parameters["dataset_size_test"],
        qrcodes_to_use=qrcodes_test,
        verbose=True,
    )
    dataset_test = next(test_batches)

    print("Done.")
    return dataset_train, dataset_test

# Function for saving the dataset.

In [6]:
def save_dataset(dataset_train, dataset_test, dataset_parameters):
    """Pickle the train/test datasets together with their parameters.

    The file is written relative to the current working directory and is
    named ``<YYYYmmdd-HHMM>-<input_type>-dataset.p``, where the timestamp is
    the local time of the call.

    Args:
        dataset_train: Training dataset to store.
        dataset_test: Testing dataset to store.
        dataset_parameters: Parameter dictionary; its "input_type" entry
            becomes part of the file name and the whole dictionary is stored
            alongside the data.
    """
    print("Saving dataset...")
    datetime_string = datetime.datetime.now().strftime("%Y%m%d-%H%M")
    dataset_name = datetime_string + "-" + dataset_parameters["input_type"] + "-dataset.p"

    # Write through a context manager so the file is flushed and closed
    # before "Saved" is reported, rather than whenever the handle happens to
    # be garbage-collected.
    with open(dataset_name, "wb") as dataset_file:
        pickle.dump((dataset_train, dataset_test, dataset_parameters), dataset_file)

    print("Saved " + dataset_name)

# Generate with parameters.

In [7]:
# Parameter sets to process, in this order: pointclouds first, then voxelgrids.
dataset_parameters_to_use = [
    dataset_parameters_pointclouds,
    dataset_parameters_voxelgrids,
]

# One full pass per parameter set: build generator, split + generate, save.
for dataset_parameters in dataset_parameters_to_use:
    datagenerator = create_datagenerator(dataset_parameters)
    # Optional inspection step, disabled by default:
    # analyze(datagenerator)
    dataset_train, dataset_test = split_and_generate(datagenerator, dataset_parameters)
    save_dataset(dataset_train, dataset_test, dataset_parameters)
    

Creating data-generator...
/Users/tristanbehrens/Datasets/welthungerhilfe/20180727
QR-Code SAM-GOV-025 has 1 different manual measurements
   Target [67, 6.7] with 160 JPGs and 51 PCDs.
QR-Code SAM-GOV-003 has 2 different manual measurements
   Target [92.7, 9.7] with 0 JPGs and 0 PCDs.
   Target [75.3, 7.1] with 0 JPGs and 0 PCDs.
QR-Code SAM-SNG-073 has 3 different manual measurements
   Target [82, 8.8] with 122 JPGs and 41 PCDs.
   Target [82, 8.8] with 136 JPGs and 40 PCDs.
   Target [82, 8.5] with 0 JPGs and 0 PCDs.
QR-Code SAM-GOV-014 has 1 different manual measurements
   Target [91.1, 11.1] with 279 JPGs and 77 PCDs.
QR-Code SAM-GOV-087 has 1 different manual measurements
   Target [85.7, 10.2] with 225 JPGs and 60 PCDs.
QR-Code SAM-GOV-004 has 1 different manual measurements
   Target [76.2, 8.3] with 235 JPGs and 71 PCDs.
QR-Code SAM-GOV-026 has 1 different manual measurements
   Target [80.1, 9.2] with 145 JPGs and 47 PCDs.
QR-Code SAM-GOV-068 has 1 different manual measure

  4% (147 of 3000) |#                    | Elapsed Time: 0:00:50 ETA:   0:15:03

Wrong number of columns at line 26769


  6% (203 of 3000) |#                    | Elapsed Time: 0:01:10 ETA:   0:16:46

Wrong number of columns at line 26558


  7% (223 of 3000) |#                    | Elapsed Time: 0:01:18 ETA:   0:17:56

Wrong number of columns at line 26553


  8% (258 of 3000) |#                    | Elapsed Time: 0:01:30 ETA:   0:15:53

Wrong number of columns at line 33561


 13% (404 of 3000) |##                   | Elapsed Time: 0:02:21 ETA:   0:14:10

Wrong number of columns at line 26769


 20% (627 of 3000) |####                 | Elapsed Time: 0:03:35 ETA:   0:12:50

Wrong number of columns at line 26553


 22% (661 of 3000) |####                 | Elapsed Time: 0:03:46 ETA:   0:12:48

Wrong number of columns at line 26670


 22% (670 of 3000) |####                 | Elapsed Time: 0:03:50 ETA:   0:12:50

Wrong number of columns at line 25616


 22% (676 of 3000) |####                 | Elapsed Time: 0:03:52 ETA:   0:13:49

Wrong number of columns at line 26736


 24% (720 of 3000) |#####                | Elapsed Time: 0:04:08 ETA:   0:15:12

Wrong number of columns at line 33561


 33% (1009 of 3000) |######              | Elapsed Time: 0:05:45 ETA:   0:11:11

Wrong number of columns at line 26636


 34% (1034 of 3000) |######              | Elapsed Time: 0:05:54 ETA:   0:11:06

Wrong number of columns at line 33561


 47% (1415 of 3000) |#########           | Elapsed Time: 0:07:54 ETA:   0:08:22

Wrong number of columns at line 26667


 47% (1417 of 3000) |#########           | Elapsed Time: 0:07:55 ETA:   0:09:27

Wrong number of columns at line 26636


 48% (1447 of 3000) |#########           | Elapsed Time: 0:08:05 ETA:   0:08:26

Wrong number of columns at line 33593


 48% (1450 of 3000) |#########           | Elapsed Time: 0:08:06 ETA:   0:09:31

Wrong number of columns at line 25919


 54% (1633 of 3000) |##########          | Elapsed Time: 0:09:06 ETA:   0:07:10

Wrong number of columns at line 26769


 57% (1729 of 3000) |###########         | Elapsed Time: 0:09:41 ETA:   0:06:55

Wrong number of columns at line 26636


 58% (1750 of 3000) |###########         | Elapsed Time: 0:09:48 ETA:   0:06:44

Wrong number of columns at line 25919
Wrong number of columns at line 25616


 67% (2018 of 3000) |#############       | Elapsed Time: 0:11:19 ETA:   0:05:39

Wrong number of columns at line 33593


 67% (2032 of 3000) |#############       | Elapsed Time: 0:11:24 ETA:   0:05:13

Wrong number of columns at line 26769


 70% (2111 of 3000) |##############      | Elapsed Time: 0:11:49 ETA:   0:04:31

Wrong number of columns at line 26526


 72% (2165 of 3000) |##############      | Elapsed Time: 0:12:07 ETA:   0:04:22

Wrong number of columns at line 26736


 75% (2257 of 3000) |###############     | Elapsed Time: 0:12:36 ETA:   0:03:50

Wrong number of columns at line 26769


 78% (2357 of 3000) |###############     | Elapsed Time: 0:13:08 ETA:   0:03:29

Wrong number of columns at line 33683


 80% (2402 of 3000) |################    | Elapsed Time: 0:13:23 ETA:   0:03:10

Wrong number of columns at line 26222


 80% (2428 of 3000) |################    | Elapsed Time: 0:13:31 ETA:   0:03:08

Wrong number of columns at line 26650


 84% (2536 of 3000) |################    | Elapsed Time: 0:14:06 ETA:   0:02:22

Wrong number of columns at line 26558


 85% (2562 of 3000) |#################   | Elapsed Time: 0:14:14 ETA:   0:02:23

Wrong number of columns at line 26222


 97% (2925 of 3000) |################### | Elapsed Time: 0:16:18 ETA:   0:00:24

Wrong number of columns at line 25919


100% (3000 of 3000) |####################| Elapsed Time: 0:16:43 Time:  0:16:43


Generating testing data...
Generating using QR-codes: ['SAM-GOV-001', 'SAM-GOV-008', 'SAM-GOV-013', 'SAM-GOV-034', 'SAM-GOV-045', 'SAM-GOV-052', 'SAM-GOV-090', 'SAM-SNG-013', 'SAM-SNG-015', 'SAM-SNG-066', 'SAM-SNG-072', 'SAM-SNG-075', 'SAM-SNG-081', 'SAM-SNG-083', 'SAM-SNG-091']


 25% (125 of 500) |#####                 | Elapsed Time: 0:00:42 ETA:   0:02:14

Wrong number of columns at line 25900


 56% (285 of 500) |############          | Elapsed Time: 0:01:34 ETA:   0:01:08

Wrong number of columns at line 25977


100% (500 of 500) |######################| Elapsed Time: 0:02:50 Time:  0:02:50


Done.
Saved 20180731-1245-pointcloud-dataset.p
Creating data-generator...
/Users/tristanbehrens/Datasets/welthungerhilfe/20180727
QR-Code SAM-GOV-025 has 1 different manual measurements
   Target [67, 6.7] with 160 JPGs and 51 PCDs.
QR-Code SAM-GOV-003 has 2 different manual measurements
   Target [92.7, 9.7] with 0 JPGs and 0 PCDs.
   Target [75.3, 7.1] with 0 JPGs and 0 PCDs.
QR-Code SAM-SNG-073 has 3 different manual measurements
   Target [82, 8.8] with 122 JPGs and 41 PCDs.
   Target [82, 8.8] with 136 JPGs and 40 PCDs.
   Target [82, 8.5] with 0 JPGs and 0 PCDs.
QR-Code SAM-GOV-014 has 1 different manual measurements
   Target [91.1, 11.1] with 279 JPGs and 77 PCDs.
QR-Code SAM-GOV-087 has 1 different manual measurements
   Target [85.7, 10.2] with 225 JPGs and 60 PCDs.
QR-Code SAM-GOV-004 has 1 different manual measurements
   Target [76.2, 8.3] with 235 JPGs and 71 PCDs.
QR-Code SAM-GOV-026 has 1 different manual measurements
   Target [80.1, 9.2] with 145 JPGs and 47 PCDs.
QR-

  2% (147 of 6000) |                     | Elapsed Time: 0:00:50 ETA:   0:30:09

Wrong number of columns at line 26769


  3% (203 of 6000) |                     | Elapsed Time: 0:01:09 ETA:   0:36:37

Wrong number of columns at line 26558


  3% (223 of 6000) |                     | Elapsed Time: 0:01:16 ETA:   0:30:44

Wrong number of columns at line 26553


  4% (258 of 6000) |                     | Elapsed Time: 0:01:27 ETA:   0:31:00

Wrong number of columns at line 33561


  6% (404 of 6000) |#                    | Elapsed Time: 0:02:16 ETA:   0:34:56

Wrong number of columns at line 26769


 10% (627 of 6000) |##                   | Elapsed Time: 0:03:40 ETA:   0:35:33

Wrong number of columns at line 26553


 11% (661 of 6000) |##                   | Elapsed Time: 0:03:52 ETA:   0:31:45

Wrong number of columns at line 26670


 11% (670 of 6000) |##                   | Elapsed Time: 0:03:55 ETA:   0:33:15

Wrong number of columns at line 25616


 11% (676 of 6000) |##                   | Elapsed Time: 0:03:58 ETA:   0:33:47

Wrong number of columns at line 26736


 12% (720 of 6000) |##                   | Elapsed Time: 0:04:13 ETA:   0:31:19

Wrong number of columns at line 33561


 16% (1009 of 6000) |###                 | Elapsed Time: 0:05:59 ETA:   0:27:50

Wrong number of columns at line 26636


 17% (1034 of 6000) |###                 | Elapsed Time: 0:06:08 ETA:   0:27:29

Wrong number of columns at line 33561


 23% (1415 of 6000) |####                | Elapsed Time: 0:08:14 ETA:   0:25:08

Wrong number of columns at line 26667


 23% (1417 of 6000) |####                | Elapsed Time: 0:08:15 ETA:   0:27:08

Wrong number of columns at line 26636


 24% (1447 of 6000) |####                | Elapsed Time: 0:08:25 ETA:   0:26:25

Wrong number of columns at line 33593


 24% (1450 of 6000) |####                | Elapsed Time: 0:08:26 ETA:   0:28:03

Wrong number of columns at line 25919


 27% (1633 of 6000) |#####               | Elapsed Time: 0:09:25 ETA:   0:23:30

Wrong number of columns at line 26769


 28% (1729 of 6000) |#####               | Elapsed Time: 0:09:56 ETA:   0:22:47

Wrong number of columns at line 26636


 29% (1750 of 6000) |#####               | Elapsed Time: 0:10:03 ETA:   0:22:32

Wrong number of columns at line 25919
Wrong number of columns at line 25616


 33% (2018 of 6000) |######              | Elapsed Time: 0:11:36 ETA:   0:22:16

Wrong number of columns at line 33593


 33% (2032 of 6000) |######              | Elapsed Time: 0:11:41 ETA:   0:22:49

Wrong number of columns at line 26769


 35% (2111 of 6000) |#######             | Elapsed Time: 0:12:07 ETA:   0:20:53

Wrong number of columns at line 26526


 36% (2165 of 6000) |#######             | Elapsed Time: 0:12:25 ETA:   0:21:23

Wrong number of columns at line 26736


 37% (2257 of 6000) |#######             | Elapsed Time: 0:12:57 ETA:   0:20:18

Wrong number of columns at line 26769


 39% (2357 of 6000) |#######             | Elapsed Time: 0:13:31 ETA:   0:19:54

Wrong number of columns at line 33683


 40% (2402 of 6000) |########            | Elapsed Time: 0:13:46 ETA:   0:19:20

Wrong number of columns at line 26222


 40% (2428 of 6000) |########            | Elapsed Time: 0:13:55 ETA:   0:19:59

Wrong number of columns at line 26650


 42% (2536 of 6000) |########            | Elapsed Time: 0:14:31 ETA:   0:18:30

Wrong number of columns at line 26558


 42% (2562 of 6000) |########            | Elapsed Time: 0:14:40 ETA:   0:19:14

Wrong number of columns at line 26222


 48% (2925 of 6000) |#########           | Elapsed Time: 0:16:46 ETA:   0:17:22

Wrong number of columns at line 25919


 52% (3131 of 6000) |##########          | Elapsed Time: 0:17:59 ETA:   0:16:03

Wrong number of columns at line 33593


 53% (3185 of 6000) |##########          | Elapsed Time: 0:18:18 ETA:   0:15:49

Wrong number of columns at line 25969


 53% (3215 of 6000) |##########          | Elapsed Time: 0:18:29 ETA:   0:17:16

Wrong number of columns at line 26667


 54% (3244 of 6000) |##########          | Elapsed Time: 0:18:39 ETA:   0:15:56

Wrong number of columns at line 25021


 62% (3740 of 6000) |############        | Elapsed Time: 0:21:29 ETA:   0:12:10

Wrong number of columns at line 26650


 67% (4021 of 6000) |#############       | Elapsed Time: 0:23:01 ETA:   0:11:20

Wrong number of columns at line 26636


 68% (4122 of 6000) |#############       | Elapsed Time: 0:23:34 ETA:   0:09:51

Wrong number of columns at line 25919


 68% (4130 of 6000) |#############       | Elapsed Time: 0:23:37 ETA:   0:10:09

Wrong number of columns at line 26553


 71% (4289 of 6000) |##############      | Elapsed Time: 0:24:30 ETA:   0:09:22

Wrong number of columns at line 26065
Wrong number of columns at line 25616


 74% (4481 of 6000) |##############      | Elapsed Time: 0:25:32 ETA:   0:08:01

Wrong number of columns at line 33481


 76% (4574 of 6000) |###############     | Elapsed Time: 0:26:03 ETA:   0:07:44

Wrong number of columns at line 25616


 76% (4619 of 6000) |###############     | Elapsed Time: 0:26:18 ETA:   0:07:37

Wrong number of columns at line 33593


 78% (4720 of 6000) |###############     | Elapsed Time: 0:26:51 ETA:   0:06:44

Wrong number of columns at line 26553


 82% (4950 of 6000) |################    | Elapsed Time: 0:28:06 ETA:   0:05:45

Wrong number of columns at line 26667


 83% (5024 of 6000) |################    | Elapsed Time: 0:28:30 ETA:   0:05:09

Wrong number of columns at line 25021


 84% (5061 of 6000) |################    | Elapsed Time: 0:28:42 ETA:   0:05:05

Wrong number of columns at line 33593


 84% (5084 of 6000) |################    | Elapsed Time: 0:28:50 ETA:   0:05:05

Wrong number of columns at line 33604


 86% (5207 of 6000) |#################   | Elapsed Time: 0:29:30 ETA:   0:04:13

Wrong number of columns at line 25021


 87% (5249 of 6000) |#################   | Elapsed Time: 0:29:44 ETA:   0:04:05

Wrong number of columns at line 25021


 88% (5322 of 6000) |#################   | Elapsed Time: 0:30:08 ETA:   0:03:35

Wrong number of columns at line 26553


 94% (5668 of 6000) |##################  | Elapsed Time: 0:32:00 ETA:   0:02:07

Wrong number of columns at line 33683


 96% (5813 of 6000) |################### | Elapsed Time: 0:32:47 ETA:   0:00:58

Wrong number of columns at line 26636


100% (6000 of 6000) |####################| Elapsed Time: 0:33:47 Time:  0:33:47


Generating testing data...
Generating using QR-codes: ['SAM-GOV-001', 'SAM-GOV-008', 'SAM-GOV-013', 'SAM-GOV-034', 'SAM-GOV-045', 'SAM-GOV-052', 'SAM-GOV-090', 'SAM-SNG-013', 'SAM-SNG-015', 'SAM-SNG-066', 'SAM-SNG-072', 'SAM-SNG-075', 'SAM-SNG-081', 'SAM-SNG-083', 'SAM-SNG-091']


 20% (202 of 1000) |####                 | Elapsed Time: 0:01:04 ETA:   0:04:13

Wrong number of columns at line 25900


 25% (252 of 1000) |#####                | Elapsed Time: 0:01:20 ETA:   0:04:18

Wrong number of columns at line 25900


 99% (991 of 1000) |#################### | Elapsed Time: 0:05:16 ETA:   0:00:02

Wrong number of columns at line 25977


100% (1000 of 1000) |####################| Elapsed Time: 0:05:19 Time:  0:05:19


Done.
Saved 20180731-1325-voxelgrid-dataset.p
