# Dataset-Generator.

In [1]:
import os
from datagenerator import DataGenerator
import pickle
import random
import datetime

Using TensorFlow backend.


# Setting the parameters.

In [2]:
if os.path.exists("datasetpath.txt"):
    dataset_path = open("datasetpath.txt", "r").read().replace("\n", "")
else:
    dataset_path = "../data"

# For creating voxelgrids.
dataset_parameters = {}
dataset_parameters["input_type"] = "voxelgrid"
dataset_parameters["output_targets"] = ["height", "weight"]    
dataset_parameters["random_seed"] = 666
dataset_parameters["voxelgrid_target_shape"] = (32, 32, 32)
dataset_parameters["voxel_size_meters"] = 0.1
dataset_parameters["voxelgrid_random_rotation"] = True
dataset_parameters["dataset_size_train"] = 6000
dataset_parameters["dataset_size_test"] = 1000

# For creating pointclouds.
#dataset_parameters = {}
#dataset_parameters["input_type"] = "pointcloud"
#dataset_parameters["output_targets"] = ["height", "weight"]    
#dataset_parameters["random_seed"] = 666
#dataset_parameters["pointcloud_target_size"] = 30000
#dataset_parameters["pointcloud_random_rotation"] = True
#dataset_parameters["dataset_size_train"] = 3000
#dataset_parameters["dataset_size_test"] = 500

# Creating the data-generator.
Makes use of the parameters.

In [3]:
print("Creating data-generator...")
data_generator = DataGenerator(
    dataset_path=dataset_path, 
    input_type=dataset_parameters["input_type"], 
    output_targets=dataset_parameters["output_targets"],
    voxelgrid_target_shape=dataset_parameters.get("voxelgrid_target_shape", None),
    voxel_size_meters=dataset_parameters.get("voxel_size_meters", None),
    voxelgrid_random_rotation=dataset_parameters.get("voxelgrid_random_rotation", None),
    pointcloud_target_size=dataset_parameters.get("pointcloud_target_size", None),
    pointcloud_random_rotation=dataset_parameters.get("pointcloud_random_rotation", None)
)
data_generator.print_statistics()

Creating data-generator...
/Users/tristanbehrens/Datasets/welthungerhilfe/20180727
QR-Code SAM-GOV-025 has 1 different manual measurements
   Target [67, 6.7] with 160 JPGs and 51 PCDs.
QR-Code SAM-GOV-003 has 2 different manual measurements
   Target [92.7, 9.7] with 0 JPGs and 0 PCDs.
   Target [75.3, 7.1] with 0 JPGs and 0 PCDs.
QR-Code SAM-SNG-073 has 3 different manual measurements
   Target [82, 8.8] with 122 JPGs and 41 PCDs.
   Target [82, 8.8] with 136 JPGs and 40 PCDs.
   Target [82, 8.5] with 0 JPGs and 0 PCDs.
QR-Code SAM-GOV-014 has 1 different manual measurements
   Target [91.1, 11.1] with 279 JPGs and 77 PCDs.
QR-Code SAM-GOV-087 has 1 different manual measurements
   Target [85.7, 10.2] with 225 JPGs and 60 PCDs.
QR-Code SAM-GOV-004 has 1 different manual measurements
   Target [76.2, 8.3] with 235 JPGs and 71 PCDs.
QR-Code SAM-GOV-026 has 1 different manual measurements
   Target [80.1, 9.2] with 145 JPGs and 47 PCDs.
QR-Code SAM-GOV-068 has 1 different manual measure

# Analysis.

In [4]:
do_analysis = False
#do_analysis = True

if do_analysis == True:
    data_generator.analyze_files()
    data_generator.analyze_targets()
    data_generator.analyze_pointclouds()
    data_generator.analyze_voxelgrids()
    # how much data per measure?
else:
    print("Skipped analysis.")

Skipped analysis.


# Do the train-test-split and generate.

In [None]:
# Do the split.
random.seed(dataset_parameters["random_seed"])
qrcodes_shuffle = data_generator.qrcodes[:]
random.shuffle(qrcodes_shuffle)
split_index = int(0.8 * len(qrcodes_shuffle))
qrcodes_train = sorted(qrcodes_shuffle[:split_index])
qrcodes_test = sorted(qrcodes_shuffle[split_index:])
del qrcodes_shuffle
print("")

print("QR-Codes for training:", " ".join(qrcodes_train))
print("")
print("QR-Codes for testing:", " ".join(qrcodes_test))
print("")

print("Generating training data...")
dataset_train = next(data_generator.generate(size=dataset_parameters["dataset_size_train"], qrcodes_to_use=qrcodes_train, verbose=True))

print("Generating testing data...")
dataset_test = next(data_generator.generate(size=dataset_parameters["dataset_size_test"], qrcodes_to_use=qrcodes_test, verbose=True))
    
print("Done.")


QR-Codes for training: SAM-02-003-01 SAM-GOV-002 SAM-GOV-003 SAM-GOV-004 SAM-GOV-005 SAM-GOV-011 SAM-GOV-012 SAM-GOV-014 SAM-GOV-023 SAM-GOV-025 SAM-GOV-026 SAM-GOV-033 SAM-GOV-035 SAM-GOV-036 SAM-GOV-037 SAM-GOV-038 SAM-GOV-041 SAM-GOV-042 SAM-GOV-043 SAM-GOV-044 SAM-GOV-046 SAM-GOV-049 SAM-GOV-050 SAM-GOV-051 SAM-GOV-054 SAM-GOV-066 SAM-GOV-068 SAM-GOV-087 SAM-GOV-088 SAM-GOV-089 SAM-GOV-099 SAM-SNG-011 SAM-SNG-012 SAM-SNG-014 SAM-SNG-016 SAM-SNG-021 SAM-SNG-036 SAM-SNG-059 SAM-SNG-061 SAM-SNG-067 SAM-SNG-068 SAM-SNG-070 SAM-SNG-073 SAM-SNG-074 SAM-SNG-076 SAM-SNG-084 SAM-SNG-085 SAM-SNG-086 SAM-SNG-087 SAM-SNG-088 SAM-SNG-095 SAM-SNG-096 SAM-SNG-097 SAM-SNG-098 SAM-SNG-099 prod_test_1 test1

QR-Codes for testing: SAM-GOV-001 SAM-GOV-008 SAM-GOV-013 SAM-GOV-034 SAM-GOV-045 SAM-GOV-052 SAM-GOV-090 SAM-SNG-013 SAM-SNG-015 SAM-SNG-066 SAM-SNG-072 SAM-SNG-075 SAM-SNG-081 SAM-SNG-083 SAM-SNG-091

Generating training data...


  1% (71 of 6000) |                      | Elapsed Time: 0:00:23 ETA:   0:33:45

# Saving.

In [12]:
datetime_string = datetime.datetime.now().strftime("%Y%m%d-%H%M")
dataset_name = datetime_string + "-" + dataset_parameters["input_type"] + "-dataset.p"
pickle.dump((dataset_train, dataset_test, dataset_parameters), open(dataset_name, "wb"))
print("Saved " + dataset_name)

Saved 20180727-1622-voxelgrid-dataset.p


# Loading and analyzing dataset.

In [6]:
#dataset_name = "20180717-1102-voxelgrid-dataset.p"
#dataset_name = "20180717-1236-pointcloud-dataset.p"

#print("Loading dataset...")
#(x_input_train, y_output_train), (x_input_test, y_output_test), dataset_parameters = pickle.load(open(dataset_name, "rb"))
#print("Done.")

Loading dataset...
Done.


In [7]:
#print("Training data input shape:", x_input_train.shape)
#print("Training data output shape:", y_output_train.shape)
#print("Testing data input shape:", x_input_test.shape)
#print("Testing data output shape:", y_output_test.shape)
#print("")

#print("Parameters:")
#print("\n".join([str(key) + ": " + str(value) for key, value in dataset_parameters.items()]))

Training data input shape: (3000,)
Training data output shape: (3000, 2)
Testing data input shape: (500,)
Testing data output shape: (500, 2)

Parameters:
input_type: pointcloud
output_targets: ['height', 'weight']
random_seed: 666
pointcloud_target_size: 30000
pointcloud_random_rotation: True
dataset_size_train: 3000
dataset_size_test: 500


In [None]:
#print(x_input_train[3].shape)