# Dataset-Generator.

In [None]:
import os
from datagenerator import DataGenerator
import pickle
import random
import utils

In [None]:
import multiprocessing
# Number of jobs to use for dataset generation: one per CPU reported by
# multiprocessing.cpu_count(). The value is handed to the data-generator's
# generate() call as its multiprocessing_jobs argument.
multiprocessing_jobs = multiprocessing.cpu_count()
print("Going to spawn", multiprocessing_jobs, "jobs...")

# Setting the parameters.

In [None]:
# Resolve the dataset location: use the path stored in datasetpath.txt when
# that file exists, otherwise fall back to the default "../data".
if os.path.exists("datasetpath.txt"):
    # The context manager closes the file handle as soon as it has been read
    # (previously the handle was left open until garbage collection).
    with open("datasetpath.txt", "r") as path_file:
        # Newlines are removed so a trailing line break does not end up in the path.
        dataset_path = path_file.read().replace("\n", "")
else:
    dataset_path = "../data"

# Parameters for creating the voxelgrid dataset.
dataset_parameters_voxelgrids = {
    "input_type": "voxelgrid",
    "output_targets": ["height", "weight"],
    # Seeds the shuffle that precedes the train/test split.
    "random_seed": 666,
    "voxelgrid_target_shape": (32, 32, 32),
    "voxel_size_meters": 0.1,
    "voxelgrid_random_rotation": True,
    # Sizes requested from the data-generator for each split.
    "dataset_size_train": 6000,
    "dataset_size_test": 1000,
}

# Parameters for creating the pointcloud dataset.
dataset_parameters_pointclouds = {
    "input_type": "pointcloud",
    "output_targets": ["height", "weight"],
    # Seeds the shuffle that precedes the train/test split.
    "random_seed": 666,
    "pointcloud_target_size": 30000,
    "pointcloud_random_rotation": True,
    # Sizes requested from the data-generator for each split.
    "dataset_size_train": 3000,
    "dataset_size_test": 500,
}

# Creating the data-generator.
Builds a `DataGenerator` from one of the parameter dictionaries defined above.

In [None]:
def create_datagenerator(dataset_parameters):
    """Build a DataGenerator from a parameter dictionary and print its statistics.

    Args:
        dataset_parameters: Dictionary that must contain "input_type" and
            "output_targets". The voxelgrid- and pointcloud-specific keys are
            optional; any that are missing are passed on as None.

    Returns:
        The newly created DataGenerator, after print_statistics() was called.

    Raises:
        KeyError: If "input_type" or "output_targets" is missing.
    """
    print("Creating data-generator...")

    # Arguments that every configuration has to provide.
    required_arguments = {
        "dataset_path": dataset_path,
        "input_type": dataset_parameters["input_type"],
        "output_targets": dataset_parameters["output_targets"],
    }

    # Arguments that only apply to one input type; absent keys become None.
    optional_keys = (
        "voxelgrid_target_shape",
        "voxel_size_meters",
        "voxelgrid_random_rotation",
        "pointcloud_target_size",
        "pointcloud_random_rotation",
    )
    optional_arguments = {key: dataset_parameters.get(key) for key in optional_keys}

    generator = DataGenerator(**required_arguments, **optional_arguments)
    generator.print_statistics()
    return generator

# Analysis.

In [None]:
def analyze(datagenerator):
    """Run the data-generator's analysis routines one after another.

    The routines are invoked in this order: files, targets, pointclouds,
    voxelgrids.

    Args:
        datagenerator: Object providing analyze_files(), analyze_targets(),
            analyze_pointclouds() and analyze_voxelgrids().
    """
    analysis_steps = (
        "analyze_files",
        "analyze_targets",
        "analyze_pointclouds",
        "analyze_voxelgrids",
    )
    for step_name in analysis_steps:
        # Look the method up right before calling it, one step at a time.
        getattr(datagenerator, step_name)()

# Do the train-test-split and generate.

In [None]:
def split_and_generate(datagenerator, dataset_parameters):
    """Split the QR-codes into train and test sets and generate both datasets.

    The QR-codes are shuffled after seeding the global random module with
    dataset_parameters["random_seed"]; the first 80% form the training set
    and the remainder the test set. Both lists are sorted and printed.

    Args:
        datagenerator: Object exposing a sliceable `qrcodes` attribute and a
            generate() method.
        dataset_parameters: Dictionary providing "random_seed",
            "dataset_size_train" and "dataset_size_test".

    Returns:
        A tuple (dataset_train, dataset_test), each being the first item
        obtained from the corresponding generate() call.
    """

    def generate_subset(size_key, qrcodes):
        # Only the first item produced by generate() is used.
        batches = datagenerator.generate(
            size=dataset_parameters[size_key],
            qrcodes_to_use=qrcodes,
            verbose=True,
            multiprocessing_jobs=multiprocessing_jobs
        )
        return next(batches)

    # Do the split. Seeding first makes the shuffle repeatable for a given seed.
    random.seed(dataset_parameters["random_seed"])
    shuffled_qrcodes = datagenerator.qrcodes[:]
    random.shuffle(shuffled_qrcodes)
    cut = int(0.8 * len(shuffled_qrcodes))
    qrcodes_train = sorted(shuffled_qrcodes[:cut])
    qrcodes_test = sorted(shuffled_qrcodes[cut:])
    del shuffled_qrcodes
    print("")

    # Report which QR-codes ended up in which subset.
    print("QR-Codes for training:", " ".join(qrcodes_train))
    print("")
    print("QR-Codes for testing:", " ".join(qrcodes_test))
    print("")

    print("Generating training data...")
    dataset_train = generate_subset("dataset_size_train", qrcodes_train)

    print("Generating testing data...")
    dataset_test = generate_subset("dataset_size_test", qrcodes_test)

    print("Done.")
    return dataset_train, dataset_test

# Method for saving dataset.

In [None]:
def save_dataset(dataset_train, dataset_test, dataset_parameters):
    """Pickle the train/test datasets together with their parameters.

    The three objects are stored as one tuple
    (dataset_train, dataset_test, dataset_parameters) in a file named
    "<datetime>-<input_type>-dataset.p", where <datetime> is the string
    returned by utils.get_datetime_string().

    Args:
        dataset_train: Training dataset to store.
        dataset_test: Testing dataset to store.
        dataset_parameters: Parameter dictionary; its "input_type" entry is
            used in the file name.
    """
    print("Saving dataset...")
    datetime_string = utils.get_datetime_string()
    dataset_name = datetime_string + "-" + dataset_parameters["input_type"] + "-dataset.p"

    # Use a context manager so the file is flushed and closed even if
    # pickling fails; previously the handle was never closed explicitly.
    with open(dataset_name, "wb") as dataset_file:
        pickle.dump((dataset_train, dataset_test, dataset_parameters), dataset_file)

    print("Saved " + dataset_name)

# Generate with parameters.

In [None]:
# Parameter sets to process, in order: pointclouds first, then voxelgrids.
dataset_parameters_to_use = [
    dataset_parameters_pointclouds,
    dataset_parameters_voxelgrids,
]

# For every parameter set: build the generator, split and generate, then save.
for dataset_parameters in dataset_parameters_to_use:
    datagenerator = create_datagenerator(dataset_parameters)
    # Analysis is disabled; uncomment the next line to run it.
    #analyze(datagenerator)
    dataset_train, dataset_test = split_and_generate(datagenerator, dataset_parameters)
    save_dataset(dataset_train, dataset_test, dataset_parameters)
    