In [6]:
# Data processing
import h5py
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm, trange

def _write_split(source, split_dir: Path, name: str, indices) -> None:
    """
    Copies the rows of `source` selected by `indices` into `<split_dir>/<name>.hdf5`.

    :param source: The open input h5py file to copy rows from
    :param split_dir Path: The directory for this split (created if missing)
    :param name str: The split name, used as the output file's stem
    :param indices: The (sorted) row indices of `source` that make up this split
    """
    split_dir.mkdir(parents=True, exist_ok=True)
    # "w-" refuses to overwrite an existing file, so an earlier split is never
    # silently replaced (re-running requires removing the old file first).
    with h5py.File(str(split_dir / f"{name}.hdf5"), "w-") as target:
        # NOTE(review): no dtype is passed, so h5py's `create_dataset` default
        # (single-precision float per its docs) is used rather than the source
        # dataset's dtype -- confirm this is intended for every key.
        pairs = [
            (
                source[key],
                target.create_dataset(key, (len(indices), *source[key].shape[1:])),
            )
            for key in source.keys()
        ]
        # Dataset handles are looked up once above instead of once per row.
        for row, source_row in enumerate(tqdm(indices)):
            for src, dst in pairs:
                dst[row, ...] = src[source_row, ...]


def downsize_and_split(
    input_file: str,
    output_dir: str,
    train_size: int,
    val_size: int,
    test_size: int,
    seed: "int | None" = None,
):
    """
    This function is meant to be used to regularize the sizes of individual problem types
    (e.g. merged cubbies without neutral poses). Use this function on the output of either
    `merge_data_pipeline_files` or `extract_hybrid_expert_data` (depending on whether you
    want all the problems with global expert solutions or the subset that have hybrid expert solutions
    as well). This function will create three datasets, a train, val, and test dataset.
    If any of the sizes are set to 0, it will ignore that dataset

    The rows are sampled without replacement, so the three splits are disjoint. Each
    split is written to `<output_dir>/<split>/<split>.hdf5` and the call fails if that
    file already exists.

    :param input_file str: The input file, should come from one of the functions described above
    :param output_dir str: The output directory (missing directories are created)
    :param train_size int: The size of the training dataset
    :param val_size int: The size of the validation dataset
    :param test_size int: The size of the test dataset
    :param seed int: Optional seed for a reproducible split. When omitted, numpy's
                     global random state is used (the previous behavior)
    :raises AssertionError: If a size is negative or the sizes sum to more samples
                            than the input file holds
    """
    assert (
        min(train_size, val_size, test_size) >= 0
    ), "Split sizes must be non-negative"
    total_size = train_size + val_size + test_size

    with h5py.File(input_file, "r") as f:
        num_available = len(f["cuboid_centers"])
        # `<=` rather than `<`: using every available sample is a valid request.
        assert (
            total_size <= num_available
        ), f"Requested {total_size} samples but the input only has {num_available}"

        # Fall back to numpy's global state when no seed is given so that callers
        # relying on `np.random.seed` keep working.
        rng = np.random if seed is None else np.random.default_rng(seed)
        indices = rng.choice(
            np.arange(num_available),
            size=total_size,
            replace=False,
        )
        # Each split's indices are sorted so rows are read in file order.
        train_indices, val_indices, test_indices = (
            np.sort(indices[:train_size]),
            np.sort(indices[train_size : train_size + val_size]),
            np.sort(indices[train_size + val_size :]),
        )

        assert (
            len(train_indices) + len(val_indices) + len(test_indices) == total_size
        )

        path = Path(output_dir).resolve()

        # Written in the same order as before: val, test, then train.
        splits = (
            ("val", val_size, val_indices),
            ("test", test_size, test_indices),
            ("train", train_size, train_indices),
        )
        for name, size, split_indices in splits:
            if size > 0:
                _write_split(f, path / name, name, split_indices)

In [7]:
input_file = "../data/cubby/neutral/curobo/all_data.hdf5"

# Quick look at the raw file: sample count, available keys, and how long each
# stored `global_solutions` trajectory is.
with h5py.File(input_file) as f:
    # Sample count is taken from the `cuboid_centers` dataset
    num_samples = len(f["cuboid_centers"])
    print(f"Number of samples in the dataset: {num_samples}")

    # Names of everything stored at the top level of the file
    dataset_keys = list(f.keys())
    print(f"Keys in the dataset: {dataset_keys}")

    # Second axis of `global_solutions` is reported as the trajectory length
    global_solutions = f["global_solutions"]
    trajectory_length = global_solutions.shape[1]
    print(f"Trajectory length of global_solutions: {trajectory_length}")

Number of samples in the dataset: 10552
Keys in the dataset: ['cuboid_centers', 'cuboid_dims', 'cuboid_quaternions', 'cylinder_centers', 'cylinder_heights', 'cylinder_quaternions', 'cylinder_radii', 'global_solutions']
Trajectory length of global_solutions: 50


In [8]:
# Build a 6000-sample train split and a 1000-sample val split (no test split).
# The output files are opened in "w-" mode, so this cell fails if they already exist.
split_sizes = {"train_size": 6000, "val_size": 1000, "test_size": 0}
downsize_and_split(
    input_file="../data/cubby/neutral/curobo/all_data.hdf5",
    output_dir="pretrain_data",
    **split_sizes,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

In [5]:
# Test loading the data
from data_loader import DataModule

# DataModule configuration
data_dir = "pretrain_data"  # same directory passed as `output_dir` to downsize_and_split
trajectory_key = "global_solutions"  # dataset key holding the trajectories
num_robot_points = 2048
num_obstacle_points = 4096
random_scale = 0  # For MpiNet, it's 0.015
batch_size = 32

# Bundle the settings so the constructor call stays short
datamodule_kwargs = dict(
    data_dir=data_dir,
    trajectory_key=trajectory_key,
    num_robot_points=num_robot_points,
    num_obstacle_points=num_obstacle_points,
    random_scale=random_scale,
    batch_size=batch_size,
)
data_module = DataModule(**datamodule_kwargs)

# Prepare the "fit" stage, then grab the train and validation loaders
data_module.setup(stage="fit")
train_loader, val_loader = (
    data_module.train_dataloader(),
    data_module.val_dataloader(),
)

Databases found: [PosixPath('pretrain_data/train/train.hdf5')]
Databases found: [PosixPath('pretrain_data/val/val.hdf5')]


In [6]:
import torch
# Report how many batches each loader yields
print(f"Number of batches in train loader: {len(train_loader)}")
print(f"Number of batches in validation loader: {len(val_loader)}")


# Check whether there is any NaN in the "xyz" entry of the validation data.
# `any` stops at the first batch that contains a NaN, so later batches are not loaded.
nan_in_validation = any(
    torch.isnan(val_batch["xyz"]).any().item() for val_batch in val_loader
)
if nan_in_validation:
    print("NaN found in validation data!")
else:
    print("No NaN found in validation data.")

Number of batches in train loader: 7813
Number of batches in validation loader: 63
No NaN found in validation data.
