In [16]:
import numpy as np
import torch
import os
import zarr
from tqdm import tqdm
from matplotlib import pyplot as plt

In [3]:
def create_sample_indices(
        episode_ends:np.ndarray, sequence_length:int,
        pad_before: int=0, pad_after: int=0):
    """Enumerate every window of ``sequence_length`` steps inside each episode.

    Episodes are stored back to back in one flat buffer; ``episode_ends[i]``
    is the one-past-the-last buffer index of episode ``i``. Windows never
    cross an episode boundary. With padding enabled a window may hang over
    the start (by up to ``pad_before`` steps) or the end (by up to
    ``pad_after`` steps) of its episode; the overhanging part is not backed
    by buffer data and has to be filled in by the consumer.

    Args:
        episode_ends: cumulative end index of every episode.
        sequence_length: number of steps in each window.
        pad_before: maximum number of steps a window may start before its
            episode begins.
        pad_after: maximum number of steps a window may extend past the end
            of its episode.

    Returns:
        Array of shape ``(num_windows, 4)`` whose rows are
        ``[buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx]``:
        the buffer slice to read, and the slice of the window it fills.
        When no window fits at all, an empty integer array of shape
        ``(0, 4)`` is returned so callers can still index columns.
    """
    indices = list()
    for i in range(len(episode_ends)):
        # an episode starts where the previous one ended (0 for the first)
        start_idx = 0
        if i > 0:
            start_idx = episode_ends[i-1]
        end_idx = episode_ends[i]
        episode_length = end_idx - start_idx

        # window start positions relative to the episode; may be negative
        min_start = -pad_before
        max_start = episode_length - sequence_length + pad_after

        # max_start is inclusive, hence the +1
        for idx in range(min_start, max_start+1):
            # clip the window to the part that actually exists in the buffer
            buffer_start_idx = max(idx, 0) + start_idx
            buffer_end_idx = min(idx+sequence_length, episode_length) + start_idx
            # how many steps were clipped off at either side
            start_offset = buffer_start_idx - (idx+start_idx)
            end_offset = (idx+sequence_length+start_idx) - buffer_end_idx
            sample_start_idx = 0 + start_offset
            sample_end_idx = sequence_length - end_offset
            indices.append([
                buffer_start_idx, buffer_end_idx,
                sample_start_idx, sample_end_idx])
    if not indices:
        # np.array([]) would be a float array of shape (0,); keep the
        # four-column integer layout even when there is nothing to sample
        return np.empty((0, 4), dtype=np.int64)
    indices = np.array(indices)
    return indices


def sample_sequence(train_data, sequence_length,
                    buffer_start_idx, buffer_end_idx,
                    sample_start_idx, sample_end_idx):
    """Cut one window out of every array in ``train_data``.

    For each key the rows ``buffer_start_idx:buffer_end_idx`` are read. If
    that slice already spans the whole window it is returned as is.
    Otherwise a new array of ``sequence_length`` rows is allocated, the
    slice is placed at ``sample_start_idx:sample_end_idx``, and the rows
    before / after it are filled by repeating the first / last row read.

    Returns:
        dict with the same keys as ``train_data``, one window per key.
    """
    pad_front = sample_start_idx > 0
    pad_back = sample_end_idx < sequence_length

    windows = dict()
    for name, array in train_data.items():
        chunk = array[buffer_start_idx:buffer_end_idx]
        if not (pad_front or pad_back):
            # the buffer slice covers the full window: no padding required
            windows[name] = chunk
            continue

        padded = np.zeros(
            shape=(sequence_length,) + array.shape[1:],
            dtype=array.dtype)
        if pad_front:
            # repeat the first available row in front of the real data
            padded[:sample_start_idx] = chunk[0]
        if pad_back:
            # repeat the last available row behind the real data
            padded[sample_end_idx:] = chunk[-1]
        padded[sample_start_idx:sample_end_idx] = chunk
        windows[name] = padded
    return windows

# normalize data
def get_data_stats(data):
    """Return the per-feature minimum and maximum of ``data``.

    All leading axes are flattened, so the statistics are taken over every
    row while the last axis (the features) is kept.

    Returns:
        dict with keys ``'min'`` and ``'max'``, each holding one value per
        feature.
    """
    flat = data.reshape(-1, data.shape[-1])
    return dict(
        min=np.min(flat, axis=0),
        max=np.max(flat, axis=0),
    )

def normalize_data(data, stats):
    """Rescale ``data`` feature-wise into the interval [-1, 1].

    Args:
        data: array whose last axis lines up with the entries of ``stats``.
        stats: dict with ``'min'`` and ``'max'`` entries, as produced by
            ``get_data_stats``.

    Returns:
        Array of the same shape as ``data``; ``stats['min']`` maps to -1 and
        ``stats['max']`` maps to 1. A feature whose min equals its max has
        no range to scale by and is mapped to -1 instead of NaN;
        ``unnormalize_data`` still returns ``stats['min']`` for it.
    """
    value_range = stats['max'] - stats['min']
    # a constant feature has zero range; dividing by it would yield NaN/inf
    value_range = np.where(value_range == 0, 1, value_range)
    # normalize to [0,1]
    ndata = (data - stats['min']) / value_range
    # normalize to [-1, 1]
    ndata = ndata * 2 - 1
    return ndata

def unnormalize_data(ndata, stats):
    """Map values from [-1, 1] back to the range given by ``stats``.

    Inverse of ``normalize_data``: -1 becomes ``stats['min']`` and 1 becomes
    ``stats['max']``.
    """
    # [-1, 1] -> [0, 1]
    unit = (ndata + 1) / 2
    # [0, 1] -> [min, max]
    span = stats['max'] - stats['min']
    return unit * span + stats['min']

# dataset
class CuroboStateDataset(torch.utils.data.Dataset):
    """Windows of normalized observations and actions read from a zarr store.

    The store is expected to contain ``data/action``, ``data/state`` and
    ``meta/episode_ends``. Every item is a dict with the keys ``'obs'`` and
    ``'action'``, cut from a single episode and padded at the episode
    borders by repeating the first / last step.
    """

    def __init__(self, dataset_path,
                 pred_horizon, obs_horizon, action_horizon):
        """Load the store at ``dataset_path`` and precompute window indices.

        Args:
            dataset_path: location of the zarr store to read.
            pred_horizon: number of steps in every sampled window.
            obs_horizon: number of leading observation steps kept per item.
            action_horizon: used together with ``obs_horizon`` to decide how
                far windows may hang over the episode borders.
        """
        # read from zarr dataset; the path argument is the single source of
        # the data, nothing is taken from notebook globals
        dataset_root = zarr.open(dataset_path, mode='r')

        # All demonstration episodes are concatenated in the first dimension N
        train_data = {
            # (N, action_dim)
            'action': dataset_root['data']['action'][:],
            # (N, obs_dim)
            'obs': dataset_root['data']['state'][:]
        }
        # Marks one-past the last index for each episode
        episode_ends = dataset_root['meta']['episode_ends'][:]

        # compute start and end of each state-action sequence
        # also handles padding
        indices = create_sample_indices(
            episode_ends=episode_ends,
            sequence_length=pred_horizon,
            # add padding such that each timestep in the dataset is seen
            pad_before=obs_horizon-1,
            pad_after=action_horizon-1)

        # compute statistics and normalize data to [-1,1]
        stats = dict()
        normalized_train_data = dict()
        for key, data in train_data.items():
            stats[key] = get_data_stats(data)
            normalized_train_data[key] = normalize_data(data, stats[key])

        self.indices = indices
        self.stats = stats
        self.normalized_train_data = normalized_train_data
        self.pred_horizon = pred_horizon
        self.action_horizon = action_horizon
        self.obs_horizon = obs_horizon

    def __len__(self):
        """Number of windows that can be sampled from the dataset."""
        # all possible segments of the dataset
        return len(self.indices)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as a dict with ``'obs'`` and ``'action'``.

        ``'action'`` spans ``pred_horizon`` steps; ``'obs'`` is cut down to
        its first ``obs_horizon`` steps.
        """
        # get the start/end indices for this datapoint
        buffer_start_idx, buffer_end_idx, \
            sample_start_idx, sample_end_idx = self.indices[idx]

        # get normalized data using these indices
        nsample = sample_sequence(
            train_data=self.normalized_train_data,
            sequence_length=self.pred_horizon,
            buffer_start_idx=buffer_start_idx,
            buffer_end_idx=buffer_end_idx,
            sample_start_idx=sample_start_idx,
            sample_end_idx=sample_end_idx
        )

        # discard unused observations
        nsample['obs'] = nsample['obs'][:self.obs_horizon,:]
        return nsample

In [22]:
# Convert the logged plans found in `data_folder` into one zarr store with the
# layout data/action, data/state and meta/episode_ends.
SUBSAMPLE_STRIDE = 10  # keep every 10th logged step of a plan

# read from folder
data_folder  = os.path.join(os.path.abspath('../interactpolicy/ipolicy/'), 'data', 'logged_plans_03')
data_files = sorted(os.listdir(data_folder))
dataset_root = dict()  # NOTE(review): never filled or read in this cell
episode_end = 0
episode_ends = []
action_list = []
state_list = []

for file in tqdm(data_files):
    # text files in the folder are not plans
    if '.txt' in file:
        continue
    plan = np.load(os.path.join(data_folder, file))

    # select every 10th step, making sure the last step is included exactly
    # once: it is appended only when the stride does not already land on it
    keep_idx = np.arange(0, len(plan), SUBSAMPLE_STRIDE)
    if keep_idx[-1] != len(plan) - 1:
        keep_idx = np.append(keep_idx, len(plan) - 1)
    plan = plan[keep_idx]
    # the action at step t is taken from step t+1; the final step repeats itself
    next_plan = np.concatenate([plan[1:], plan[[-1]]], axis=0)

    action = np.concatenate([next_plan[:, 10:13], next_plan[:, 8:10]], axis=1) # end effector position and gripper state
    state = np.concatenate([plan[:, 10:13], plan[:, 8:10], plan[:, 13:16], plan[:, 20:23], plan[:, 27:30], plan[:, 34:37],
                            plan[:, 41:44], plan[:, 48:51]], axis=1) # ee pos, gripper, red, green, blue, yellow, cyan, magenta
    action_list.append(action)
    state_list.append(state)
    # episode_ends stores the cumulative one-past-the-end index of each episode
    episode_end += len(plan)
    episode_ends.append(episode_end)

# fail before mode='w' below replaces an existing store with an empty one
if not action_list:
    raise ValueError(f'no plan files found in {data_folder}')

data_root = zarr.open_group('data/logged_plans_03.zarr', mode='w')
data = data_root.create_group('data')
data.create_dataset('action', data=np.concatenate(action_list, axis=0))
data.create_dataset('state', data=np.concatenate(state_list, axis=0))
meta = data_root.create_group('meta')
meta.create_dataset('episode_ends', data=np.array(episode_ends))


100%|██████████| 2622/2622 [00:00<00:00, 7920.67it/s]


AttributeError: 

AttributeError: 'Tree' object has no attribute '_ipython_display_'

/
 ├── data
 │   ├── action (395283, 5) float64
 │   └── state (395283, 23) float64
 └── meta
     └── episode_ends (2621,) int64

In [11]:
mkdir get_data_stat

conda_environment_macos.yaml  eval_real_robot.py          ray_train_multirun.py
conda_environment_real.yaml   LICENSE                     README.md
conda_environment.yaml        [0m[01;34mmedia[0m/                      setup.py
demo_pusht.py                 Miniforge3-Linux-x86_64.sh  state_dp.ipynb
demo_real_robot.py            multirun_metrics.py         [01;34mtests[0m/
[01;34mdiffusion_policy[0m/             pyrightconfig.json          train.py
eval.py                       ray_exec.py
