To set up this experiment, we need to:

1. Modify the current environment to include the endgoal keypoints.
2. Modify the current policy to include the endgoal keypoints.
3. Use the same dataset as the baseline, with the endgoal keypoints added to it. We exploit the caveat that the endgoal keypoints never change in the baseline dataset, so they can be appended as constants.
4. Train a new policy on the collected data.
5. Evaluate the new policy.

NOTE: the success rate is not expected to increase, because:
1. the endgoal keypoints are constant across the baseline dataset.
2. the baseline policy already achieves good results.

The experiment is relevant for evaluating whether the training process is affected by the privileged information.
-> We might expect an increase in training stability or speed, since the policy will have access to the endgoal keypoints.

Reasons the experiment might fail:
1. Behaviour cloning is not based on understanding the task, but on imitating the expert policy.

Reasons the experiment might succeed:
1. The policy will have access to the endgoal keypoints, which might help it understand the task better.


In [5]:
from pathlib import Path
import json
import numpy as np
import torch
import imageio
import gym_pusht  # noqa: F401
import gymnasium as gym
import matplotlib.pyplot as plt

# Import the eval_policy function from the lerobot scripts
from lerobot.scripts.eval import eval_policy
from lerobot.configs.train import TrainPipelineConfig
from lerobot.configs.default import DatasetConfig, EvalConfig
from lerobot.configs.types import PolicyFeature, FeatureType, NormalizationMode
from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy
from lerobot.common.envs.factory import make_env, make_env_config
from lerobot.common.policies.factory import make_policy
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

In [9]:
# Environment Configuration ################################################################
# Options for the PushT environment, grouped so the observation layout and the
# episode length cap are visible in one place.
pusht_env_kwargs = {
    "obs_type": "environment_state_agent_pos_privileged",
    "max_episode_steps": 300,
}
env = gym.make("gym_pusht/PushT-v0", **pusht_env_kwargs)

# Print both spaces so the feature shapes given to the policy can be checked against them.
print("Environment observation space:", env.observation_space)
print("Environment action space:", env.action_space)



Environment observation space: Dict('agent_pos': Box(0.0, 512.0, (2,), float64), 'environment_state': Box(0.0, 512.0, (32,), float64))
Environment action space: Box(0.0, 512.0, (2,), float32)


  logger.deprecation(


In [18]:
# Reset the environment with a fixed seed and list the keys of the returned
# observation and info dictionaries.
observation, info = env.reset(seed=4120312)
print(f"Observation: {observation.keys()}")
print(f"Info: {info.keys()}")

Observation: dict_keys(['environment_state', 'agent_pos'])
Info: dict_keys(['pos_agent', 'vel_agent', 'block_pose', 'goal_pose', 'n_contacts', 'is_success'])


In [70]:
# Use ONE repo id for both the dataset config and the loaded dataset. Previously the
# config pointed at the baseline "lerobot/pusht_keypoints" while the dataset actually
# loaded (and used to build the policy) was the expanded one, so anything consuming
# `dataset_config` would have seen a different dataset than the policy was built for.
dataset_repo_id = "the-future-dev/pusht_keypoints_expanded"

dataset_config = DatasetConfig(repo_id=dataset_repo_id)

dataset = LeRobotDataset(repo_id=dataset_repo_id)

Resolving data files:   0%|          | 0/206 [00:00<?, ?it/s]

DATASET REFACTORING:
<code>
# Source data for the refactoring: the baseline keypoints dataset. The config and the
# loaded dataset both refer to the same repository here.
dataset_config = DatasetConfig(repo_id="lerobot/pusht_keypoints")

dataset = LeRobotDataset(repo_id="lerobot/pusht_keypoints")

import torch
import numpy as np
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# Goal keypoints appended to every frame's environment state by default.
# NOTE(review): presumably the 8 (x, y) keypoints of the fixed PushT goal pose; the
# values are carried over unchanged from the original notebook -- confirm their origin.
_DEFAULT_GOAL_KEYPOINTS = (
    224.18019485, 266.60660172, 245.39339828, 287.81980515,
    181.75378798, 351.45941546, 160.54058454, 330.24621202,
    213.57359313, 213.57359313, 298.42640687, 298.42640687,
    277.21320344, 319.63961031, 192.36038969, 234.78679656,
)

_ENV_STATE_KEY = "observation.environment_state"

# Per-frame keys that are not copied from the source frame into the new frame.
_SKIPPED_FRAME_KEYS = frozenset(
    {"index", "episode_index", "frame_index", "task_index", "timestamp"}
)

# Features whose 0-d values are wrapped into shape-(1,) arrays before being added.
_SCALAR_FEATURES = ("next.reward", "next.success", "next.done")


def _to_float32_array(value):
    """Return *value* (tensor, array or sequence) as a float32 NumPy array."""
    if isinstance(value, torch.Tensor):
        value = value.detach().cpu().numpy()
    return np.asarray(value, dtype=np.float32)


def _wrap_scalar(value, dtype_name):
    """Wrap a 0-d *value* into a shape-(1,) array of dtype ``bool`` or ``float32``.

    Values that are not 0-d, or whose declared dtype is neither "bool" nor
    "float32", are returned unchanged.
    """
    if isinstance(value, (torch.Tensor, np.ndarray)):
        if value.ndim != 0:
            return value
        scalar = value.item()
    elif isinstance(value, (np.generic, int, float)):  # bool is a subclass of int
        scalar = value
    else:
        return value

    if dtype_name == "bool":
        return np.array([bool(scalar)], dtype=bool)
    if dtype_name == "float32":
        return np.array([scalar], dtype=np.float32)
    return value


def _build_privileged_frame(item, features, goal, original_dataset):
    """Build the frame to add to the new dataset from one source frame *item*."""
    new_frame = {k: v for k, v in item.items() if k not in _SKIPPED_FRAME_KEYS}

    # Recover the task string from the source frame's task_index when the frame does
    # not carry a "task" entry. The lookup must use `item`: task_index has already
    # been filtered out of `new_frame`.
    if "task" not in new_frame and "task_index" in item:
        task_idx = item["task_index"]
        if isinstance(task_idx, (torch.Tensor, np.ndarray)):
            task_idx = task_idx.item()
        new_frame["task"] = original_dataset.meta.tasks[task_idx]

    # Append the goal keypoints to the environment state.
    env_state = _to_float32_array(item[_ENV_STATE_KEY])
    new_frame[_ENV_STATE_KEY] = np.concatenate([env_state, goal])

    for key in _SCALAR_FEATURES:
        if key in new_frame:
            spec = features[key]
            dtype_name = spec["dtype"] if isinstance(spec, dict) else str(spec)
            new_frame[key] = _wrap_scalar(new_frame[key], dtype_name)

    return new_frame


def create_privileged_dataset(
    original_dataset,
    new_repo_id="the-future-dev/pusht_keypoints_expanded",
    goal_state=None,
):
    """Create a copy of *original_dataset* whose environment state includes the goal.

    Every frame's ``observation.environment_state`` is concatenated with a constant
    goal vector, episode by episode, so the episode structure is preserved. The new
    dataset is pushed to the hub and then reloaded from it.

    Args:
        original_dataset: Source ``LeRobotDataset`` to copy frames from.
        new_repo_id: Repository id used to create, push and reload the new dataset.
        goal_state: Optional 1-D sequence of goal values to append to each frame.
            Defaults to the 16 goal keypoint values used in the original experiment.

    Returns:
        The new dataset, reloaded from the hub.
    """
    # Built once: the goal is identical for every frame.
    goal = _to_float32_array(
        _DEFAULT_GOAL_KEYPOINTS if goal_state is None else goal_state
    )

    # Copy the feature spec and widen the environment state by the goal length
    # ((16,) -> (32,) with the default goal).
    features = original_dataset.features.copy()
    source_env_feature = features[_ENV_STATE_KEY]
    expanded_dim = int(np.prod(tuple(source_env_feature["shape"]))) + int(goal.size)
    features[_ENV_STATE_KEY] = {
        "dtype": "float32",
        "shape": (expanded_dim,),
        # NOTE(review): the names are kept from the source feature, so they describe
        # only the original entries, not the appended goal values -- confirm whether
        # the metadata should list names for the goal part as well.
        "names": source_env_feature["names"],
    }

    new_dataset = LeRobotDataset.create(
        repo_id=new_repo_id,
        fps=original_dataset.fps,
        robot_type=original_dataset.meta.robot_type,
        features=features,
    )

    # Episode boundaries of the source dataset ("from" inclusive, "to" exclusive as
    # used by the range below).
    episode_data_index = original_dataset.episode_data_index
    num_episodes = len(episode_data_index["from"])

    print(f"Copying and transforming data from {len(original_dataset)} frames across {num_episodes} episodes...")

    for ep_idx in range(num_episodes):
        if ep_idx % 10 == 0:  # Progress update every 10 episodes
            print(f"Processing episode {ep_idx}/{num_episodes}")

        from_idx = episode_data_index["from"][ep_idx].item()
        to_idx = episode_data_index["to"][ep_idx].item()

        # Start a fresh buffer for this episode.
        new_dataset.episode_buffer = new_dataset.create_episode_buffer(episode_index=ep_idx)

        for idx in range(from_idx, to_idx):
            new_frame = _build_privileged_frame(
                original_dataset[idx], features, goal, original_dataset
            )
            new_dataset.add_frame(new_frame)

        new_dataset.save_episode()

    # Push to hub and reload the dataset
    print("Pushing dataset to hub...")
    new_dataset.push_to_hub(new_repo_id)
    print("Reloading dataset from hub...")
    new_dataset = LeRobotDataset(repo_id=new_repo_id)  # Load from hub after pushing

    print(f"Created new dataset with {len(new_dataset)} frames across {new_dataset.num_episodes} episodes")
    return new_dataset

# Build the expanded dataset from the baseline dataset loaded earlier in this snippet.
privileged_dataset = create_privileged_dataset(dataset)

# Sanity check: compare the environment-state shape of the first frame before and
# after the transformation.
sample = privileged_dataset[0]
original_shape = dataset[0]["observation.environment_state"].shape
privileged_shape = sample["observation.environment_state"].shape
print("Original environment state shape:", original_shape)
print("Privileged environment state shape:", privileged_shape)
</code>

In [2]:
# Where evaluation artefacts are written; the videos folder sits inside the output folder.
output_directory = Path("../../outputs/eval/pusht_keypoints_privileged")
videos_dir = output_directory / "videos"
for directory in (output_directory, videos_dir):
    directory.mkdir(parents=True, exist_ok=True)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# Policy Configuration ################################################################
# Inputs: the 2-d agent state plus the 32-d environment state.
policy_input_features = {
    "observation.state": PolicyFeature(type=FeatureType.STATE, shape=(2,)),
    "observation.environment_state": PolicyFeature(type=FeatureType.ENV, shape=(32,)),
}

# Output: the 2-d action.
policy_output_features = {
    "action": PolicyFeature(type=FeatureType.ACTION, shape=(2,)),
}

# Min-max normalization for every numeric feature type; visual features pass through.
policy_normalization = {
    "STATE": NormalizationMode.MIN_MAX,
    "ENV": NormalizationMode.MIN_MAX,
    "ACTION": NormalizationMode.MIN_MAX,
    "VISUAL": NormalizationMode.IDENTITY,
}

policy_config = DiffusionConfig(
    # --- I/O structure ---
    n_obs_steps=2,
    horizon=16,
    n_action_steps=8,
    input_features=policy_input_features,
    output_features=policy_output_features,
    normalization_mapping=policy_normalization,
    # --- Architecture: state encoder (the U-Net keeps its defaults) ---
    state_backbone="MLP",
    state_encoder_block_channels=[64, 128],
    state_encoder_feature_dim=256,
    state_encoder_use_layernorm=True,
    # --- Noise scheduler ---
    noise_scheduler_type="DDPM",
    beta_schedule="squaredcos_cap_v2",
    beta_start=0.0001,
    beta_end=0.02,
    num_train_timesteps=100,
    prediction_type="epsilon",
    # --- Training hyperparameters ---
    # Not overridden here. Overrides kept for reference:
    #   optimizer_lr=1e-4, optimizer_betas=(0.95, 0.999), optimizer_eps=1e-8,
    #   optimizer_weight_decay=1e-6, scheduler_name="cosine", scheduler_warmup_steps=500
    device=device,
)


In [71]:
# Instantiate the policy from `policy_config`, passing the loaded dataset's metadata.
# NOTE(review): presumably make_policy uses `ds_meta` for feature/normalization
# information -- confirm against the lerobot factory.
policy = make_policy(policy_config, ds_meta=dataset.meta)
print(type(policy))

<class 'lerobot.common.policies.diffusion.modeling_diffusion.DiffusionPolicy'>


Ensure the initialized policy can play in the environment before starting training.


In [75]:
# SINGLE ENVIRONMENT ROLLOUT #################################################################
# Smoke test: run one episode with the freshly built policy and save it as a video.

# Inference should run in evaluation mode; nothing earlier in this notebook switches
# the policy out of training mode.
policy.eval()
# NOTE(review): reset() presumably clears state the policy caches between
# select_action calls -- confirm against the policy implementation.
policy.reset()
numpy_observation, info = env.reset(seed=42)

# Collect every reward and every rendered frame of the episode,
# from initial state to final state.
rewards = []
frames = []

# Render frame of the initial state
frames.append(env.render())

step = 0
done = False

while not done:
    # Convert the numpy observation into float32 tensors on the target device.
    state = torch.from_numpy(numpy_observation["agent_pos"])
    environment_state = torch.from_numpy(numpy_observation["environment_state"])

    state = state.to(torch.float32).to(device, non_blocking=True)
    environment_state = environment_state.to(torch.float32).to(device, non_blocking=True)

    # Add the batch dimension.
    state = state.unsqueeze(0)
    environment_state = environment_state.unsqueeze(0)

    policy_input = {
        "observation.state": state,
        "observation.environment_state": environment_state
    }

    with torch.inference_mode():
        action = policy.select_action(policy_input)

    # Drop the batch dimension and hand a CPU numpy action to the environment.
    numpy_action = action.squeeze(0).to("cpu").numpy()

    numpy_observation, reward, terminated, truncated, info = env.step(numpy_action)
    # print(f"{step=} {reward=} {terminated=}")

    rewards.append(reward)
    frames.append(env.render())

    # The episode ends when the environment terminates it or truncates it.
    done = bool(terminated or truncated)
    step += 1

# `terminated` is reported as success; ending by truncation is reported as failure.
if terminated:
    print("Success!")
else:
    print("Failure!")

# Get the speed of environment (i.e. its number of frames per second).
fps = env.metadata["render_fps"]

# Encode all frames into a mp4 video.
video_path = output_directory / "single_eval_rollout.mp4"
imageio.mimsave(str(video_path), np.stack(frames), fps=fps)

print(f"Video of the evaluation is available in '{video_path}'.")


Failure!




Video of the evaluation is available in '../../outputs/eval/pusht_keypoints_privileged/single_eval_rollout.mp4'.


In [76]:
# Drop this notebook's reference to the policy, then ask PyTorch to release its
# cached CUDA memory.
del policy
torch.cuda.empty_cache()

: 