In [None]:
import numpy as np
np.bool = np.bool_ # bad trick to fix numpy version issue :(
import os
import sys
from natsort import natsorted

# Drop every module search-path entry that contains '/peract/'
# (presumably so another PerAct checkout cannot shadow the modules
# imported below -- TODO confirm).
sys.path = list(filter(lambda entry: '/peract/' not in entry, sys.path))

# Rendering / GPU environment variables, set in one place.
os.environ.update({
    "DISPLAY": ":0",
    "PYOPENGL_PLATFORM": "egl",  # `PYOPENGL_PLATFORM=egl` for pyrender visualizations
    "CUDA_VISIBLE_DEVICES": "0,3",  # Depends on your computer and available GPUs
})

In [None]:
### Depending on your workspace, you may already have this repository installed; otherwise clone it.
# The check looks for a `peract_colab` folder inside the current working directory.
if not os.path.exists(os.path.join(os.getcwd(), 'peract_colab')):
    !git clone https://github.com/yuki1003/peract_colab.git

# Runs on every execution (also right after a fresh clone) to pull `master` from `origin`.
!cd peract_colab && git pull origin master

In [None]:
# Probe whether the modules shipped with the peract_colab repository are importable.
try:  # test import
    from rlbench.utils import get_stored_demo
except ImportError as import_error:
    # Not importable yet: report the reason, then add the checkout to the
    # module search path. The entry is relative, so it resolves against the
    # current working directory.
    print(import_error, "Adding peract_colab repository to system path.", sep="\n")
    sys.path.append('peract_colab')

In [None]:
import shutil

import torch
import clip

from arm.replay_buffer import create_replay, fill_replay, uniform_fill_replay, fill_replay_copy_with_crop_from_approach
from yarr.replay_buffer.wrappers.pytorch_replay_buffer import PyTorchReplayBuffer

In [None]:
## STATIC VALUES USED IN BELOW FUNCTION: SETTING THEM AS GLOBAL FOR FURTHER USE

#___DATA___
TASK = 'handing_over_banana'

# Data Constants
WORKSPACE_DIR = os.getcwd()
DATA_FOLDER = os.path.join(WORKSPACE_DIR, "task_data", "handoversim")
EPISODES_FOLDER = os.path.join(TASK, "all_variations", "episodes")

EPISODE_FOLDER = 'episode%d'
SETUP = "s1" # Options: "s1"
MAX_EPISODES_PER_SPLIT = 3  # Only the lowest-numbered N episodes of each split are used


def _list_episode_indexes(episodes_dir, limit=MAX_EPISODES_PER_SPLIT):
    """Return the lowest `limit` episode numbers found in `episodes_dir`.

    Only entries named like `EPISODE_FOLDER` ("episode<number>") are
    considered; anything else in the directory (hidden files, notes, ...)
    is skipped instead of crashing the integer conversion.

    Args:
        episodes_dir: Directory that holds one sub-folder per episode.
        limit: Maximum number of episode indexes to return.

    Returns:
        Episode numbers as a list of ints, in ascending order.

    Raises:
        FileNotFoundError: If `episodes_dir` does not exist.
    """
    prefix = EPISODE_FOLDER.split('%')[0]  # -> "episode"
    indexes = sorted(
        int(entry[len(prefix):])
        for entry in os.listdir(episodes_dir)
        if entry.startswith(prefix) and entry[len(prefix):].isdecimal()
    )
    return indexes[:limit]


train_data_path = os.path.join(DATA_FOLDER, f"train_{SETUP}", EPISODES_FOLDER)
TRAIN_INDEXES = _list_episode_indexes(train_data_path)
test_data_path = os.path.join(DATA_FOLDER, f"val_{SETUP}", EPISODES_FOLDER)
TEST_INDEXES = _list_episode_indexes(test_data_path)

# Same summary format for both splits: count first, then the indexes themselves.
print(f"TRAIN | Total #: {len(TRAIN_INDEXES)}, indices: {TRAIN_INDEXES}")
print(f"TEST | Total #: {len(TEST_INDEXES)}, indices: {TEST_INDEXES}")

# Replaybuffer related constants
LOW_DIM_SIZE = 4  # proprioception vector: {gripper_open, left_finger_joint, right_finger_joint, timestep}
IMAGE_SIZE = 128  # 128x128 images; for higher voxel resolutions (e.g. 200^3) consider regenerating the dataset with larger images
DEMO_AUGMENTATION_EVERY_N = 10  # Only every n-th demo frame is used for the replay buffer
ROTATION_RESOLUTION = 5  # degree increments per axis
TARGET_OBJ_KEYPOINTS = False  # Real - (changed later)
TARGET_OBJ_USE_LAST_KP = False  # Real - (changed later)
TARGET_OBJ_IS_AVAIL = True  # HandoverSim - (changed later)

DEPTH_SCALE = 1000  # forwarded as `depth_scale` to the fill functions (presumably depth units per metre -- TODO confirm)
STOPPING_DELTA = 0.001  # forwarded as `stopping_delta` to the fill functions
# Scene extent; the first three values are the lower corner and each axis must span exactly 1m.
SCENE_BOUNDS = [0.11, -0.5, 0.8, 1.11, 0.5, 1.8]

# Training Settings Constants
BATCH_SIZE = 1
VOXEL_SIZES = [100]  # 100x100x100 voxels

In [None]:
# Accepted (case-insensitive) values of settings['fill_replay_setting'].
_FILL_REPLAY_SETTINGS = ("uniform", "crop", "standard")


def _reset_directory(path):
    """Make sure `path` exists and is empty, deleting any previous content."""
    if os.path.exists(path):
        print(f"Emptying {path}")
        shutil.rmtree(path)
    else:
        print(f"Could not find {path}, creating directory.")
    os.makedirs(path, exist_ok=True)


def _select_fill_function(fill_key):
    """Return the `arm.replay_buffer` fill function for an already validated, lower-cased key."""
    return {
        "uniform": uniform_fill_replay,
        "crop": fill_replay_copy_with_crop_from_approach,
        "standard": fill_replay,
    }[fill_key]


def _create_replay_buffer(storage_dir, cameras):
    """Create one replay buffer stored in `storage_dir` using the notebook-wide constants."""
    return create_replay(batch_size=BATCH_SIZE,
                         timesteps=1,
                         save_dir=storage_dir,
                         cameras=cameras,
                         voxel_sizes=VOXEL_SIZES,
                         image_size=IMAGE_SIZE,
                         low_dim_size=LOW_DIM_SIZE)


def inspect_peract_agent(settings):
    """Build, fill and wrap the train and test replay buffers for one configuration.

    WARNING: the folders `replay_train` and `replay_test` inside
    `WORKSPACE_DIR` are deleted and recreated on every call.

    Args:
        settings: Mapping with the keys
            'fill_replay_setting' -- "uniform", "crop" or "standard" (case-insensitive),
            'cameras'             -- list of camera names to load,
            'keypoint_approach'   -- forwarded as `use_approach` to the fill function.

    Returns:
        Tuple `(train_data_iter, test_data_iter)`: iterators over the datasets
        produced by wrapping each replay buffer in `PyTorchReplayBuffer`.

    Raises:
        ValueError: If 'fill_replay_setting' is not one of the supported values.
            This is checked first, before any directory is wiped or CLIP is loaded.
    """
    # BATCH SETTINGS
    fill_replay_setting = settings['fill_replay_setting']
    cameras = settings['cameras']
    use_approach = settings['keypoint_approach']

    # Fail fast: an invalid setting must not cost a wiped buffer and a CLIP load.
    fill_key = fill_replay_setting.lower()
    if fill_key not in _FILL_REPLAY_SETTINGS:
        raise ValueError(
            f"Unknown setting for fill replay buffer: {fill_replay_setting!r} "
            f"(expected one of {_FILL_REPLAY_SETTINGS})")
    fill_function = _select_fill_function(fill_key)

    # Summary of run properties
    print("\nExperiment Setup")
    print(f"Task: {TASK} - SETUP: {SETUP} - Cameras: {len(cameras)}")
    print("Run Properties")
    print(f"Fill replay setting: {fill_replay_setting}")

    #___REPLAY-BUFFER___
    train_replay_storage_dir = os.path.join(WORKSPACE_DIR, 'replay_train')
    test_replay_storage_dir = os.path.join(WORKSPACE_DIR, 'replay_test')
    for storage_dir in (train_replay_storage_dir, test_replay_storage_dir):
        _reset_directory(storage_dir)

    train_replay_buffer = _create_replay_buffer(train_replay_storage_dir, cameras)
    test_replay_buffer = _create_replay_buffer(test_replay_storage_dir, cameras)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # CLIP-ResNet50; the preprocessing transform returned alongside is not needed here.
    clip_model, _ = clip.load("RN50", device=device)

    # Both splits are filled with exactly the same options; only the source
    # data, the episode indexes and the target buffer differ.
    splits = (
        ("Train", train_replay_buffer, train_data_path, TRAIN_INDEXES),
        ("Test", test_replay_buffer, test_data_path, TEST_INDEXES),
    )
    for split_name, replay, data_path, d_indexes in splits:
        print(f"-- {split_name} Buffer --")
        fill_function(
            data_path=data_path,
            episode_folder=EPISODE_FOLDER,
            replay=replay,
            d_indexes=d_indexes,
            demo_augmentation=True,
            demo_augmentation_every_n=DEMO_AUGMENTATION_EVERY_N,
            cameras=cameras,
            rlbench_scene_bounds=SCENE_BOUNDS,
            voxel_sizes=VOXEL_SIZES,
            rotation_resolution=ROTATION_RESOLUTION,
            crop_augmentation=False,
            depth_scale=DEPTH_SCALE,
            use_approach=use_approach,
            approach_distance=0.3,
            stopping_delta=STOPPING_DELTA,
            target_obj_keypoint=TARGET_OBJ_KEYPOINTS,
            target_obj_use_last_kp=TARGET_OBJ_USE_LAST_KP,
            target_obj_is_avail=TARGET_OBJ_IS_AVAIL,
            clip_model=clip_model,
            device=device,
        )

    # delete the CLIP model since we have already extracted language features
    del clip_model

    # wrap buffer with PyTorch dataset and make iterator
    train_data_iter = iter(PyTorchReplayBuffer(train_replay_buffer).dataset())
    test_data_iter = iter(PyTorchReplayBuffer(test_replay_buffer).dataset())

    return train_data_iter, test_data_iter

In [None]:
import itertools

available_cameras = [f"view_{view_idx}" for view_idx in range(3)]

# Grid search: every key maps to the list of values to try. To sweep more
# configurations, list several values per key, e.g.
# 'keypoint_approach': [True, False].
grid = {
    'fill_replay_setting': ["crop"],
    'cameras': [available_cameras],
    'RGB_AUGMENTATION': ['None'],
    'keypoint_approach': [True]
}

# Expand the grid into one settings dict per combination and list them with
# their index so a configuration can be picked by number.
lst_settings = []
for counter, values in enumerate(itertools.product(*grid.values())):
    settings = dict(zip(grid.keys(), values))
    lst_settings.append(settings)
    print(counter, settings)

In [None]:
# Index into `lst_settings` of the configuration to inspect.
CHOSEN_SETTING = 0
settings = lst_settings[CHOSEN_SETTING]

# Prefer the GPU when one is visible; `device` is reused by the cells below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Recreates and fills both replay buffers, then returns one iterator per split.
train_data_iter, test_data_iter = inspect_peract_agent(settings)

In [None]:
from matplotlib import pyplot as plt

import torch
import torchvision.transforms as T

from agent.utils import _preprocess_inputs
from agent.voxel_grid import VoxelGrid
from arm.utils import get_gripper_render_pose, visualise_voxel_video

## First find analysis
# Episode whose samples are visualised.
# NOTE(review): only episodes listed in TRAIN_INDEXES are in the buffer -- confirm this one is among them.
analyzed_episode = 46

# Draw a fixed number of batches and remember every distinct sample label
# (and its frame field) that belongs to the analysed episode.
lang_goals, frames_seen = set(), set()
for _ in range(1000):
    # sample from dataset
    batch = next(train_data_iter)
    lang_goal = batch['lang_goal'][0][0][0]
    # The label is split into exactly four '-'-separated fields.
    _task_token, episode_token, frame_token, _keypoint_token = lang_goal.split('-')
    if int(episode_token.replace('episode_', '')) != analyzed_episode:
        continue
    lang_goals.add(lang_goal)
    frames_seen.add(frame_token)

# Natural ordering so numbered fields sort numerically rather than lexically.
replay_buffer = natsorted(lang_goals)
all_frames = natsorted(frames_seen)

print(replay_buffer, all_frames)

In [None]:
import cv2

# Directory to store the frames
output_dir = "temp_frames"
os.makedirs(output_dir, exist_ok=True)

# Video parameters
fps = 10
print(settings)
output_video = f"episode_{analyzed_episode}-uniform_{settings['fill_replay_setting']}-approach_{settings['keypoint_approach']}.mp4"
print(output_video)
video_writer = None
frames_written = 0

# One entry per subplot, in plotting order:
# (rotation in degrees, extra keyword arguments for the renderer, title, hide axes).
# The first four views pass perspective=False; the last two rely on the
# renderer's default for `perspective`.
view_specs = [
    (0, {"perspective": False}, "0-degree view", False),
    (90, {"perspective": False}, "90-degree view", False),
    (180, {"perspective": False}, "180-degree view", False),
    (270, {"perspective": False}, "270-degree view", False),
    (0, {}, "00-degree view", True),
    (45, {}, "side view", True),
]

# gripper visualization scale (constant for every frame)
voxel_size = 0.045
voxel_scale = voxel_size * 100


def _frame_field(sample_label):
    """Return the frame field (third '-'-separated field) of a sample label."""
    return sample_label.split('-')[2]


def _sample_batch_for_goal(data_iter, wanted_goal, target_device):
    """Draw batches from `data_iter` until one is labelled `wanted_goal`.

    Returns only the tensor entries of that batch, moved to `target_device`.
    Sampling is unbounded: the loop ends only once the wanted label is drawn.
    """
    while True:
        candidate = next(data_iter)
        if candidate['lang_goal'][0][0][0] == wanted_goal:
            return {key: value.to(target_device)
                    for key, value in candidate.items()
                    if isinstance(value, torch.Tensor)}


# initialize voxelizer
vox_grid = VoxelGrid(
    coord_bounds=SCENE_BOUNDS,
    voxel_size=VOXEL_SIZES[0],
    device=device,
    batch_size=BATCH_SIZE,
    feature_size=3,
    max_num_coords=np.prod([IMAGE_SIZE, IMAGE_SIZE]) * len(settings["cameras"]),
)

# metric scene bounds
bounds = torch.tensor(SCENE_BOUNDS, device=device).unsqueeze(0)

try:
    for analyzed_frame in all_frames:  # Loop through all available frames of the replay buffer
        # Select the samples of exactly this frame. The frame field is compared
        # as a whole: a substring test would also match frames whose token
        # merely starts with this one (e.g. "..._1" inside "..._10").
        frame_language_goals = [goal for goal in replay_buffer
                                if _frame_field(goal) == analyzed_frame]

        # First sample of the frame: observation to voxelize + first action.
        batch = _sample_batch_for_goal(train_data_iter, frame_language_goals[0], device)
        action_trans = batch['trans_action_indicies'][:, -1, :3].int()
        action_gripper_pose = batch['gripper_pose'][:, -1]

        # If a second keypoint/action exists for this frame, render its gripper pose too.
        if len(frame_language_goals) > 1:
            batch_temp = _sample_batch_for_goal(train_data_iter, frame_language_goals[1], device)
            next_action_gripper_pose = batch_temp['gripper_pose'][:, -1]
            action_gripper_pose = torch.cat([action_gripper_pose, next_action_gripper_pose], dim=0)

        # preprocess observations: one (rgb, point cloud) pair per camera
        rgbs_pcds, _unused = _preprocess_inputs(batch, settings["cameras"])
        rgb = [rp[0] for rp in rgbs_pcds]
        pcds = [rp[1] for rp in rgbs_pcds]

        # batch_size
        bs = rgb[0].shape[0]

        # flatten observations: move channels last, then merge all pixels of all cameras
        pcd_flat = torch.cat([p.permute(0, 2, 3, 1).reshape(bs, -1, 3) for p in pcds], 1)
        feat_size = rgb[0].shape[1]
        flat_imag_features = torch.cat(
            [p.permute(0, 2, 3, 1).reshape(bs, -1, feat_size) for p in rgb], 1)

        # voxelize!
        voxel_grid = vox_grid.coords_to_bounding_voxel_grid(pcd_flat,
                                                            flat_imag_features,
                                                            coord_bounds=bounds)
        # swap to channels first
        voxel_grid = voxel_grid.permute(0, 4, 1, 2, 3).detach().cpu().numpy()

        # expert action voxel indices
        coords_indicies = action_trans
        # continuous gripper pose(s): first three columns translation, the rest rotation
        continuous_trans = action_gripper_pose[:, :3].detach().cpu().numpy()
        continuous_quat = action_gripper_pose[:, 3:].detach().cpu().numpy()

        # gripper visualization pose, one matrix per action
        gripper_pose_mat = np.squeeze(np.array([
            get_gripper_render_pose(voxel_scale,
                                    SCENE_BOUNDS[:3],
                                    continuous_trans_i,
                                    continuous_quat_i)
            for continuous_trans_i, continuous_quat_i in zip(continuous_trans, continuous_quat)
        ]))

        # Render every view and place it in its subplot of a 3x2 figure.
        fig, axes = plt.subplots(3, 2, figsize=(20, 15))
        for ax, (rotation_deg, extra_kwargs, title, hide_axes) in zip(axes.flat, view_specs):
            rendered_img = visualise_voxel_video(voxel_grid[0],
                                                 None,
                                                 None,
                                                 coords_indicies[0],
                                                 highlight_alpha=1.0,
                                                 voxel_size=voxel_size,
                                                 rotation_amount=np.deg2rad(rotation_deg),
                                                 render_gripper=True,
                                                 gripper_pose=gripper_pose_mat,
                                                 gripper_mesh_scale=voxel_scale,
                                                 **extra_kwargs)
            ax.imshow(rendered_img)
            if hide_axes:
                ax.axis('off')
            ax.set_title(title)

        # Save the figure as an image
        frame_path = os.path.join(output_dir, f"frame_{analyzed_frame}.png")
        fig.savefig(frame_path)
        plt.close(fig)

        # Add the frame to the video (read the image back once and reuse it)
        frame_image = cv2.imread(frame_path)
        if video_writer is None:
            # Initialize video writer with the first frame's dimensions
            height, width, _channels = frame_image.shape
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
            video_writer = cv2.VideoWriter(output_video, fourcc, fps, (width, height))
        video_writer.write(frame_image)
        frames_written += 1
finally:
    # Release the video writer and remove the temporary frames even when a
    # frame failed to render. Cleanup is best-effort so that it can never
    # hide an error raised inside the loop.
    if video_writer is not None:
        video_writer.release()
    shutil.rmtree(output_dir, ignore_errors=True)

if frames_written:
    print(f"Video saved as {output_video}")
else:
    print(f"No frames found for episode {analyzed_episode}; no video was written.")