In [1]:
import numpy as np
np.bool = np.bool_ # bad trick to fix numpy version issue :(
import os
import sys
from natsort import natsorted


# Drop every import-path entry that contains '/peract/'.
sys.path = list(filter(lambda entry: '/peract/' not in entry, sys.path))

# Process-wide environment for rendering and GPU selection.
os.environ.update({
    "DISPLAY": ":0",
    # `PYOPENGL_PLATFORM=egl` is needed for the pyrender visualizations
    "PYOPENGL_PLATFORM": "egl",
    # Depends on your computer and available GPUs
    "CUDA_VISIBLE_DEVICES": "0,3",
})

In [2]:
## DISPLAY ALL SETTINGS
import itertools


available_cameras = ["view_%d" % camera_i for camera_i in range(3)]

# Grid search definition: every key maps to the list of values to try.
# 'fill_replay_setting' values handled further down in this notebook are
# "crop", "standard" and "uniform".
grid = {
    'fill_replay_setting': ["standard"],
    'cameras': [available_cameras, [available_cameras[0]]],
    'keypoint_approach': [True],
    'demo_augm_n': [5] # Not used for fill_replay_setting == "uniform"
}

# Expand the grid into one settings dict per combination (cartesian product)
lst_settings = [
    dict(zip(grid, combination))
    for combination in itertools.product(*grid.values())
]

# Show all combinations together with the index used to select them later
for counter, settings in enumerate(lst_settings):
    print(counter, settings)

# Total number of combinations
counter = len(lst_settings)

0 {'fill_replay_setting': 'standard', 'cameras': ['view_0', 'view_1', 'view_2'], 'keypoint_approach': True, 'demo_augm_n': 5}
1 {'fill_replay_setting': 'standard', 'cameras': ['view_0'], 'keypoint_approach': True, 'demo_augm_n': 5}


In [3]:
## CHOOSE SETTING TO VISUALIZE (CREATE VIDEO OUT OF)
from notebook_helpers.build_replay import load_replay_buffer
from notebook_helpers.constants import * # Load global constant variables from constants.py

# `torch` is needed right below. Import it here explicitly so this cell runs on
# a fresh kernel without relying on the star import above or on a later cell.
import torch

# Index into `lst_settings` (see the printed list above) of the run to visualize
CHOSEN_SETTING = 0
settings = lst_settings[CHOSEN_SETTING]

# Prefer the GPU when one is visible to this process
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the replay buffers for the chosen setting and get iterators over them
train_data_iter, test_data_iter = load_replay_buffer(settings)


Experiment Setup
Task: handing_over_banana - SETUP: s1 - Cameras: 3
Run Properties
Fill replay setting: standard - DEMO_AUGM_N: 5
Emptying /home/ywatabe/Projects/PerAct/replay_train
Could not find /home/ywatabe/Projects/PerAct/replay_train, creating directory.
Emptying /home/ywatabe/Projects/PerAct/replay_test
Could not find /home/ywatabe/Projects/PerAct/replay_test, creating directory.
-- Train Buffer --
Filling replay ...
Filling demo 45
Found 2 keypoints: [34, 58], w/ approach distance [0.2 - 0.42]: 0.301
Using available
0 [34]
5 [34]
10 [34]
15 [34]
20 [34]
Filling demo 46
Found 2 keypoints: [35, 61], w/ approach distance [0.18 - 0.4]: 0.302
Using available
0 [35]
5 [35]
10 [35]
15 [35]
20 [35]
25 [35]
Filling demo 47
Found 2 keypoints: [29, 60], w/ approach distance [0.15 - 0.36]: 0.299
Using available
0 [29]
5 [29]
10 [29]
15 [29]
Filling demo 48
Found 2 keypoints: [28, 60], w/ approach distance [0.13 - 0.36]: 0.299
Using available
0 [28]
5 [28]
10 [28]
15 [28]
Filling demo 49
F

In [None]:
## COLLECT ALL FRAMES USED FOR ANALYSIS

from matplotlib import pyplot as plt

import torch
import torchvision.transforms as T

from agent.utils import _preprocess_inputs
from agent.voxel_grid import VoxelGrid
from arm.utils import get_gripper_render_pose, visualise_voxel_video

## Gather the replay entries that belong to the episode under analysis
analyzed_episode = 46

lang_goals, all_frames = set(), set()
for _ in range(1000):
    # Draw one sample from the training iterator
    batch = next(train_data_iter)
    lang_goal = batch['lang_goal'][0][0][0]

    # The label is split on '-' into four fields; only episode and frame are used
    task, episode, frame, kp = lang_goal.split('-')
    episode_number = int(episode.replace('episode_', ''))
    if episode_number != analyzed_episode:
        continue

    lang_goals.add(lang_goal)
    all_frames.add(frame)

# Natural sort of the collected labels and frame fields
replay_buffer = natsorted(lang_goals)
all_frames = natsorted(all_frames)

print(replay_buffer, all_frames)

In [None]:
## CREATE VIDEO USING COLLECTED FRAMES

import imageio


# Derive the output file name from the active settings
print(settings)
fill_replay_mode = settings.get('fill_replay_setting', None)
if fill_replay_mode in ("crop", "standard"):
    # These modes encode the 'demo_augm_n' value in the name
    video_output_path = f"episode_{analyzed_episode}-fill_replay_{settings['fill_replay_setting']}-skip_{settings['demo_augm_n']}-approach_{settings['keypoint_approach']}.mp4"
elif fill_replay_mode == "uniform":
    # NOTE: the sample count in the name is a hardcoded 5
    video_output_path = f"episode_{analyzed_episode}-fill_replay_{settings['fill_replay_setting']}-sample_{5}-approach_{settings['keypoint_approach']}.mp4"
else:
    raise ValueError("Unkown parameter for settings['fill_replay_setting']. Cannot analyze input data.")

# Voxelizer setup: the coordinate budget is one image worth of points per camera
num_cameras = len(settings["cameras"])
points_per_camera = np.prod([IMAGE_SIZE, IMAGE_SIZE])

vox_grid = VoxelGrid(
    coord_bounds=SCENE_BOUNDS,
    voxel_size=VOXEL_SIZES[0],
    device=device,
    batch_size=BATCH_SIZE,
    feature_size=3,
    max_num_coords=points_per_camera * num_cameras,
)

# Scene bounds as a tensor on `device`, with a leading axis of size 1
bounds = torch.tensor(SCENE_BOUNDS, device=device)[None]

# Render one video frame per collected timestep of the analysed episode.
# Each video frame is a 3x2 matplotlib figure showing the voxelized scene and
# the expert gripper pose(s) from six viewpoints.

# Rendering constants (loop-invariant, so defined once)
voxel_size = 0.045
voxel_scale = voxel_size * 100

# One entry per subplot, in subplot order:
# (title, rotation in degrees, extra kwargs for visualise_voxel_video, hide axes?)
# The first four panels pass perspective=False explicitly; the last two leave
# that argument at the renderer's default.
VIEW_PANELS = [
    ("0-degree view", 0, {"perspective": False}, False),
    ("90-degree view", 90, {"perspective": False}, False),
    ("180-degree view", 180, {"perspective": False}, False),
    ("270-degree view", 270, {"perspective": False}, False),
    ("00-degree view", 0, {}, True),
    ("side view", 45, {}, True),
]


def _frame_field(lang_goal):
    """Return the frame field of a replay label.

    Labels are '-'-separated as (task, episode, frame, keypoint), the same
    layout that is unpacked when `all_frames` is collected, so the frame is
    the third field.
    """
    return lang_goal.split('-')[2]


def _fetch_batch(data_iter, wanted_lang_goal):
    """Sample from `data_iter` until a batch labelled `wanted_lang_goal` is drawn.

    Returns a dict holding only the tensor entries of that batch, moved to
    `device`. NOTE: there is no retry limit; this keeps sampling until the
    label shows up.
    """
    while True:
        candidate = next(data_iter)
        if candidate['lang_goal'][0][0][0] == wanted_lang_goal:
            return {k: v.to(device) for k, v in candidate.items() if isinstance(v, torch.Tensor)}


with imageio.get_writer(video_output_path, fps=10) as video_writer:
    for analyzed_frame in all_frames: # Loop through all available frames of the replay buffer
        # Select the labels whose frame field equals this frame. An exact
        # comparison is required: a substring test would also match labels of
        # other frames (or text inside the episode/keypoint fields).
        frame_language_goals = [goal for goal in replay_buffer if _frame_field(goal) == analyzed_frame]

        # The first matching label provides the observation and the expert action
        batch = _fetch_batch(train_data_iter, frame_language_goals[0])
        action_trans = batch['trans_action_indicies'][:, -1, :3].int()
        action_gripper_pose = batch['gripper_pose'][:, -1]

        # If a second label exists for this frame, render its gripper pose as well
        if len(frame_language_goals) > 1:
            batch_temp = _fetch_batch(train_data_iter, frame_language_goals[1])
            next_action_gripper_pose = batch_temp['gripper_pose'][:, -1]
            action_gripper_pose = torch.cat([action_gripper_pose, next_action_gripper_pose], dim=0)

        # preprocess observations
        rgbs_pcds, _ = _preprocess_inputs(batch, settings["cameras"])
        pcds = [rp[1] for rp in rgbs_pcds]

        # batch_size
        bs = rgbs_pcds[0][0].shape[0]

        # flatten observations (one entry per camera, concatenated along axis 1)
        pcd_flat = torch.cat([p.permute(0, 2, 3, 1).reshape(bs, -1, 3) for p in pcds], 1)
        rgb = [rp[0] for rp in rgbs_pcds]
        feat_size = rgb[0].shape[1]
        flat_imag_features = torch.cat(
            [p.permute(0, 2, 3, 1).reshape(bs, -1, feat_size) for p in rgb], 1)

        # voxelize!
        voxel_grid = vox_grid.coords_to_bounding_voxel_grid(pcd_flat,
                                                            flat_imag_features,
                                                            coord_bounds=bounds)
        # swap to channels first
        voxel_grid = voxel_grid.permute(0, 4, 1, 2, 3).detach().cpu().numpy()

        # expert action voxel indices
        coords_indicies = action_trans
        # split each gripper pose into its first three components and the rest
        continuous_trans = action_gripper_pose[:, :3].detach().cpu().numpy()
        continuous_quat = action_gripper_pose[:, 3:].detach().cpu().numpy()

        # gripper visualization pose(s), one per row of action_gripper_pose
        gripper_pose_mat = [
            get_gripper_render_pose(voxel_scale,
                                    SCENE_BOUNDS[:3],
                                    continuous_trans_i,
                                    continuous_quat_i)
            for continuous_trans_i, continuous_quat_i in zip(continuous_trans, continuous_quat)
        ]
        gripper_pose_mat = np.squeeze(np.array(gripper_pose_mat))

        # Render every viewpoint with identical arguments except rotation/projection
        rendered_views = [
            visualise_voxel_video(voxel_grid[0],
                                  None,
                                  None,
                                  coords_indicies[0],
                                  highlight_alpha=1.0,
                                  voxel_size=voxel_size,
                                  rotation_amount=np.deg2rad(rotation_deg),
                                  render_gripper=True,
                                  gripper_pose=gripper_pose_mat,
                                  gripper_mesh_scale=voxel_scale,
                                  **extra_kwargs)
            for _title, rotation_deg, extra_kwargs, _hide_axes in VIEW_PANELS
        ]

        # Lay the renders out on a 3x2 grid
        fig = plt.figure(figsize=(20, 15))
        for panel_i, ((title, _deg, _kw, hide_axes), image) in enumerate(zip(VIEW_PANELS, rendered_views), start=1):
            ax = fig.add_subplot(3, 2, panel_i)
            ax.imshow(image)
            if hide_axes:
                ax.axis('off')
            ax.set_title(title)

        # Add timestamp as text with white font and black background
        fig.text(0.02, 0.95, f"Timestep: {analyzed_frame}", ha='left', fontsize=16, color='white', weight='bold',
                 bbox=dict(facecolor='black', edgecolor='none', boxstyle='round,pad=0.3'))

        # Convert the matplotlib figure to an RGB NumPy array.
        # `tostring_rgb()` is deprecated since Matplotlib 3.8 and removed in
        # 3.10, so read the RGBA buffer and drop the alpha channel instead.
        # `ascontiguousarray` copies the data, so the frame no longer
        # references the canvas buffer once the figure is closed.
        fig.canvas.draw()
        img_array = np.ascontiguousarray(np.asarray(fig.canvas.buffer_rgba())[..., :3])

        video_writer.append_data(img_array)  # Add frame to video
        plt.close(fig)  # Close the figure to free memory

print(f"Video saved as {video_output_path}")