In [1]:
import viser
import time
import sys
import argparse
from pathlib import Path
import numpy as onp
import tyro
from tqdm.auto import tqdm

# import viser.src.viser as viser
# import viser.extras as extras
import viser.transforms as tf
# import viser.src.viser.extras as extras
# import viser.src.viser.transforms as tf
import matplotlib.cm as cm  # For colormap

import open3d as o3d
import numpy as np
import asyncio

In [2]:
from __future__ import annotations

import dataclasses
import os
from pathlib import Path
from typing import Tuple, cast

import imageio.v3 as iio
import numpy as np
import numpy as onp
import numpy.typing as onpt
import skimage.transform
from scipy.spatial.transform import Rotation

class Record3dLoader_Customized:
    """Helper for loading frames for Record3D captures."""

    def __init__(self, data_dir: Path, conf_threshold: float = 1.0, foreground_conf_threshold: float = 0.1, no_mask: bool = False, xyzw=True, init_conf=False):

        # Read metadata.
        intrinsics_path = data_dir / "pred_intrinsics.txt"
        intrinsics = np.loadtxt(intrinsics_path)

        self.K: onp.ndarray = np.array(intrinsics, np.float32).reshape(-1, 3, 3)
        fps = 30

        self.init_conf = init_conf

        poses_path = data_dir / "pred_traj.txt"
        poses = np.loadtxt(poses_path)
        self.T_world_cameras: onp.ndarray = np.array(poses, np.float32)
        self.T_world_cameras = np.concatenate(
            [
                # Convert TUM pose to SE3 pose
                Rotation.from_quat(self.T_world_cameras[:, 4:]).as_matrix() if not xyzw
                else Rotation.from_quat(np.concatenate([self.T_world_cameras[:, 5:], self.T_world_cameras[:, 4:5]], -1)).as_matrix(),
                self.T_world_cameras[:, 1:4, None],
            ],
            -1,
        )
        self.T_world_cameras = self.T_world_cameras.astype(np.float32)

        # Convert to homogeneous transformation matrices (ensure shape is (N, 4, 4))
        num_frames = self.T_world_cameras.shape[0]
        ones = np.tile(np.array([0, 0, 0, 1], dtype=np.float32), (num_frames, 1, 1))
        self.T_world_cameras = np.concatenate([self.T_world_cameras, ones], axis=1)

        self.fps = fps
        self.conf_threshold = conf_threshold
        self.foreground_conf_threshold = foreground_conf_threshold
        self.no_mask = no_mask

        # Read frames.
        self.rgb_paths = sorted(data_dir.glob("frame_*.png"), key=lambda p: int(p.stem.split("_")[-1]))
        self.depth_paths = sorted(data_dir.glob("frame_*.npy"), key=lambda p: int(p.stem.split("_")[-1]))
        if init_conf:
            self.init_conf_paths = sorted(data_dir.glob("init_conf_*.npy"), key=lambda p: int(p.stem.split("_")[-1]))
        else:
            self.init_conf_paths = []
        self.conf_paths = sorted(data_dir.glob("conf_*.npy"), key=lambda p: int(p.stem.split("_")[-1]))
        self.mask_paths = sorted(data_dir.glob("enlarged_dynamic_mask_*.png"), key=lambda p: int(p.stem.split("_")[-1]))

        # Remove the last frame since it does not have a ground truth dynamic mask
        self.rgb_paths = self.rgb_paths[:-1]

        # Align all camera poses by the first frame
        T0 = self.T_world_cameras[len(self.T_world_cameras) // 2]  # First camera pose (4x4 matrix)
        T0_inv = np.linalg.inv(T0)    # Inverse of the first camera pose

        # Apply T0_inv to all camera poses
        self.T_world_cameras = np.matmul(T0_inv[np.newaxis, :, :], self.T_world_cameras)


    def num_frames(self) -> int:
        return len(self.rgb_paths)

    def get_frame(self, index: int) -> Record3dFrame:

        # Read depth.
        depth = np.load(self.depth_paths[index])
        depth: onp.NDArray[onp.float32] = depth
        
        # Check if conf file exists, otherwise initialize with ones
        if len(self.conf_paths) == 0:
            conf = np.ones_like(depth, dtype=onp.float32)
        else:
            conf_path = self.conf_paths[index]
            if os.path.exists(conf_path):
                conf = np.load(conf_path)
                conf: onpt.NDArray[onp.float32] = conf
                # Clip confidence to avoid negative values
                conf = np.clip(conf, 0.0001, 99999)
            else:
                conf = np.ones_like(depth, dtype=onp.float32)

        # Check if init conf file exists, otherwise initialize with ones
        if len(self.init_conf_paths) == 0:  # If init conf is not available, use conf
            init_conf = conf
        else:
            init_conf_path = self.init_conf_paths[index]
            if os.path.exists(init_conf_path):
                init_conf = np.load(init_conf_path)
                init_conf: onpt.NDArray[onp.float32] = init_conf
                # Clip confidence to avoid negative values
                init_conf = np.clip(init_conf, 0.0001, 99999)
            else:
                init_conf = np.ones_like(depth, dtype=onp.float32)
        
        # Check if mask file exists, otherwise initialize with zeros
        if len(self.mask_paths) == 0:
            mask = np.ones_like(depth, dtype=onp.bool_)
        else:
            mask_path = self.mask_paths[index]
            if os.path.exists(mask_path):
                mask = iio.imread(mask_path) > 0
                mask: onpt.NDArray[onp.bool_] = mask
            else:
                mask = np.ones_like(depth, dtype=onp.bool_)

        if self.no_mask:
            mask = np.ones_like(mask).astype(np.bool_)

        # Read RGB.
        rgb = iio.imread(self.rgb_paths[index])
        # if 4 channels, remove the alpha channel
        if rgb.shape[-1] == 4:
            rgb = rgb[..., :3]

        return Record3dFrame(
            K=self.K[index],
            rgb=rgb,
            depth=depth,
            mask=mask,
            conf=conf,
            init_conf=init_conf,
            T_world_camera=self.T_world_cameras[index],
            conf_threshold=self.conf_threshold,
            foreground_conf_threshold=self.foreground_conf_threshold,
        )


@dataclasses.dataclass
class Record3dFrame:
    """A single frame from a Record3D capture."""

    K: onpt.NDArray[onp.float32]
    rgb: onpt.NDArray[onp.uint8]
    depth: onpt.NDArray[onp.float32]
    mask: onpt.NDArray[onp.bool_]
    conf: onpt.NDArray[onp.float32]
    init_conf: onpt.NDArray[onp.float32]
    T_world_camera: onpt.NDArray[onp.float32]
    conf_threshold: float = 1.0
    foreground_conf_threshold: float = 0.1

    def get_point_cloud(
        self, downsample_factor: int = 1, bg_downsample_factor: int = 1,
    ) -> Tuple[onpt.NDArray[onp.float32], onpt.NDArray[onp.uint8], onpt.NDArray[onp.float32], onpt.NDArray[onp.uint8]]:
        rgb = self.rgb[::downsample_factor, ::downsample_factor]
        depth = skimage.transform.resize(self.depth, rgb.shape[:2], order=0)
        mask = cast(
            onpt.NDArray[onp.bool_],
            skimage.transform.resize(self.mask, rgb.shape[:2], order=0),
        )
        assert depth.shape == rgb.shape[:2]

        K = self.K
        T_world_camera = self.T_world_camera

        img_wh = rgb.shape[:2][::-1]

        grid = (
            np.stack(np.meshgrid(np.arange(img_wh[0]), np.arange(img_wh[1])), 2) + 0.5
        )
        grid = grid * downsample_factor
        conf_mask = self.conf > self.conf_threshold
        if self.init_conf is not None:
            fg_conf_mask = self.init_conf > self.foreground_conf_threshold
        else:
            fg_conf_mask = self.conf > self.foreground_conf_threshold
        # reshape the conf mask to the shape of the depth
        conf_mask = skimage.transform.resize(conf_mask, depth.shape, order=0)
        fg_conf_mask = skimage.transform.resize(fg_conf_mask, depth.shape, order=0)

        # Foreground points
        homo_grid = np.pad(grid[fg_conf_mask & mask], ((0, 0), (0, 1)), constant_values=1)
        local_dirs = np.einsum("ij,bj->bi", np.linalg.inv(K), homo_grid)
        dirs = np.einsum("ij,bj->bi", T_world_camera[:3, :3], local_dirs)
        points = (T_world_camera[:3, 3] + dirs * depth[fg_conf_mask & mask, None]).astype(np.float32)
        point_colors = rgb[fg_conf_mask & mask]

        # Background points
        bg_homo_grid = np.pad(grid[conf_mask & ~mask], ((0, 0), (0, 1)), constant_values=1)
        bg_local_dirs = np.einsum("ij,bj->bi", np.linalg.inv(K), bg_homo_grid)
        bg_dirs = np.einsum("ij,bj->bi", T_world_camera[:3, :3], bg_local_dirs)
        bg_points = (T_world_camera[:3, 3] + bg_dirs * depth[conf_mask & ~mask, None]).astype(np.float32)
        bg_point_colors = rgb[conf_mask & ~mask]

        if bg_downsample_factor > 1 and bg_points.shape[0] > 0:
            indices = np.random.choice(
                bg_points.shape[0],
                size=bg_points.shape[0] // bg_downsample_factor,
                replace=False
            )
            bg_points = bg_points[indices]
            bg_point_colors = bg_point_colors[indices]
        return points, point_colors, bg_points, bg_point_colors


In [3]:
# same parameters as viser src code defaults
downsample_factor = 1
max_frames = 100
conf_threshold: float = 1
foreground_conf_threshold: float = 0.1
point_size: float = 0.001
camera_frustum_scale: float = 0.02
no_mask: bool = False
xyzw: bool = True
axes_scale: float = 0.25
bg_downsample_factor: int = 1
init_conf: bool = True
cam_thickness: float = 1.5

In [4]:
def align_loader2_to_loader1(loader1, loader2, overlapping_frame_count = 24):
    T1_last = loader1.T_world_cameras[-1]  # Last pose of loader1
    T2_first = loader2.T_world_cameras[0]  # First pose of loader2

    # Compute alignment transformation
    T_align = T1_last @ np.linalg.inv(T2_first)

    # Apply to all of loader2's poses
    loader2.T_world_cameras = np.einsum('ij,njk->nik', T_align, loader2.T_world_cameras)

In [5]:
def main(
    data_paths,
    downsample_factor: int = 1,
    max_frames: int = 2000,
    share: bool = False,
    conf_threshold: float = 1.0,
    foreground_conf_threshold: float = 0.1,
    point_size: float = 0.001,
    camera_frustum_scale: float = 0.02,
    no_mask: bool = False,
    xyzw: bool = True,
    axes_scale: float = 0.25,
    bg_downsample_factor: int = 1,
    init_conf: bool = False,
    cam_thickness: float = 1.5,
) -> None:
    from pathlib import Path  # <-- Import Path here if not already imported
    server = viser.ViserServer()
    if share:
        server.request_share_url()

    server.scene.set_up_direction('-z')
    if no_mask:             # not using dynamic / static mask
        init_conf = True    # must use init_conf map, to avoid depth cleaning
        fg_conf_thre = conf_threshold # now fg_conf_thre is the same as conf_thre
    print("Loading frames!")

    loaders = [Record3dLoader_Customized(
        data_path,
        conf_threshold=conf_threshold,
        foreground_conf_threshold=foreground_conf_threshold,
        no_mask=no_mask,
        xyzw=xyzw,
        init_conf=init_conf,
    ) for data_path in data_paths]


    for i in range(len(loaders) - 1):
        align_loader2_to_loader1(loaders[i], loaders[i + 1])
    

    num_frames = sum([i.num_frames() for i in loaders])

    # Add playback UI.
    with server.gui.add_folder("Playback"):
        gui_timestep = server.gui.add_slider(
            "Timestep",
            min=0,
            max=num_frames - 1,
            step=1,
            initial_value=0,
            disabled=True,
        )
        gui_next_frame = server.gui.add_button("Next Frame", disabled=True)
        gui_prev_frame = server.gui.add_button("Prev Frame", disabled=True)
        gui_playing = server.gui.add_checkbox("Playing", True)
        gui_framerate = server.gui.add_slider(
            "FPS", min=1, max=60, step=0.1, initial_value=loaders[0].fps
        )
        gui_framerate_options = server.gui.add_button_group(
            "FPS options", ("10", "20", "30", "60")
        )
        gui_show_all_frames = server.gui.add_checkbox("Show all frames", False)
        gui_stride = server.gui.add_slider(
            "Stride",
            min=1,
            max=num_frames,
            step=1,
            initial_value=1,
            disabled=True,  # Initially disabled
        )

    # Add recording UI.
    with server.gui.add_folder("Recording"):
        gui_record_scene = server.gui.add_button("Record Scene")

    # Frame step buttons.
    @gui_next_frame.on_click
    def _(_) -> None:
        gui_timestep.value = (gui_timestep.value + 1) % num_frames

    @gui_prev_frame.on_click
    def _(_) -> None:
        gui_timestep.value = (gui_timestep.value - 1) % num_frames

    # Disable frame controls when we're playing.
    @gui_playing.on_update
    def _(_) -> None:
        gui_timestep.disabled = gui_playing.value or gui_show_all_frames.value
        gui_next_frame.disabled = gui_playing.value or gui_show_all_frames.value
        gui_prev_frame.disabled = gui_playing.value or gui_show_all_frames.value

    # Toggle frame visibility when the timestep slider changes.
    @gui_timestep.on_update
    def _(_) -> None:
        nonlocal prev_timestep
        current_timestep = gui_timestep.value
        if not gui_show_all_frames.value:
            with server.atomic():
                frame_nodes[current_timestep].visible = True
                frame_nodes[prev_timestep].visible = False
        prev_timestep = current_timestep
        server.flush()  # Optional!

    # Show or hide all frames based on the checkbox.
    @gui_show_all_frames.on_update
    def _(_) -> None:
        gui_stride.disabled = not gui_show_all_frames.value  # Enable/disable stride slider
        if gui_show_all_frames.value:
            # Show frames with stride
            stride = gui_stride.value
            with server.atomic():
                for i, frame_node in enumerate(frame_nodes):
                    frame_node.visible = (i % stride == 0)
            # Disable playback controls
            gui_playing.disabled = True
            gui_timestep.disabled = True
            gui_next_frame.disabled = True
            gui_prev_frame.disabled = True
        else:
            # Show only the current frame
            current_timestep = gui_timestep.value
            with server.atomic():
                for i, frame_node in enumerate(frame_nodes):
                    frame_node.visible = i == current_timestep
            # Re-enable playback controls
            gui_playing.disabled = False
            gui_timestep.disabled = gui_playing.value
            gui_next_frame.disabled = gui_playing.value
            gui_prev_frame.disabled = gui_playing.value

    # Update frame visibility when the stride changes.
    @gui_stride.on_update
    def _(_) -> None:
        if gui_show_all_frames.value:
            # Update frame visibility based on new stride
            stride = gui_stride.value
            with server.atomic():
                for i, frame_node in enumerate(frame_nodes):
                    frame_node.visible = (i % stride == 0)

    # Recording handler
    @gui_record_scene.on_click
    def _(_):
        gui_record_scene.disabled = True

        # Save the original frame visibility state
        original_visibility = [frame_node.visible for frame_node in frame_nodes]

        rec = server._start_scene_recording()
        rec.set_loop_start()

        # Determine sleep duration based on current FPS
        sleep_duration = 1.0 / gui_framerate.value if gui_framerate.value > 0 else 0.033  # Default to ~30 FPS

        if gui_show_all_frames.value:
            # Record all frames according to the stride
            stride = gui_stride.value
            frames_to_record = [i for i in range(num_frames) if i % stride == 0]
        else:
            # Record the frames in sequence
            frames_to_record = range(num_frames)

        for t in frames_to_record:
            # Update the scene to show frame t
            with server.atomic():
                for i, frame_node in enumerate(frame_nodes):
                    frame_node.visible = (i == t) if not gui_show_all_frames.value else (i % gui_stride.value == 0)
            server.flush()
            rec.insert_sleep(sleep_duration)

        # set all invisible
        with server.atomic():
            for frame_node in frame_nodes:
                frame_node.visible = False

        # Finish recording
        bs = rec.end_and_serialize()

        # Save the recording to a file
        output_path = Path(f"./viser_result/recording_{str(data_path).split('/')[-1]}.viser")
        # make sure the output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(bs)
        print(f"Recording saved to {output_path.resolve()}")

        # Restore the original frame visibility state
        with server.atomic():
            for frame_node, visibility in zip(frame_nodes, original_visibility):
                frame_node.visible = visibility
        server.flush()

        gui_record_scene.disabled = False

    # Load in frames.
    server.scene.add_frame(
        "/frames",
        wxyz=tf.SO3.exp(onp.array([onp.pi / 2.0, 0.0, 0.0])).wxyz,
        position=(0, 0, 0),
        show_axes=False,
    )
    frame_nodes: list[viser.FrameHandle] = []
    bg_positions = []
    bg_colors = []
    curr_loader = 0
    curr_frame_within_loader = 0
    for i in tqdm(range(num_frames)):
        if curr_frame_within_loader == loaders[curr_loader].num_frames():
            curr_loader += 1
            curr_frame_within_loader = 0
        
        frame = loaders[curr_loader].get_frame(curr_frame_within_loader)
        curr_frame_within_loader += 1

        
        position, color, bg_position, bg_color = frame.get_point_cloud(downsample_factor, bg_downsample_factor)

        bg_positions.append(bg_position)
        bg_colors.append(bg_color)

        # Add base frame.
        frame_nodes.append(server.scene.add_frame(f"/frames/t{i}", show_axes=False))

        # Place the point cloud in the frame.
        server.scene.add_point_cloud(
            name=f"/frames/t{i}/point_cloud",
            points=position,
            colors=color,
            point_size=point_size,
            point_shape="rounded",
        )

        # Compute color for frustum based on frame index.
        norm_i = i / (num_frames - 1) if num_frames > 1 else 0  # Normalize index to [0, 1]
        color_rgba = cm.viridis(norm_i)  # Get RGBA color from colormap
        color_rgb = color_rgba[:3]  # Use RGB components

        # Place the frustum with the computed color.
        fov = 2 * onp.arctan2(frame.rgb.shape[0] / 2, frame.K[0, 0])
        aspect = frame.rgb.shape[1] / frame.rgb.shape[0]
        server.scene.add_camera_frustum(
            f"/frames/t{i}/frustum",
            fov=fov,
            aspect=aspect,
            scale=camera_frustum_scale,
            image=frame.rgb[::downsample_factor, ::downsample_factor],
            wxyz=tf.SO3.from_matrix(frame.T_world_camera[:3, :3]).wxyz,
            position=frame.T_world_camera[:3, 3],
            color=color_rgb,  # Set the color for the frustum
            # thickness=cam_thickness,
        )

        # Add some axes.
        server.scene.add_frame(
            f"/frames/t{i}/frustum/axes",
            axes_length=camera_frustum_scale * axes_scale * 10,
            axes_radius=camera_frustum_scale * axes_scale,
        )
    print("loaded frames")
    # Initialize frame visibility.
    for i, frame_node in enumerate(frame_nodes):
        if gui_show_all_frames.value:
            frame_node.visible = (i % gui_stride.value == 0)
        else:
            frame_node.visible = i == gui_timestep.value


    print("initted frame visibitility")
    # Add background frame.
    bg_positions = onp.concatenate(bg_positions, axis=0)
    bg_colors = onp.concatenate(bg_colors, axis=0)
    server.scene.add_point_cloud(
        name=f"/frames/background",
        points=bg_positions,
        colors=bg_colors,
        point_size=point_size,
        point_shape="rounded",
    )

    print("added background frame")
    # Playback update loop.
    prev_timestep = gui_timestep.value
    while True:
        if gui_playing.value and not gui_show_all_frames.value:
            gui_timestep.value = (gui_timestep.value + 1) % num_frames
        time.sleep(1.0 / gui_framerate.value)
    return bg_positions, bg_colors



In [None]:
data_paths = [  
        # Path("cl_start_data/START_CLIP_000.MP4"),
        # Path("cl_start_data/START_CLIP_001.MP4"),
        # Path("cl_start_data/START_CLIP_002.MP4"),
        Path("clipped_data/cl_clip_000"),
        Path("clipped_data/cl_clip_001"),
        Path("clipped_data/cl_clip_002"),
        Path("cl_678_data/cl_678_clip_000"),
        Path("cl_678_data/cl_678_clip_001"),
        Path("cl_678_data/cl_678_clip_002"),
        # Path("cl_end_data/END_CLIP_000.MP4"),
        # Path("cl_end_data/END_CLIP_001.MP4"),
        # Path("cl_end_data/END_CLIP_002.MP4"),
        # Path("cl_end_data/END_CLIP_003.MP4"),
        # Path("cl_end_data/END_CLIP_004.MP4"),
        # Path("cl_end_data/END_CLIP_005.MP4"),
        # Path("cl_end_data/END_CLIP_006.MP4"),
]
bg_positions, bg_colors = main(
        data_paths = data_paths,
        share=True
)

Loading frames!


  0%|          | 0/408 [00:00<?, ?it/s]

loaded frames
initted frame visibitility
added background frame


KeyboardInterrupt: 

In [None]:
def paths_to_pcd(path1, path2, conf_threshold: float = 1.0,
    foreground_conf_threshold: float = 0.1,
    point_size: float = 0.001,
    camera_frustum_scale: float = 0.02,
    no_mask: bool = False,
    xyzw: bool = True,
    axes_scale: float = 0.25,
    bg_downsample_factor: int = 1,
    init_conf: bool = False,
    cam_thickness: float = 1.5,
          downsample_factor = 1):
    loader1 = extras.Record3dLoader_Customized(
        path1,
        conf_threshold=conf_threshold,
        foreground_conf_threshold=foreground_conf_threshold,
        no_mask=no_mask,
        xyzw=xyzw,
        init_conf=init_conf,
    )

    loader2 = extras.Record3dLoader_Customized(
        path2,
        conf_threshold=conf_threshold,
        foreground_conf_threshold=foreground_conf_threshold,
        no_mask=no_mask,
        xyzw=xyzw,
        init_conf=init_conf,
    )
    align_loader2_to_loader1(loader1=loader1, loader2=loader2)
    num_frames = loader1.num_frames() + loader2.num_frames()
    print(num_frames)

    all_positions = []
    all_colors = []

    bg_positions = []
    bg_colors = []
    for i in tqdm(range(num_frames)):
        if i < loader1.num_frames():
            frame = loader1.get_frame(i)
        else:
            frame = loader2.get_frame(i - loader1.num_frames())
        position, color, bg_position, bg_color = frame.get_point_cloud(downsample_factor, bg_downsample_factor)

        all_positions.append(position)
        all_colors.append(color)

        bg_positions.append(bg_position)
        bg_colors.append(bg_color)

    all_positions = onp.concatenate(all_positions + bg_positions, axis=0)
    all_colors = onp.concatenate(all_colors + bg_colors, axis=0)

    final_point_cloud = {
        "points": all_positions,
        "colors": all_colors,
    }
    pcd = o3d.geometry.PointCloud()
    points = final_point_cloud["points"].astype(np.float32)
    colors = final_point_cloud["colors"].astype(np.float32)
    if colors.max() > 1.0:
        colors /= 255.0
    pcd.points = o3d.utility.Vector3dVector(points)
    pcd.colors = o3d.utility.Vector3dVector(colors)
    return pcd

In [17]:
pcd_final = paths_to_pcd(Path("pc3/"), Path("pc4/"))

238


  0%|          | 0/238 [00:00<?, ?it/s]

In [18]:
down_pcd = pcd_final.voxel_down_sample(voxel_size=0.01)

In [19]:
o3d.visualization.draw_geometries([down_pcd],
                                   window_name="Point Cloud Viewer",
                                   width=800,
                                   height=600,
                                   point_show_normal=False)