# Overview: Waymo Open Dataset -- Perception Object Assets

Modeling the 3D world from sensor data for simulation is a scalable way of developing testing and validation environments for robotic learning problems such as autonomous driving. We provide a large-scale object-centric asset dataset containing over 520K images and lidar observations of two major categories (vehicles and pedestrians) from the released Perception data (v2.0.0). We hope this data will enable and advance research on 3D point cloud reconstruction and completion, object NeRF reconstruction, and generative object assets to address the real-world driving challenges with occlusions, lighting-variations, and long-tail distributions.

Please familiarize yourself with the [Perception data v2 format and tutorial](https://github.com/waymo-research/waymo-open-dataset) before proceeding.

In [None]:
#@title Initial setup
from typing import Optional, Any, Mapping, Tuple
import warnings
# Disable noisy FutureWarnings from PyArrow, which is used under the hood.
warnings.simplefilter(action='ignore', category=FutureWarning)

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import plotly  # Used by visu3d
import tensorflow as tf
import visu3d

from waymo_open_dataset import v2
from waymo_open_dataset.utils import camera_segmentation_utils

# Path to the directory with all components.
# Placeholder value: replace it before running. `read` below looks for files
# matching `{dataset_dir}/{tag}/{context_name}.parquet`.
dataset_dir = '<specify actual path>'

# Context name used by `read` as the parquet file name (without extension)
# inside each component directory.
context_name = '2736377008667623133_2676_410_2696_410'

def read(tag: str) -> dd.DataFrame:
  """Builds a Dask DataFrame for one dataset component.

  Args:
    tag: Component tag; it is used as the sub-directory name under
      `dataset_dir`.

  Returns:
    A Dask DataFrame over the parquet files matching
    `{dataset_dir}/{tag}/{context_name}.parquet`.
  """
  pattern = f'{dataset_dir}/{tag}/{context_name}.parquet'
  return dd.read_parquet(tf.io.gfile.glob(pattern))

## Load and visualize camera fields

In [None]:
#@title Visualization utility
def apply_color_mask(image: np.ndarray,
                     mask: np.ndarray,
                     color: Tuple[int, int, int],
                     alpha: float = 0.5) -> np.ndarray:
  """Blends a solid color into the masked pixels of an image.

  Args:
    image: Image array whose last axis holds the color channels, e.g.
      (H, W, 3).
    mask: Mask selecting the pixels to tint; non-zero / True entries are
      tinted. It may either broadcast against `image` directly, e.g.
      (H, W, 1), or lack the trailing channel axis, e.g. (H, W).
    color: Color to blend in, one value per image channel.
    alpha: Blend weight of `color`; 0 keeps the image, 1 paints the pure
      color.

  Returns:
    A uint8 array shaped like `image` where masked pixels are
    `image * (1 - alpha) + alpha * color` and all others are unchanged.
  """
  mask = np.asarray(mask)
  # A mask without the channel axis would either fail to broadcast against
  # the image or, when W equals the channel count, silently line up with the
  # wrong axes. Give it an explicit trailing axis instead.
  if mask.ndim == image.ndim - 1:
    mask = mask[..., np.newaxis]
  color = np.array(color)[np.newaxis, :]
  blended = image * (1 - alpha) + alpha * color
  return np.where(mask, blended, image).astype(np.uint8)


def grid_imshow(h: int, w: int, images: Any) -> None:
  """Displays images in an h-by-w grid of subplots.

  Args:
    h: Number of grid rows.
    w: Number of grid columns.
    images: Iterable of images accepted by `Axes.imshow`. They fill the grid
      in row-major order; grid cells without an image are left untouched.

  Raises:
    IndexError: If more than `h * w` images are given.
  """
  # squeeze=False makes `subplots` always return a 2-D array of axes, so a
  # 1x1 grid (otherwise a bare Axes) and multi-row grids (otherwise indexed
  # row by row) are handled the same way as a single row.
  fig, axes = plt.subplots(h, w, squeeze=False)
  fig.set_size_inches(20, 10)
  fig.tight_layout()
  flat_axes = axes.ravel()
  for i, image in enumerate(images):
    ax = flat_axes[i]
    ax.imshow(image)
    ax.axis('off')

In [None]:
#@title Basic Example (Camera images with rays and labels)

# Load each object-asset component of the selected context.
asset_camera_sensor_df = read('object_asset_camera_sensor')
asset_ray_df = read('object_asset_ray')
asset_auto_label_df = read('object_asset_auto_label')
# The LiDAR boxes are loaded as well: their dimensions are needed to compute
# the ray-box intersection.
laser_box_df = read('lidar_box')

# Merge the components one after another: camera sensor with rays, then the
# auto labels, then the LiDAR boxes.
asset_df = v2.merge(
    v2.merge(
        v2.merge(asset_camera_sensor_df, asset_ray_df),
        asset_auto_label_df),
    laser_box_df)

# Peek at the raw merged data
asset_df.head()

def parse_key(r: Any) -> Mapping[str, Any]:
  """Extracts the identifying key fields from a row.

  Args:
    r: A row of the merged DataFrame, as accepted by
      `v2.LiDARBoxComponent.from_dict`.

  Returns:
    A mapping with the `segment_context_name`, `laser_object_id` and
    `frame_timestamp_micros` of the row's LiDAR box key.
  """
  key = v2.LiDARBoxComponent.from_dict(r).key
  return dict(
      segment_context_name=key.segment_context_name,
      laser_object_id=key.laser_object_id,
      frame_timestamp_micros=key.frame_timestamp_micros,
  )

def parse_lidar_box(r: Any) -> Mapping[str, Any]:
  """Extracts the LiDAR box of a row as a flat vector.

  Args:
    r: A row of the merged DataFrame, as accepted by
      `v2.LiDARBoxComponent.from_dict`.

  Returns:
    A mapping with a single entry `box_3d`: a float64 array of 7 values laid
    out as [center.x, center.y, center.z, size.x, size.y, size.z, heading].
  """
  box = v2.LiDARBoxComponent.from_dict(r).box
  center, size = box.center, box.size
  box_params = [
      center.x, center.y, center.z,
      size.x, size.y, size.z,
      box.heading,
  ]
  return {'box_3d': np.asarray(box_params, dtype=np.float64)}


def parse_camera_sensor(r: Any) -> Mapping[str, Any]:
  """Decodes the camera sensor images stored in a row.

  Args:
    r: A row of the merged DataFrame, as accepted by
      `v2.ObjectAssetCameraSensorComponent.from_dict`.

  Returns:
    A mapping with `rgb_image` and `proj_points_mask`, each the PNG-decoded
    content of the component field of the same name, as a NumPy array.
  """
  component = v2.ObjectAssetCameraSensorComponent.from_dict(r)
  return {
      field: tf.image.decode_png(getattr(component, field)).numpy()
      for field in ('rgb_image', 'proj_points_mask')
  }


def parse_camera_ray(r: Any) -> Mapping[str, Any]:
  """Extracts per-pixel camera rays and their ray-box intersection mask.

  Args:
    r: A row of the merged DataFrame, as accepted by
      `v2.ObjectAssetRayComponent.from_dict` and `parse_lidar_box`.

  Returns:
    A mapping with `ray_origin` and `ray_direction` (NumPy arrays taken from
    the ray component) and `ray_mask`, the first output of
    `get_ray_box_intersects` reshaped to the rays' leading two dimensions.
  """
  # Entries 3:6 of `box_3d` are the box size (x, y, z).
  box_size = parse_lidar_box(r)['box_3d'][3:6]

  rays = v2.ObjectAssetRayComponent.from_dict(r)
  origins = rays.ray_origin.tensor.numpy()
  directions = rays.ray_direction.tensor.numpy()

  # The intersection helper takes flat lists of 3-vectors; remember the
  # image layout so the resulting mask can be folded back afterwards.
  height, width = origins.shape[:2]
  hit_mask, _, _ = v2._object_asset_utils.get_ray_box_intersects(
      origins.reshape(-1, 3),
      directions.reshape(-1, 3),
      box_size,
  )

  return {
      'ray_origin': origins,
      'ray_direction': directions,
      'ray_mask': hit_mask.reshape(height, width, -1),
  }


def parse_camera_label(r: Any) -> Mapping[str, Any]:
  """Decodes the camera auto-label masks stored in a row.

  Args:
    r: A row of the merged DataFrame, as accepted by
      `v2.ObjectAssetAutoLabelComponent.from_dict`.

  Returns:
    A mapping with `obj_mask`, `semantic_mask` and `instance_mask` as NumPy
    arrays decoded from PNG; the instance mask is decoded as uint16.
  """
  labels = v2.ObjectAssetAutoLabelComponent.from_dict(r)
  decode = tf.image.decode_png
  return {
      'obj_mask': decode(labels.object_mask).numpy(),
      'semantic_mask': decode(labels.semantic_mask).numpy(),
      'instance_mask': decode(labels.instance_mask, dtype=tf.uint16).numpy(),
  }


# Example of how to access the data fields.
print(f'Available {asset_df.shape[0].compute()} rows:')
for i, (_, r) in enumerate(asset_df.iterrows()):
  # Parse every component of the row into one flat dict of fields.
  data_fields = {
      **parse_key(r),
      **parse_lidar_box(r),
      **parse_camera_sensor(r),
      **parse_camera_ray(r),
      **parse_camera_label(r),
  }

  panoptic_image = camera_segmentation_utils.panoptic_label_to_rgb(
      semantic_label=data_fields['semantic_mask'],
      instance_label=data_fields['instance_mask'])

  print(
      'context_name: ', data_fields['segment_context_name'],
      ' ts: ', data_fields['frame_timestamp_micros'],
      ' laser_object_id: ', data_fields['laser_object_id'])
  # Five panels: the raw image, the image tinted by each of the three masks
  # (projected points, ray-box intersection, object mask), and the panoptic
  # label image.
  grid_imshow(1, 5, [
      data_fields['rgb_image'],
      *(
          apply_color_mask(
              data_fields['rgb_image'],
              data_fields[mask_name],
              color=overlay_color,
              alpha=overlay_alpha)
          for mask_name, overlay_color, overlay_alpha in (
              ('proj_points_mask', (0, 255, 0), 0.7),
              ('ray_mask', (255, 255, 0), 0.3),
              ('obj_mask', (0, 255, 255), 0.3),
          )
      ),
      panoptic_image])
  plt.show()
  # Stop after the first four rows.
  if i >= 3:
    print('...')
    break


## Load and visualize point clouds

In [None]:
#@title Example to visualize lidar points.
asset_df = read('object_asset_lidar_sensor')

# Gather the points of the first four rows.
all_points_xyz = []
for i, (_, r) in enumerate(asset_df.iterrows()):
  # Build the component dataclass from the raw row
  lidar_sensor_component = v2.ObjectAssetLiDARSensorComponent.from_dict(r)
  print(
      f'context_name: {lidar_sensor_component.key.segment_context_name}',
      f' ts: {lidar_sensor_component.key.frame_timestamp_micros}',
      f' laser_object_id: {lidar_sensor_component.key.laser_object_id}')

  points_xyz = lidar_sensor_component.points_xyz.tensor.numpy()
  all_points_xyz += [points_xyz]
  if i >= 3:
    break

# Stack all gathered points along the first axis and show them as one cloud.
v3d_point_cloud = visu3d.Point3d(p=np.concatenate(all_points_xyz))
v3d_point_cloud.fig