In [43]:
from pathlib import Path
from typing import Optional, Callable, Dict, List
import numpy as np
import torch
from torch.utils.data import Dataset
from projectaria_tools.core.sensor_data import ImageData
from projectaria_tools.core.stream_id     import StreamId
from projectaria_tools.core.sophus        import SE3, SO3
from projectaria_tools.core               import data_provider
from nymeria.data_provider      import NymeriaDataProvider
from nymeria.recording_data_provider import RecordingDataProvider, AriaStream
from torch.utils.data import DataLoader
from torchvision.transforms import Normalize, Compose

In [None]:
class NymeriaPoseDataset(Dataset):
    """
    Returns for each index
        • RGB frame  (C,H,W)  float32  [0‑1]
        • 3‑D joints in camera frame (23,3)
        • 2‑D projected joints        (23,2)   (‑1 if outside)
    """

    def __init__(
        self,
        seq_root: Path,
        transform: Optional[Callable] = None,
        half: bool = True,
    ):
        super().__init__()

        self.seq_root = Path(seq_root)
        self.transform = transform
        self.half = half

        # --- Nymeria providers -------------------------------------------------
        self.dp = NymeriaDataProvider(
            sequence_rootdir=self.seq_root, load_wrist=False, load_observer=False
        )
        self.rec_head: RecordingDataProvider = self.dp.recording_head
        assert self.rec_head and self.rec_head.has_rgb, "no RGB stream found"

        # --- VRS & calibration -------------------------------------------------
        self.rgb_sid = StreamId(AriaStream.camera_rgb.value)
        self.vrs_dp = self.rec_head.vrs_dp
        self._num_frames = self.vrs_dp.get_num_data(self.rgb_sid)

        self.cam_calib = (
            self.vrs_dp.get_device_calibration().get_camera_calib("camera-rgb")
        )
        w, h = self.cam_calib.get_image_size()
        if half:
            w, h = w // 2, h // 2
        self.img_size = (h, w)

    # ------------------------------------------------------------------ #
    def __len__(self) -> int:
        return self._num_frames

    # ------------------------------------------------------------------ #
    @staticmethod
    def _unique_joints(bones: np.ndarray) -> np.ndarray:
        """XSens bones → 23 distinct joint positions."""
        pts = np.zeros((23, 3), np.float32)
        for b, (child, parent) in enumerate(bones):
            pts[b + 1] = child
            if b == 0:
                pts[0] = parent
        return pts

    # ------------------------------------------------------------------ #
    def __getitem__(self, idx: int):
        # ---------- RGB & meta -----------------------------------------
        img_data, meta = self.vrs_dp.get_image_data_by_index(self.rgb_sid, idx)

        # -------- time‑code timestamp ----------------------------------
        if hasattr(meta, "time_code_timestamp_ns"):
            t_code_ns = meta.time_code_timestamp_ns
        elif hasattr(self.vrs_dp, "convert_device_time_to_timecode_time_ns"):
            t_code_ns = self.vrs_dp.convert_device_time_to_timecode_time_ns(
                meta.capture_timestamp_ns
            )
        else:  # fallback – device‑time
            t_code_ns = meta.capture_timestamp_ns

        # -------- ground‑truth poses -----------------------------------
        poses = self.dp.get_synced_poses(t_code_ns)
        bones = poses["xsens"]  # (22,2,3)
        joints_w = self._unique_joints(bones)  # (23,3)

        # -------- world → camera transform -----------------------------
        T_W_D: SE3 = poses["recording_head"].transform_world_device
        T_D_C: SE3 = self.cam_calib.get_transform_device_camera()

        R_WD = T_W_D.rotation().to_matrix()
        t_WD = T_W_D.translation().ravel()  # (3,)
        R_DC = T_D_C.rotation().to_matrix()
        t_DC = T_D_C.translation().ravel()  # (3,1)

        R_WC = R_WD @ R_DC
        t_WC = (R_WD @ t_DC).ravel() + t_WD

        # joints in camera frame
        joints_c = (R_WC.T @ (joints_w - t_WC).T).T  # (23,3)

        # -------- 2‑D projection ---------------------------------------
        uv = []
        for p in joints_c:
            pix = self.cam_calib.project(p)
            if pix is None:
                uv.append([-1.0, -1.0])
            else:
                u, v = pix
                if self.half:
                    u, v = u / 2.0, v / 2.0
                uv.append([u, v])
        joints_2d = torch.tensor(uv, dtype=torch.float32)  # (23,2)

        # -------- RGB tensor -------------------------------------------
        arr = img_data.to_numpy_array()
        if self.half:
            arr = arr[::2, ::2]
        frame = torch.from_numpy(arr).permute(2, 0, 1).float() / 255.0
        if self.transform:
            frame = self.transform(frame)

        return frame, torch.from_numpy(joints_c).float(), joints_2d

In [45]:
root    = Path(r"C:\Users\Damir\nymeria_dataset\d"
               r"\20230622_s0_john_solomon_act2_8urygm")
norm = Compose([Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5])])
ds      = NymeriaPoseDataset(root, transform=norm, half=True)
loader  = DataLoader(ds, batch_size=8, shuffle=True,
                     num_workers=4, pin_memory=True)

[32m2025-05-07 15:19:24.334[0m | [1mINFO    [0m | [36mnymeria.body_motion_provider[0m:[36m__init__[0m:[36m31[0m - [1mloading xsens from npzfile='C:\\Users\\Damir\\nymeria_dataset\\d\\20230622_s0_john_solomon_act2_8urygm\\body\\xdata.npz'[0m
[32m2025-05-07 15:19:24.808[0m | [1mINFO    [0m | [36mnymeria.body_motion_provider[0m:[36m__init__[0m:[36m34[0m - [1mk='segment_qWXYZ', v.shape=(144289, 92)[0m
[32m2025-05-07 15:19:24.809[0m | [1mINFO    [0m | [36mnymeria.body_motion_provider[0m:[36m__init__[0m:[36m34[0m - [1mk='segment_tXYZ', v.shape=(144289, 69)[0m
[32m2025-05-07 15:19:24.809[0m | [1mINFO    [0m | [36mnymeria.body_motion_provider[0m:[36m__init__[0m:[36m34[0m - [1mk='segment_velocity', v.shape=(144289, 69)[0m
[32m2025-05-07 15:19:24.810[0m | [1mINFO    [0m | [36mnymeria.body_motion_provider[0m:[36m__init__[0m:[36m34[0m - [1mk='segment_acceleration', v.shape=(144289, 69)[0m
[32m2025-05-07 15:19:24.811[0m | [1mINFO    [0

In [46]:
import matplotlib.pyplot as plt
import torchvision.transforms.functional as F

# ---- grab one sample (first frame) ---------------------------------
img_t, joints_3d, joints_2d = ds[0]          # (C,H,W), (23,3), (23,2)

# undo the Normalize() so colours look right
img_t_vis = F.normalize(img_t,
                        mean=[-1, -1, -1],   # invert the earlier (x-0.5)/0.5
                        std =[ 0.5,  0.5, 0.5])

img_np = img_t_vis.permute(1,2,0).numpy()    # → HWC in [0,1]

# ---- skeleton connectivity -----------------------------------------
parents = [
    -1,  # Pelvis has no parent
     0,1,2,3,4,5,4,7,8,9,4,11,12,13,0,15,16,17,0,19,20,21
]                                          # from XSensConstants.kintree_parents

# ---- draw -----------------------------------------------------------
fig,ax = plt.subplots(figsize=(6,6))
ax.imshow(img_np)
ax.axis('off')

# joints_2d is (u,v) ; split for convenience
u,v = joints_2d[:,0].numpy(), joints_2d[:,1].numpy()

# draw bones
for i,p in enumerate(parents):
    if p < 0:           # root
        continue
    if u[i] < 0 or u[p] < 0:   # one of the points was outside image
        continue
    ax.plot([u[i],u[p]], [v[i],v[p]], c='lime', lw=2)

# draw joints
ax.scatter(u[v>=0], v[v>=0], c='red', s=15)

plt.title("Frame 0 with projected XSens skeleton")
plt.show()




ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 3)