In [1]:
# os.chdir("hmr4d/")
from pathlib import Path
import numpy as np
from einops import einsum
import cv2
from PIL import Image
import lovely_tensors as lt
lt.monkey_patch()
import imageio.v3 as iio
import torch

from hmr4d.utils.preproc import Tracker, Extractor, VitPoseExtractor, SimpleVO
from hmr4d.utils.geo.hmr_cam import get_bbx_xys_from_xyxy, estimate_K, create_camera_sensor
from hmr4d.utils.geo.hmr_cam import compute_bbox_info_bedlam, compute_transl_full_cam, normalize_kp2d
from hmr4d.utils.geo_transform import compute_cam_angvel, apply_T_on_points, compute_T_ayfz2ay
from hmr4d.utils.body_model import BodyModelSMPLH, BodyModelSMPLX
from hmr4d.utils.vis.renderer import Renderer, get_global_cameras_static, get_ground_params_from_points

### 1. parse_args_to_cfg

In [2]:
# demo.py -> L41~55
# Run configuration; these names stand in for the demo.py command-line arguments.
video_path = Path("inputs/bodycam/Clip 1 1 Clip 1 2 Axon Body 4 Video 2024 12 06 1558 D01A61897 [P1-Va9VIZsI].webm")
output_root = Path("outputs/demo")
static_cam = False
use_dpvo = False
f_mm = None

# demo.py -> L60
# hmr4d.utils.video_io_utils.get_video_lwh
length, height, width, c = iio.improps(video_path, plugin="pyav").shape
if length == 0:
    # The container metadata reported zero frames, so ask OpenCV instead.
    # NOTE: `fps` is only bound on this fallback path.
    video = cv2.VideoCapture(str(video_path))
    try:
        if not video.isOpened():
            # Fail here rather than carrying a bogus length of -1 into later cells
            raise IOError(f"Cannot open video: {video_path}")
        fps = video.get(cv2.CAP_PROP_FPS)
        # NOTE(review): the -1 is kept from the original; presumably the reported count
        # over-estimates by one for this clip — confirm against the decoded frame count.
        length = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
    finally:
        # Always free the decoder handle, including when the open check raises
        video.release()
print(f"{video_path}: {width}x{height}x{length}")
# demo.py -> L64~79
# Cfg
# import hydra
# from hydra import compose, initialize_config_module
# from hmr4d.configs import store_gvhmr

# with initialize_config_module(version_base="1.3", config_module=f"hmr4d.configs"):
#     overrides = [
#         f"video_name={video_path.stem}",
#         f"static_cam={static_cam}",
#         f"verbose={True}",
#         f"use_dpvo={use_dpvo}",
#         f"f_mm={f_mm}",
#         f"output_root={output_root}",
#     ]
#     cfg = compose(config_name="demo", overrides=overrides)

inputs/bodycam/Clip 1 1 Clip 1 2 Axon Body 4 Video 2024 12 06 1558 D01A61897 [P1-Va9VIZsI].webm: 960x720x271


<details>
  <summary>CFG: Click to expand</summary>
'pipeline': <br/>
&emsp;'_target_': 'hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline' <br/>
&emsp;'args_denoiser3d': '${network}' <br/>
&emsp;'args': <br/>
&emsp;&emsp;'endecoder_opt': '${endecoder}' <br/>
&emsp;&emsp;'normalize_cam_angvel': True <br/>
&emsp;&emsp;'weights': None <br/>
&emsp;&emsp;'static_conf': None <br/>
'ckpt_path': 'inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt' <br/> 
'video_name': 'tennis' <br/>
'output_root': 'outputs/demo' <br/>
'output_dir': '${output_root}/${video_name}' <br/>
'preprocess_dir': '${output_dir}/preprocess' <br/>
'video_path': '${output_dir}/0_input_video.mp4' <br/>
'static_cam': False <br/>
'verbose': True <br/>
'use_dpvo': True <br/>
'f_mm': 'None' <br/>
'paths': {
    'bbx': '${preprocess_dir}/bbx.pt' <br/>
    'bbx_xyxy_video_overlay': '${preprocess_dir}/bbx_xyxy_video_overlay.mp4' <br/>
    'vit_features': '${preprocess_dir}/vit_features.pt' <br/>
    'vitpose': '${preprocess_dir}/vitpose.pt' <br/> 
    'vitpose_video_overlay': '${preprocess_dir}/vitpose_video_overlay.mp4' <br/>
    'hmr4d_results': '${output_dir}/hmr4d_results.pt' <br/>
    'incam_video': '${output_dir}/1_incam.mp4' <br/>
    'global_video': '${output_dir}/2_global.mp4' <br/>
    'incam_global_horiz_video': '${output_dir}/${video_name}_3_incam_global_horiz.mp4' <br/>
    'slam': '${preprocess_dir}/slam_results.pt'
}, 
'model': {'_target_': 'hmr4d.model.gvhmr.gvhmr_pl_demo.DemoPL', 'pipeline': '${pipeline}'} <br/> 
'network': {
    '_target_': 'hmr4d.network.gvhmr.relative_transformer.NetworkEncoderRoPE', 
    'output_dim': 151, 
    'max_len': 120, 
    'cliffcam_dim': 3, 
    'cam_angvel_dim': 6, 
    'imgseq_dim': 1024, 
    'latent_dim': 512, 
    'num_layers': 12, 
    'num_heads': 8, 
    'mlp_ratio': 4.0, 
    'pred_cam_dim': 3, 
    'static_conf_dim': 6, 
    'dropout': 0.1, 
    'avgbeta': True} <br/>
    'endecoder': { <br/>
    '_target_': 'hmr4d.model.gvhmr.utils.endecoder.EnDecoder', <br/>
    'stats_name': 'MM_V1_AMASS_LOCAL_BEDLAM_CAM', <br/>
    'noise_pose_k': 10} <br/>
</details>

### 2. Run Preprocess

In [3]:
# Detect and track people across the whole clip (the saved output labels this stage "YoloV8 Tracking").
# The one-call helper below is kept for reference; the two-step form exposes every track so the
# next cell can pick one by id.
tracker = Tracker()
# bbx_xyxy = tracker.get_one_track(video_path).float()  # (L, 4)
track_history = tracker.track(video_path)
# Per-track frame ids and boxes, looked up by track id in the next cell.
# NOTE(review): id_sorted is not read anywhere else in this notebook; presumably track ids ordered by length — confirm.
id_to_frame_ids, id_to_bbx_xyxys, id_sorted = tracker.sort_track_length(track_history, video_path)

YoloV8 Tracking: 0it [00:00, ?it/s]



YoloV8 Tracking: 271it [00:23, 11.39it/s]


In [5]:
from hmr4d.utils.net_utils import moving_average_smooth
from hmr4d.utils.seq_utils import get_frame_id_list_from_mask, linear_interpolate_frame_ids

# Track to reconstruct.
# NOTE(review): hard-coded id; `id_sorted` from the tracking cell is unused and presumably
# lists ids by track length — confirm which person is wanted.
track_id = 1
frame_ids = torch.tensor(id_to_frame_ids[track_id])  # (N,)
bbx_xyxys = torch.tensor(id_to_bbx_xyxys[track_id])  # (N, 4)

# Inline stand-in for frame_id_to_mask(frame_ids, length): True on frames where this track was detected
mask = torch.zeros(length, dtype=torch.bool)
mask[frame_ids] = True

# Inline stand-in for rearrange_by_mask(bbx_xyxys, mask): (F, 4), frames without a detection stay 0
bbx_xyxy_one_track = bbx_xyxys.new_zeros((length, 4))
bbx_xyxy_one_track[mask] = bbx_xyxys

# Fill the undetected runs by linear interpolation
missing_frame_id_list = get_frame_id_list_from_mask(~mask)  # list of list
bbx_xyxy_one_track = linear_interpolate_frame_ids(bbx_xyxy_one_track, missing_frame_id_list)

# Two successive passes of a 5-frame moving average along the time axis
bbx_xyxy_one_track = moving_average_smooth(
    moving_average_smooth(bbx_xyxy_one_track, window_size=5, dim=0),
    window_size=5,
    dim=0,
)
bbx_xyxy = bbx_xyxy_one_track
bbx_xys = get_bbx_xys_from_xyxy(bbx_xyxy, base_enlarge=1.2).float()  # (L, 3) apply aspect ratio and enlarge

In [6]:
from hmr4d.utils.video_io_utils import read_video_np, save_video
from hmr4d.utils.vis.cv2_utils import draw_bbx_xyxy_on_image_batch

# Visual sanity check of the smoothed track: draw bbx_xyxy on the frames and save the result.
# NOTE(review): read_video_np presumably decodes the whole clip into memory — confirm for long videos.
video = read_video_np(video_path)
# Stack the per-frame overlays into one array (indexed as [frame, :, :, :] by the preview cell).
video_overlay = np.stack(draw_bbx_xyxy_on_image_batch(bbx_xyxy, video), axis=0)
# Written to the current working directory, not under output_root.
save_video(video_overlay, 'tmp.mp4')

In [None]:
from IPython.display import HTML
from matplotlib import animation
from matplotlib import pyplot as plt

# Inline preview of `video_overlay`, the bbox overlay stacked along axis 0 (one entry per frame).
fig = plt.figure()
im = plt.imshow(video_overlay[0])
plt.close()  # close the figure so the first frame is not also shown as a static plot


def animate(i):
    """Show overlay frame ``i`` in the image artist and return the artists that changed."""
    im.set_data(video_overlay[i])
    return [im]


# One animation step per overlay frame; 50 ms between frames = 20 fps
anim = animation.FuncAnimation(fig, animate, frames=len(video_overlay), interval=50)
HTML(anim.to_html5_video())

In [None]:
# # demo.py -> L110~112
# # Bbox Tracking (Yolov8x)

# tracker = Tracker()
# # bbx_xyxy = tracker.get_one_track(video_path).float()  # (L, 4)
# track_history = tracker.track(video_path)
# id_to_frame_ids, id_to_bbx_xyxys, id_sorted = tracker.sort_track_length(track_history, video_path)

# bbx_xys = get_bbx_xys_from_xyxy(bbx_xyxy, base_enlarge=1.2).float()  # (L, 3) apply aspect ratio and enlarge

YoloV8 Tracking: 271it [00:11, 22.77it/s]


IndexError: index is out of bounds for dimension with size 0

In [46]:
# demo.py -> L126~127
# Keypoint Extracting (ViTPose)
# Takes the video and the per-frame boxes `bbx_xys`; the result is later normalised against the
# same boxes (normalize_kp2d) to form the "obs" input of the network.
# NOTE(review): this cell's saved execution count is out of sequence with its neighbours —
# re-run the notebook top to bottom before trusting the saved outputs.

vitpose_extractor = VitPoseExtractor()
vitpose = vitpose_extractor.extract(str(video_path), bbx_xys) # (L, 17, 3)

ViTPose: 100%|██████████| 17/17 [00:11<00:00,  1.49it/s]


In [7]:
# demo.py -> 140~141
# Per-frame image features computed from the video and the boxes `bbx_xys`
# (the saved progress bar labels this stage "HMR2 Feature").
# Consumed later as the "f_imgseq" conditioning input of the network.

extractor = Extractor()
vit_features = extractor.extract_video_features(str(video_path), bbx_xys) # (L, 1024)

  return F.conv2d(input, weight, bias, self.stride,
HMR2 Feature: 100%|██████████| 70/70 [00:23<00:00,  2.99it/s]


In [8]:
# demo.py -> 151~152
# Camera rotation from visual odometry, plus per-frame intrinsics.
# `f_mm` is read from the configuration cell so the focal-length setting is honoured in one
# place; while f_mm is None the results are identical to the previous hard-coded version.

simple_vo = SimpleVO(video_path, scale=0.5, step=8, method="sift", f_mm=f_mm)
vo_results = simple_vo.compute() # (L, 4, 4)
# Keep only the rotation part of each 4x4 result
R_w2c = torch.from_numpy(vo_results[:, :3, :3]) # (L, 3, 3)
if f_mm is None:
    # No focal length given: use the estimate derived from the image size
    K_fullimg = estimate_K(width, height).repeat(length, 1, 1) # (L, 3, 3)
else:
    # Focal length given: build K from it. create_camera_sensor returns K as its third value
    # (the global-view renderer cell unpacks it the same way).
    # NOTE(review): assumes that K is a (3, 3) tensor like estimate_K's result — confirm.
    K_fullimg = create_camera_sensor(width, height, f_mm)[2].repeat(length, 1, 1) # (L, 3, 3)

[SimpleVO] Choosen frames shape: (140, 1080, 1920, 3)
TwoViewGeometryOptions:
    min_num_inliers = 10
    min_E_F_inlier_ratio = 0.8
    max_H_inlier_ratio = 0.9
    watermark_min_inlier_ratio = 0.7
    watermark_border_size = 0.1
    detect_watermark = True
    multiple_ignore_watermark = True
    watermark_detection_max_error = 4.0
    filter_stationary_matches = False
    stationary_matches_max_error = 4.0
    force_H_use = False
    compute_relative_pose = True
    multiple_models = False
    ransac: RANSACOptions:
        max_error = 4.0
        min_inlier_ratio = 0.25
        confidence = 0.999
        dyn_num_trials_multiplier = 3.0
        min_num_trials = 100
        max_num_trials = 10000
        random_seed = -1


100%|██████████| 139/139 [02:02<00:00,  1.14it/s]


### 3. Model Inference

In [16]:
from hmr4d.network.gvhmr.relative_transformer import NetworkEncoderRoPE
from hmr4d.model.gvhmr.utils.endecoder import EnDecoder
from hmr4d.model.gvhmr.pipeline.gvhmr_pipeline import get_smpl_params_w_Rt_v2
from hmr4d.model.gvhmr.utils.postprocess import (
    pp_static_joint,
    process_ik,
    pp_static_joint_cam,
)

In [None]:
# from hmr4d.model.gvhmr.gvhmr_pl_demo import DemoPL
# model: DemoPL = hydra.utils.instantiate(cfg.model, _recursive_=False)
# model.load_pretrained_model(cfg.ckpt_path)
# model = model.eval().cuda()

# Build the two components directly instead of instantiating the Lightning wrapper above.
denoiser3d = NetworkEncoderRoPE().eval().cuda()
endecoder = EnDecoder(stats_name="MM_V1_AMASS_LOCAL_BEDLAM_CAM").eval().cuda()

# Normalisation statistics applied to the camera angular velocity before it is fed to the network
cam_angvel_mean = torch.tensor([1., 0., 0., 0., 1., 0.], device='cuda')
cam_angvel_std = torch.tensor( [1e-3, 0.1, 0.1, 0.1, 1e-3, 0.1], device='cuda')

# SECURITY: torch.load unpickles the file, which can execute arbitrary code — only load
# checkpoints from a trusted source.
# map_location='cpu' keeps the full checkpoint off the GPU; load_state_dict copies the
# selected tensors into the (already CUDA) module parameters.
state_dict = torch.load('inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt', map_location='cpu')['state_dict']

# Keep only the denoiser weights and strip their prefix. Matching on the prefix (rather than
# on a substring) cannot pick up or rewrite a key that merely contains the text mid-name.
denoiser_prefix = "pipeline.denoiser3d."
transf_state_dict = {
    key[len(denoiser_prefix):]: value
    for key, value in state_dict.items()
    if key.startswith(denoiser_prefix)
}
# Last expression on purpose: the notebook displays the key-matching report
denoiser3d.load_state_dict(transf_state_dict)

[[36m12/12 14:09:17[0m][[32mINFO[0m] [EnDecoder] Use MM_V1_AMASS_LOCAL_BEDLAM_CAM for statistics![0m


<All keys matched successfully>

In [None]:
# from pytorch_lightning.utilities.memory import recursive_detach

# pred = model.predict({
#     "length": torch.tensor(length),
#     "bbx_xys": bbx_xys,
#     "kp2d": vitpose,
#     "K_fullimg": K_fullimg,
#     "cam_angvel": compute_cam_angvel(R_w2c),
#     "f_imgseq": vit_features,
# }, static_cam=static_cam)
# pred = recursive_detach(pred)

In [None]:

# Conditioning inputs for the network; every entry carries a leading batch axis (B = 1).
cliff_cam = compute_bbox_info_bedlam(bbx_xys[None], K_fullimg[None]).to('cuda')  # (B, L, 3)
# Kept unbatched on purpose: the post-processing cell adds the batch axis itself (f_cam_angvel[None])
f_cam_angvel = compute_cam_angvel(R_w2c).to('cuda')
f_condition = {
    "obs": normalize_kp2d(vitpose, bbx_xys)[None].to('cuda'),  # (B, L, J, 3)
    "f_cliffcam": cliff_cam,  # (B, L, 3)
    # Normalise, then add the batch axis explicitly so this entry matches the others instead
    # of relying on broadcasting inside the network
    "f_cam_angvel": ((f_cam_angvel - cam_angvel_mean) / cam_angvel_std)[None],  # (B, L, C=6)
    "f_imgseq": vit_features[None].to('cuda'),  # (B, L, C=1024)
}

In [13]:
# Forward pass with autograd disabled (inference only).
with torch.no_grad():
    # `length` is passed as a 1-element tensor alongside the batched conditioning inputs
    model_output = denoiser3d(length=torch.tensor([length], device='cuda'), **f_condition)  # pred_x, pred_cam, static_conf_logits
    # Decode "pred_x" into named quantities; later cells read body_pose, betas, global_orient,
    # global_orient_gv and local_transl_vel from this dict
    decode_dict = endecoder.decode(model_output["pred_x"]) 

In [19]:
def _first_in_batch(params):
    """Return a copy of a parameter dict with index 0 taken from every value (drops the batch axis)."""
    return {name: value[0] for name, value in params.items()}


# Post-processing
# In-camera parameters: body_pose (B, L, 63), betas (B, L, 10), global_orient (B, L, 3),
# plus the translation computed from the predicted camera, the boxes and the intrinsics.
smpl_params_incam = {name: decode_dict[name] for name in ("body_pose", "betas", "global_orient")}
smpl_params_incam["transl"] = compute_transl_full_cam(
    model_output["pred_cam"], bbx_xys.to('cuda'), K_fullimg.to('cuda')
)

pred_smpl_params_global = get_smpl_params_w_Rt_v2(  # This function has for-loop
    global_orient_gv=decode_dict["global_orient_gv"],
    local_transl_vel=decode_dict["local_transl_vel"],
    global_orient_c=decode_dict["global_orient"],
    cam_angvel=f_cam_angvel[None],
)

outputs = {
    "pred_smpl_params_incam": smpl_params_incam,
    "pred_smpl_params_global": {
        "body_pose": decode_dict["body_pose"],
        "betas": decode_dict["betas"],
        **pred_smpl_params_global,
    },
    "static_conf_logits": model_output["static_conf_logits"],
}

# Global translation; with a static camera use the extra post-processing that utilizes that prior
refine_transl = pp_static_joint_cam if static_cam else pp_static_joint
outputs["pred_smpl_params_global"]["transl"] = refine_transl(outputs, endecoder)

# The pose returned by process_ik replaces the decoded body pose everywhere it is stored
body_pose = process_ik(outputs, endecoder)
decode_dict["body_pose"] = body_pose
for params_key in ("pred_smpl_params_global", "pred_smpl_params_incam"):
    outputs[params_key]["body_pose"] = body_pose

# Final single-sequence prediction consumed by the rendering cells
pred = {
    "smpl_params_global": _first_in_batch(outputs["pred_smpl_params_global"]),
    "smpl_params_incam": _first_in_batch(outputs["pred_smpl_params_incam"]),
}

### 4. Render InCamera View

In [18]:
# SMPL model: in this notebook only its face topology is read
smpl = BodyModelSMPLH(
    model_path="inputs/checkpoints/body_models",
    model_type="smpl",
    gender="neutral",
    num_betas=10,
    create_body_pose=False,
    create_betas=False,
    create_global_orient=False,
    create_transl=False,
).cuda()
faces_smpl = smpl.faces

# SMPL-X model: evaluated on the predicted parameters in the rendering cells
smplx = BodyModelSMPLX(
    model_path="inputs/checkpoints/body_models",
    model_type="smplx",
    gender="neutral",
    num_pca_comps=12,
    flat_hand_mean=False,
).cuda()

# Lookup tensors shipped with the repository: SMPL-X -> SMPL vertex map and SMPL joint regressor
smplx2smpl = torch.load("hmr4d/utils/body_model/smplx2smpl_sparse.pt").cuda()
J_regressor = torch.load("hmr4d/utils/body_model/smpl_neutral_J_regressor.pt").cuda()

In [20]:
# In-camera view: evaluate SMPL-X on the in-camera parameters, then map each frame's
# vertices through smplx2smpl so they match the SMPL faces used for rendering
smplx_out = smplx(**pred["smpl_params_incam"])
pred_c_verts = torch.stack([smplx2smpl @ frame_verts for frame_verts in smplx_out.vertices])

# Renderer over the source image size, using the first frame's intrinsics
renderer_c = Renderer(width, height, device="cuda", faces=faces_smpl, K=K_fullimg[0])


In [21]:
# Global view: evaluate SMPL-X on the global parameters and map to SMPL vertices
smplx_out = smplx(**pred["smpl_params_global"])
pred_ay_verts = torch.stack([smplx2smpl @ frame_verts for frame_verts in smplx_out.vertices])
pred_ay_verts = pred_ay_verts.clone()  # (L, V, 3)

# Origin shift: first regressed joint of frame 0, with its second component replaced by the
# minimum of that component over all vertices and frames
first_frame_joints = einsum(J_regressor, pred_ay_verts[0], "j v, v i -> j i")
offset = first_frame_joints[0]  # (3)
offset[1] = pred_ay_verts[..., 1].min()
pred_gb_verts = pred_ay_verts - offset

# face direction
first_frame_gb_joints = einsum(J_regressor, pred_gb_verts[:1], "j v, l v i -> l j i")
T_ay2ayfz = compute_T_ayfz2ay(first_frame_gb_joints, inverse=True)
pred_gb_verts = apply_T_on_points(pred_gb_verts, T_ay2ayfz)

# Joints of the final global vertices drive the camera placement
pred_gb_joints = einsum(J_regressor, pred_gb_verts, "j v, l v i -> l j i")  # (L, J, 3)
global_R, global_T, global_lights = get_global_cameras_static(
    pred_gb_joints.cpu(),
    beta=2.0,
    cam_height_degree=20,
    target_center_height=1.0,
)

# Global-view renderer with intrinsics from create_camera_sensor(width, height, 24)
_, _, K = create_camera_sensor(width, height, 24)
renderer_g = Renderer(width, height, device="cuda", faces=faces_smpl, K=K)

# -- render mesh -- #
# Ground plane sized from the first joint's trajectory and the vertices, enlarged by 1.5
scale, cx, cz = get_ground_params_from_points(pred_gb_joints[:, 0], pred_gb_verts)
renderer_g.set_ground(scale * 1.5, cx, cz)
color = 0.8 * torch.ones(3).float().cuda()


In [22]:
# Frame to preview
i = 49

# Source frame and the in-camera mesh drawn over it
img_raw = iio.imread(video_path, index=i)
incam_verts = pred_c_verts[i].cuda()
img_cam = renderer_c.render_mesh(incam_verts, img_raw, [0.8, 0.8, 0.8])

# The same frame from the global camera, rendered with the ground plane
cameras = renderer_g.create_camera(global_R[i], global_T[i])
img_gb = renderer_g.render_with_ground(pred_gb_verts[[i]], color[None], cameras, global_lights)

# Side by side: input | in-camera | global
panels = [img_raw, img_cam, img_gb]
Image.fromarray(np.concatenate(panels, axis=1))