In [1]:
import os, sys
# Limit the process to GPUs 2 and 3. The variable is set before `import torch` below so that
# it is already in the environment when CUDA gets initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
from pathlib import Path
from tqdm import tqdm
import numpy as np
import cv2
from PIL import Image
import lovely_tensors as lt
# Presumably switches tensor reprs to the compact one-line summaries seen in the cell outputs.
lt.monkey_patch()
import imageio.v3 as iio
import torch

from hmr4d.dataset.pure_motion.utils import augment_betas, interpolate_smpl_params, rotate_around_axis
from hmr4d.dataset.pure_motion.cam_traj_utils import CameraAugmentorV11
from hmr4d.utils.body_model import BodyModelSMPLH, BodyModelSMPLX
from hmr4d.utils.body_model.smplx_lite import SmplxLiteSmplN24
from hmr4d.utils.geo.hmr_global import get_c_rootparam, get_R_c2gv, get_tgtcoord_rootparam, get_T_w2c_from_wcparams
from hmr4d.utils.geo.hmr_cam import create_camera_sensor
from hmr4d.utils.geo_transform import compute_cam_angvel, apply_T_on_points, move_to_start_point_face_z
from hmr4d.utils.net_utils import get_valid_mask
from hmr4d.utils.wis3d_utils import convert_motion_as_line_mesh
from hmr4d.utils.video_io_utils import read_video_np, save_video, get_writer
from hmr4d.utils.vis.renderer import Renderer, get_global_cameras_static, get_ground_params_from_points
from hmr4d.utils.vis.renderer_utils import simple_render_mesh

# Device the body models and regressors are moved to. With the visibility mask above,
# index 0 is the first *visible* GPU rather than physical GPU 0.
device = 'cuda:0'

In [2]:
# ----- Body models and conversion assets ----- #
body_model_dir = "inputs/checkpoints/body_models"

# Neutral SMPL model; no per-instance parameter buffers are created, so every call
# has to pass pose / shape / translation explicitly.
smpl_kwargs = dict(
    model_path=body_model_dir,
    model_type="smpl",
    gender="neutral",
    num_betas=10,
    create_body_pose=False,
    create_betas=False,
    create_global_orient=False,
    create_transl=False,
)
smpl = BodyModelSMPLH(**smpl_kwargs).to(device)

# Neutral SMPL-X model, used for vertices / joints throughout the notebook.
smplx_kwargs = dict(
    model_path=body_model_dir,
    model_type="smplx",
    gender="neutral",
    num_pca_comps=12,
    flat_hand_mean=False,
)
smplx = BodyModelSMPLX(**smplx_kwargs).to(device)

# Matrix applied to SMPL-X vertices (see the global-render cell) to obtain SMPL vertices.
smplx2smpl = torch.load("hmr4d/utils/body_model/smplx2smpl_sparse.pt").to(device)
faces_smpl = smpl.faces
J_regressor = torch.load("hmr4d/utils/body_model/smpl_neutral_J_regressor.pt").to(device)

# Lightweight joint model; it is left on its default device (not moved to `device`).
smplx_lite = SmplxLiteSmplN24()

In [3]:
# Pasted loader log lines, kept as a reference for the size of each dataset:
# [12/23 15:45:21][INFO] [AMASS] 18086 sequences. Elapsed: 2.35s
# [12/23 15:45:21][INFO] [AMASS] has 64.7 hours motion -> Resampled to 52788 samples.
# [12/23 15:40:11][INFO] [BEDLAM] 37537 sequences. 
# [12/23 15:40:24][INFO] [H36M] 600 sequences. Elapsed: 0.61s
# [12/23 15:40:25][INFO] [H36M] has 8.7 hours motion -> Resampled to 6196 samples.
# [12/23 15:46:37][INFO] [3DPW] has 7.5 minutes motion -> Resampled to 88 samples.

# All four dataset classes are imported, but only the H36M one is instantiated below.
from hmr4d.dataset.pure_motion.amass import AmassDataset #52,788 samples
from hmr4d.dataset.bedlam.bedlam import BedlamDatasetV2 #37,537 samples
from hmr4d.dataset.h36m.h36m import H36mSmplDataset #6,196 samples
from hmr4d.dataset.threedpw.threedpw_motion_train import ThreedpwSmplDataset #88 samples

# Constructed with default arguments; this loads the H36M support files (see the log output).
dataset = H36mSmplDataset()

[[36m12/23 22:58:30[0m][[32mINFO[0m] [H36M] Loading from inputs/H36M/hmr4d_support/smplxpose_v1.pt ...[0m
[[36m12/23 22:58:30[0m][[32mINFO[0m] [H36M] 600 sequences. Elapsed: 0.67s[0m
[[36m12/23 22:58:30[0m][[32mINFO[0m] [H36M] Fully Loading to RAM ViT-Feat: inputs/H36M/hmr4d_support/vitfeat_h36m.pt[0m
[[36m12/23 22:58:32[0m][[32mINFO[0m] [H36M] Finished. Elapsed: 1.65s[0m
[[36m12/23 22:58:32[0m][[32mINFO[0m] [H36M] has 8.7 hours motion -> Resampled to 6196 samples.[0m


In [4]:
# ----- Render one H36M sample in camera space as a plain grey mesh video ----- #
batch = dataset[500]
length = batch['length']

K = batch['K_fullimg'][0].to(device)
# Image size is recovered as twice the principal point.
# NOTE(review): assumes (cx, cy) sits exactly at the image centre — confirm.
width, height = int(K[0,2])*2, int(K[1,2])*2
smpl_params_c = {k:v.to(device) for k,v in batch['smpl_params_c'].items()}
verts = smplx(**smpl_params_c).vertices

renderer_c = Renderer(width, height, device="cuda", faces=smplx.faces, K=K)

writer = get_writer('tmp.mp4', fps=30, crf=23)
try:
    for i in tqdm(range(length)):
        # No background image is passed; the mesh is drawn in uniform light grey.
        img = renderer_c.render_mesh(verts[i], None, [0.8, 0.8, 0.8])
        writer.write_frame(img)
finally:
    # Always finalise the file, even if rendering a frame raises.
    writer.close()

100%|██████████| 120/120 [00:03<00:00, 38.56it/s]


In [None]:
# ----- Render the world-frame skeleton through an augmented camera trajectory ----- #
# NOTE: this cell reads `smpl_params_w`, which is only defined by cells further down
# (the AMASS / BEDLAM batch-construction cells). Run one of those first.
w_j3d = smplx(**{k:v.to(device) for k,v in smpl_params_w.items()}).joints.cpu()
# Take the frame count from the data instead of hard-coding 120, so the cell also works
# for sequences of a different length.
num_frames = w_j3d.shape[0]

width, height, K_fullimg = create_camera_sensor(1000, 1000, 24) 
wham_cam_augmentor = CameraAugmentorV11()
T_w2c = wham_cam_augmentor(w_j3d, num_frames) 

# Only the first 22 joints are moved into the camera frame and drawn as a line mesh.
c_j3d = apply_T_on_points(w_j3d[:,:22], T_w2c)
verts, faces, vertex_colors = convert_motion_as_line_mesh(c_j3d)
vertex_colors = vertex_colors[None] / 255.0
bg = np.ones((height, width, 3), dtype=np.uint8) * 255  # white background
renderer = Renderer(width, height, device="cuda", faces=faces, K=K_fullimg)
writer = get_writer('tmp.mp4', fps=30, crf=23)
try:
    for i in tqdm(range(num_frames), desc="Rendering Camera"):
        img_overlay_pred = renderer.render_mesh(verts[i].cuda(), bg, vertex_colors, VI=1)
        writer.write_frame(img_overlay_pred)
finally:
    # Always finalise the file, even if rendering a frame raises.
    writer.close()

In [63]:
### AMASS Train Dataset --Load Dataset-- ###

motion_frames_len = 120
l_factor = 1.5

motion_files = torch.load("inputs/AMASS/hmr4d_support/smplxpose_v2.pth")

# Keep every sequence that is not a 'moyo_smplxn' entry and has at least 25 frames.
seqs = {}
for name, motion in motion_files.items():
    if 'moyo_smplxn' in name or motion['pose'].shape[0] < 25:
        continue
    seqs[name] = motion
print(f"Total motion files: {len(seqs):,}")

# Each sequence id is listed once per `motion_frames_len` frames it contains (at least once),
# so longer sequences appear more often in the index.
frames_per_seq = {vid: seq["pose"].shape[0] for vid, seq in seqs.items()}
idx2meta = [
    vid
    for vid, n_frames in frames_per_seq.items()
    for _ in range(max(n_frames // motion_frames_len, 1))
]
# Despite its name this holds a frame count; the print below converts it using 30 * 3600.
hours = sum(frames_per_seq.values())
print(f"{hours / (30*3600):.1f} hours motion -> Resampled to {len(idx2meta):,} samples.")

Total motion files: 17,896
64.7 hours motion -> Resampled to 52,788 samples.


In [64]:
### AMASS Train Dataset --Load Data-- ###
idx = 10
np.random.seed(42)

mid = idx2meta[idx]
raw_data = seqs[mid]
raw_pose = raw_data["pose"]
raw_len = raw_pose.shape[0]

# Draw the raw clip length from [motion_frames_len / l_factor, motion_frames_len * l_factor).
shortest = int(motion_frames_len / l_factor)
longest = int(motion_frames_len * l_factor)
raw_subset_len = np.random.randint(shortest, longest)

if raw_subset_len <= raw_len:
    # The clip fits: pick a random window of that length.
    start = np.random.randint(0, raw_len - raw_subset_len + 1)
    end = start + raw_subset_len
else:
    # The sequence is shorter than the drawn length: use all of it.
    start, end = 0, raw_len
print(f"{'/'.join(Path(mid).parts[2:])} : {raw_len} -> {start} ~ {end} (len={end-start})")

clip = slice(start, end)
data = {
    "body_pose": raw_pose[clip, 3:],  # (F, 63)
    "betas": raw_data["beta"].repeat(end-start, 1),  # (F, 10) after repeating per frame
    "global_orient": raw_pose[clip, :3],  # (F, 3)
    "transl": raw_data["trans"][clip, :3],  # (F, 3)
    "data_name" : "amass"
}
# Resample the clip to exactly `motion_frames_len` frames (see the displayed shapes).
data = interpolate_smpl_params(data, motion_frames_len)
# Re-express the root orientation / translation with the "az->ay" transform.
root_orient_ay, root_transl_ay, _ = get_tgtcoord_rootparam(
    data["global_orient"], data["transl"], tsf="az->ay",
)
data["global_orient"] = root_orient_ay
data["transl"] = root_transl_ay
data

smplxn_raw/Transitions/Transitions/mazen_c3d/airkick_longjump_stageii.npz : 306 -> 92 ~ 223 (len=131)


{'body_pose': tensor[120, 63] n=7560 (30Kb) x∈[-1.788, 1.745] μ=0.031 σ=0.302,
 'betas': tensor[120, 10] n=1200 (4.7Kb) x∈[-4.130, 2.712] μ=-0.422 σ=1.990,
 'global_orient': tensor[120, 3] n=360 (1.4Kb) x∈[-0.265, 0.419] μ=-0.003 σ=0.138,
 'transl': tensor[120, 3] n=360 (1.4Kb) x∈[-0.429, 2.674] μ=0.662 σ=0.885}

In [65]:
# ----- Shape and heading augmentation (world frame) ----- #
betas = augment_betas(data["betas"], std=0.1)
global_orient_w, transl_w = rotate_around_axis(data["global_orient"], data["transl"], axis="y")
smpl_params_w = {
    'body_pose': data["body_pose"],
    'betas': betas,
    'global_orient': global_orient_w,
    'transl': transl_w,
}
num_frames = data["body_pose"].shape[0]

## Camera Trajectory Augmentation
# Joints are evaluated on every 10th frame only and then repeated to full length
# before the per-frame translation is added.
joint_stride = 10
w_j3d_strided = smplx_lite(
    data["body_pose"][::joint_stride],
    betas[::joint_stride],
    global_orient_w[::joint_stride],
    None,
)
w_j3d = w_j3d_strided.repeat_interleave(joint_stride, dim=0) + transl_w[:, None]  # (F, 24, 3)
width, height, K_fullimg = create_camera_sensor(1000, 1000, 24) 
wham_cam_augmentor = CameraAugmentorV11()
T_w2c = wham_cam_augmentor(w_j3d, motion_frames_len) 

# Root parameters expressed in the camera frame; `offset` is the first skeleton entry
# for the first frame's betas.
offset = smplx.get_skeleton(betas[0].to(device))[0]  # (3)
global_orient_c, transl_c = get_c_rootparam(
    global_orient_w,
    transl_w,
    T_w2c,
    offset.cpu(),
)
smpl_params_c = dict(
    body_pose=smpl_params_w["body_pose"].clone(),  # (F, 63)
    betas=smpl_params_w["betas"].clone(),  # (F, 10)
    global_orient=global_orient_c,  # (F, 3)
    transl=transl_c,  # (F, 3)
)

# World Params
gravity_vec = torch.tensor([0, -1, 0], dtype=torch.float32)  # (3), gravity along -y (ay convention)
cam_rot = T_w2c[:, :3, :3]
R_c2gv = get_R_c2gv(cam_rot, gravity_vec)  # (F, 3, 3)

K_fullimg = K_fullimg.repeat(motion_frames_len, 1, 1)  # (F, 3, 3)
cam_angvel = compute_cam_angvel(cam_rot)  # (F, 6)

# AMASS provides motion only, so every image-derived entry is a zero placeholder
# and the matching mask flags are False.
valid_mask = get_valid_mask(num_frames, num_frames)
batch = {
    "meta": {"data_name": "amass", "idx": idx, "T_w2c": T_w2c},
    "length": num_frames,
    "smpl_params_c": smpl_params_c,
    "smpl_params_w": smpl_params_w,
    "R_c2gv": R_c2gv,  # (F, 3, 3)
    "gravity_vec": gravity_vec,  # (3)
    "bbx_xys": torch.zeros((num_frames, 3)),  # (F, 3)  # NOTE: a placeholder
    "K_fullimg": K_fullimg,  # (F, 3, 3)
    "f_imgseq": torch.zeros((num_frames, 1024)),  # (F, D)  # NOTE: a placeholder
    "kp2d": torch.zeros(num_frames, 17, 3),  # (F, 17, 3)
    "cam_angvel": cam_angvel,  # (F, 6)
    "mask": {
        "valid": valid_mask,
        "vitpose": False,
        "bbx_xys": False,
        "f_imgseq": False,
        "spv_incam_only": False,
    },
}


In [66]:
# ----- Render the world-frame skeleton through an augmented camera trajectory ----- #
w_j3d = smplx(**{k:v.to(device) for k,v in smpl_params_w.items()}).joints.cpu()

# NOTE: `T_w2c`, `width`, `height` and `K_fullimg` are rebound here. The trajectory shown in
# the video is a new call to the augmentor, not the one stored in batch["meta"]["T_w2c"].
width, height, K_fullimg = create_camera_sensor(1000, 1000, 24) 
wham_cam_augmentor = CameraAugmentorV11()
T_w2c = wham_cam_augmentor(w_j3d, motion_frames_len) 

# Only the first 22 joints are moved into the camera frame and drawn as a line mesh.
c_j3d = apply_T_on_points(w_j3d[:,:22], T_w2c)
verts, faces, vertex_colors = convert_motion_as_line_mesh(c_j3d)
vertex_colors = vertex_colors[None] / 255.0
bg = np.ones((height, width, 3), dtype=np.uint8) * 255  # white background
renderer = Renderer(width, height, device="cuda", faces=faces, K=K_fullimg)
writer = get_writer('tmp.mp4', fps=30, crf=23)
try:
    for i in tqdm(range(motion_frames_len), desc="Rendering Camera"):
        img_overlay_pred = renderer.render_mesh(verts[i].cuda(), bg, vertex_colors, VI=1)
        writer.write_frame(img_overlay_pred)
finally:
    # Always finalise the file, even if rendering a frame raises.
    writer.close()

Rendering Camera:   0%|          | 0/120 [00:00<?, ?it/s]

Rendering Camera: 100%|██████████| 120/120 [00:01<00:00, 66.48it/s]


In [None]:
# ----- Render the world-frame motion from a static global camera, with a ground plane ----- #
smplx_out = smplx(
    body_pose=data["body_pose"].to(device),  # (F, 63)
    betas=betas.to(device),  # (F, 10)
    global_orient=global_orient_w.to(device),  # (F, 3)
    transl=transl_w.to(device),  # (F, 3)
)
# Map SMPL-X vertices to SMPL vertices frame by frame, then re-anchor the sequence.
pred_ay_verts = torch.stack([torch.matmul(smplx2smpl, v_) for v_ in smplx_out.vertices])
pred_gb_verts, pred_gb_joints = move_to_start_point_face_z(pred_ay_verts, J_regressor)

global_R, global_T, global_lights = get_global_cameras_static(
    pred_gb_joints.cpu(), beta=2.0, cam_height_degree=20, target_center_height=1.0,
)
# `width` / `height` come from an earlier cell; note that `K` is rebound here.
_, _, K = create_camera_sensor(width, height, 24)
renderer_g = Renderer(width, height, device="cuda", faces=faces_smpl, K=K)

# -- render mesh -- #
scale, cx, cz = get_ground_params_from_points(pred_gb_joints[:, 0], pred_gb_verts)
renderer_g.set_ground(scale * 1.5, cx, cz)
color = torch.ones(3).float().cuda() * 0.8  # uniform light grey

writer = get_writer('tmp.mp4', fps=30, crf=23)
try:
    for i in tqdm(range(motion_frames_len), desc="Rendering Global"):
        cameras = renderer_g.create_camera(global_R[i], global_T[i])
        # `[[i]]` keeps a leading batch dimension of 1 for the renderer.
        img_gb = renderer_g.render_with_ground(pred_gb_verts[[i]], color[None], cameras, global_lights)
        writer.write_frame(img_gb)
finally:
    # Always finalise the file, even if rendering a frame raises.
    writer.close()

In [67]:
### BEDLAM Train Dataset --Load Dataset-- ###

root = Path("inputs/BEDLAM/hmr4d_support")
min_motion_frames = 60
max_motion_frames = 120
lazy_load=True
random1024=False

mid_to_valid_range = {}
mid_to_imgfeat_dir = {}

# Two subsets are merged in this order; for each motion id the valid frame range and the
# directory that holds its image features are recorded. A later subset overrides an
# earlier one for any id they share.
for range_file, feat_subdir in (
    ("mid_to_valid_range_all60.pt", "imgfeats/bedlam_all60"),
    ("mid_to_valid_range_maxspan60.pt", "imgfeats/bedlam_maxspan60"),
):
    mid_to_valid_range_ = torch.load(root / range_file)
    mid_to_valid_range.update(mid_to_valid_range_)
    for mid in mid_to_valid_range_:
        mid_to_imgfeat_dir[mid] = root / feat_subdir

motion_files = torch.load(root / "smplpose_v2.pth")
idx2meta = list(mid_to_valid_range)

print(f"Total motion files: {len(idx2meta):,}")


Total motion files: 37,537


In [88]:
idx = 30
np.random.seed(42)

mid = idx2meta[idx]
data = motion_files[mid].copy()

# Random select a subset
range1, range2 = mid_to_valid_range[mid]  # [range1, range2)
mlength = range2 - range1
if mlength >= min_motion_frames:
    # Long enough: draw a length in [min_motion_frames, min(max_motion_frames, mlength)]
    # and then a start that keeps the window inside the valid range.
    effect_max_motion_len = min(max_motion_frames, mlength)
    length = np.random.randint(min_motion_frames, effect_max_motion_len + 1)  # [low, high)
    start = np.random.randint(range1, range2 - length + 1)
else:
    # the minimal mlength is 30 when generating data; use the whole valid range
    start, length = range1, mlength
end = start + length
data["start_end"] = (start, end)
data["length"] = length

# Slice every tensor with more than one dimension to the chosen window, except "skeleton".
per_frame_keys = [
    k for k, v in data.items()
    if isinstance(v, torch.Tensor) and v.dim() > 1 and k != "skeleton"
]
for k in per_frame_keys:
    data[k] = data[k][start:end]

# Load img(as feature) : {mid -> 'features', 'bbx_xys', 'img_wh', 'start_end'}
imgfeat_dir = mid_to_imgfeat_dir[mid]
f_img_dict = torch.load(imgfeat_dir / f"{Path(mid).parts[-3]}/{Path(mid).parts[-1]}.pt")
# The feature file starts at its own offset; shift the window into its index space.
feat_offset = f_img_dict["start_end"][0]
start_mapped = start - feat_offset
end_mapped = end - feat_offset
feat_window = slice(start_mapped, end_mapped)
data["f_imgseq"] = f_img_dict["features"][feat_window].float()  # (L, 1024)
data["bbx_xys"] = f_img_dict["bbx_xys"][feat_window].float()  # (L, 3)
data["img_wh"] = f_img_dict["img_wh"]  # (2)
data["kp2d"] = torch.zeros((end - start), 17, 3)  # (L, 17, 3)  # do not provide kp2d
data

{'pose': tensor[98, 66] n=6468 (25Kb) x∈[-0.946, 1.238] μ=-4.890e-05 σ=0.250,
 'trans': tensor[98, 3] n=294 (1.1Kb) x∈[-0.121, 1.390] μ=0.441 σ=0.666,
 'beta': tensor[10] x∈[-0.719, 0.932] μ=-0.109 σ=0.485 [0.932, -0.589, -0.404, -0.719, 0.136, 0.325, -0.053, -0.170, -0.408, -0.143],
 'skeleton': tensor[22, 3] n=66 x∈[-1.365, 0.709] μ=-0.127 σ=0.390,
 'trans_incam': tensor[98, 3] n=294 (1.1Kb) x∈[-0.663, 0.073] μ=-0.338 σ=0.287,
 'global_orient_incam': tensor[98, 3] n=294 (1.1Kb) x∈[-0.948, 2.963] μ=0.642 σ=1.635,
 'cam_ext': tensor[98, 4, 4] n=1568 (6.1Kb) x∈[-0.567, 2.956] μ=0.455 σ=0.796,
 'cam_int': tensor[98, 3, 3] n=882 (3.4Kb) x∈[0., 995.556] μ=332.457 σ=412.101,
 'start_end': (28, 126),
 'length': 98,
 'f_imgseq': tensor[98, 1024] n=100352 (0.4Mb) x∈[-5.503, 5.472] μ=0.019 σ=1.017,
 'bbx_xys': tensor[98, 3] n=294 (1.1Kb) x∈[296.461, 816.233] μ=564.220 σ=191.347,
 'img_wh': (720, 1280),
 'kp2d': tensor[98, 17, 3] n=4998 (20Kb) [38;2;127;127;127mall_zeros[0m}

In [89]:
# ----- BEDLAM: assemble a training-style batch from the sliced sequence ----- #
body_pose = data["pose"][:, 3:]  # (F, 63)
betas = data["beta"].repeat(length, 1)  # (F, 10)

# SMPL params in cam
global_orient = data["global_orient_incam"]  # (F, 3)
transl = data["trans_incam"] + data["cam_ext"][:, :3, 3]  # (F, 3), bedlam convention
smpl_params_c = {"body_pose": body_pose, "betas": betas, "transl": transl, "global_orient": global_orient}

# SMPL params in world
global_orient_w = data["pose"][:, :3]  # (F, 3)
transl_w = data["trans"]  # (F, 3)
smpl_params_w = {"body_pose": body_pose, "betas": betas, "transl": transl_w, "global_orient": global_orient_w}

# World Params
# The world-to-camera transform is recovered from the paired world / camera root parameters.
offset = data["skeleton"][0]  # (3)
T_w2c = get_T_w2c_from_wcparams(
    global_orient_w=global_orient_w,
    transl_w=transl_w,
    global_orient_c=global_orient,
    transl_c=transl,
    offset=offset,
)

gravity_vec = torch.tensor([0, -1, 0], dtype=torch.float32)  # (3), BEDLAM is ay
R_c2gv = get_R_c2gv(T_w2c[:, :3, :3], gravity_vec)  # (F, 3, 3)

# `cam_int` is already per-frame (F, 3, 3), so it is used as-is. The previous
# `.repeat(motion_frames_len, 1, 1)` tiled it to (F * motion_frames_len, 3, 3) and relied on
# `motion_frames_len`, which is only defined by the AMASS cells.
K_fullimg = data["cam_int"]  # (F, 3, 3)
cam_angvel = compute_cam_angvel(T_w2c[:, :3, :3])  # (F, 6)

batch = {
    "meta": {"data_name": "bedlam", "idx": idx},
    "length": length,
    "smpl_params_c": smpl_params_c,
    "smpl_params_w": smpl_params_w,
    "R_c2gv": R_c2gv,  # (F, 3, 3)
    "gravity_vec": gravity_vec,  # (3)
    "bbx_xys": data["bbx_xys"],  # (F, 3)
    "K_fullimg": K_fullimg,  # (F, 3, 3)
    "f_imgseq": data["f_imgseq"],  # (F, D)
    "kp2d": data["kp2d"],  # (F, 17, 3)
    "cam_angvel": cam_angvel,  # (F, 6)
    "mask": {
        # NOTE(review): the mask is built for `max_motion_frames` entries while the tensors
        # above have `length` frames — presumably padding happens elsewhere; confirm.
        "valid": get_valid_mask(max_motion_frames, length),
        "vitpose": False,
        "bbx_xys": True,
        "f_imgseq": True,
        "spv_incam_only": False,
    },
}

In [90]:
# Evaluate SMPL-X on the in-camera parameters built in the previous cell.
smpl_params_c_dev = {name: param.to(device) for name, param in smpl_params_c.items()}
smplx_out = smplx(**smpl_params_c_dev)

# ----- Render Overlay ----- #
# 'whf' is presumably (width, height, focal length); 995.5555 matches the largest
# value shown for `cam_int` in the data-loading output.
verts_c = smplx_out.vertices
render_dict = {"faces": smplx.faces, "verts": verts_c, 'whf' : (1280, 720, 995.5555)}
img_overlay = simple_render_mesh(render_dict)
save_video(img_overlay, "tmp.mp4", crf=23)

Rendering:   0%|          | 0/98 [00:00<?, ?it/s]

Rendering: 100%|██████████| 98/98 [00:02<00:00, 46.36it/s]
