In [1]:
%load_ext autoreload
%autoreload 2

import os
import matplotlib.pyplot as plt

# Change working directory to project root
os.chdir('../../')

import torch
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
import imageio
import decord

In [None]:
from hmr4d.network.hpe.hybrik import HRNetSMPLCam, cfg_model, MEAN, STD

CKPT = "inputs/checkpoints/hybrik/hybrik_hrnet48_w3dpw.pth"

# Read the checkpoint onto the CPU first, then build the network and copy the
# weights in. strict=False means keys that do not match are skipped silently.
save_dict = torch.load(CKPT, map_location='cpu')
hybrik_model = HRNetSMPLCam(**cfg_model)
hybrik_model.load_state_dict(save_dict, strict=False)

# Switch to inference mode and move the weights to the GPU.
hybrik_model = hybrik_model.eval().cuda()

### 1 Image

In [None]:
img_path = "inputs/RICH/sahmr_support/test_split/image/Gym_010_cooking1_4_00356_10.png"

# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded, so fail here with a readable message.
img_bgr = cv2.imread(img_path)
if img_bgr is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)  # [H, W, C]
img = cv2.resize(img, (256, 256))

# uint8 HWC image -> float tensor in [0, 1] with a leading batch dimension.
img_ts = torch.from_numpy(img).float().unsqueeze(0).permute(0, 3, 1, 2) / 255.0  # [1, C, H, W]
# Normalise with the MEAN / STD exported by the hybrik module.
img_input = (img_ts - MEAN) / STD

In [None]:
from hmr4d.utils.vis.vis_kpts import draw_kpts_cv2

# The model was moved to the GPU after loading, while img_input was built on
# the CPU; run the forward pass on whatever device the weights live on.
model_device = next(hybrik_model.parameters()).device

with torch.no_grad():
    pred_kpts = hybrik_model(img_input.to(model_device))
    # Scale the first two keypoint channels by the 256-pixel input size.
    pred_joints24_xy = pred_kpts[:, :, :2] * 256

img_ = draw_kpts_cv2(img, pred_joints24_xy[0].cpu().numpy())

# Small figure without axes.
fig = plt.figure(figsize=(3, 3), dpi=70)
plt.axis('off')
plt.imshow(img_);

### Output all estimations

In [None]:
import joblib

# NOTE: joblib.load unpickles the file; only load label files from a trusted source.
rich_test_vit_path = "inputs/RICH/eval_support/rich_test_vit.pth"
labels = joblib.load(rich_test_vit_path)
rich_video_dir = Path("inputs/RICH/hmr4d_support/video")

# Loop-invariant setup, hoisted out of the per-sequence loop.
decord.bridge.set_bridge("torch")  # decord returns torch tensors
dst_size = 256  # side length of the square crop fed to the network
batch_size = 64  # frames per forward pass
# Destination triangle: top-left corner, top-right corner and centre of the crop.
bbox_dst = np.array([[0, 0], [dst_size - 1, 0], [dst_size / 2 - 0.5, dst_size / 2 - 0.5]], dtype=np.float32)

vid_to_pred_kpts = {}
for index in tqdm(range(len(labels["vid"]))):
    # The first entry of each per-sequence array is dropped ([1:]).
    bbox = labels["bbox"][index][1:]  # (F, 3) # 3: (x, y, s)
    frame_id = labels["frame_id"][index][1:]  # (F, )
    vid = labels["vid"][index]

    vr = decord.VideoReader(str(rich_video_dir / vid / "video.mp4"))
    frames = vr.get_batch(list(frame_id.numpy()))  # (F, 752, 1024, 3)

    # Crop images according to bbox: source triangle is the top-left corner,
    # top-right corner and centre of each square box.
    half_size = bbox[:, 2] / 2
    bbox_src = torch.stack(
        [
            torch.stack([bbox[:, 0] - half_size, bbox[:, 1] - half_size], dim=-1),
            torch.stack([bbox[:, 0] + half_size, bbox[:, 1] - half_size], dim=-1),
            bbox[:, :2],
        ],
        dim=1,
    ).numpy()  # (F, 3, 2)
    # NOTE(review): the `/ 4` presumably maps full-resolution bbox coordinates
    # onto the downscaled video frames - confirm against the label generation.
    As = [cv2.getAffineTransform(src / 4, bbox_dst) for src in bbox_src]
    img_crops = [
        cv2.warpAffine(frames[i].numpy(), As[i], (dst_size, dst_size), flags=cv2.INTER_LINEAR) for i in range(len(As))
    ]

    # uint8 HWC crops -> normalised float CHW tensor, kept on the CPU.
    img_inputs = torch.stack([torch.from_numpy(crop).permute(2, 0, 1) for crop in img_crops], dim=0).float() / 255.0
    img_inputs = (img_inputs - MEAN) / STD

    # Only the current mini-batch is moved to the GPU, so a long sequence does
    # not have to fit in GPU memory all at once.
    pred_kpts = []
    with torch.no_grad():
        for i in tqdm(range(0, len(img_inputs), batch_size), leave=False):
            pred_kpts.append(hybrik_model(img_inputs[i:i + batch_size].cuda()))
    pred_kpts = torch.cat(pred_kpts, dim=0)

    # .cpu() already produces a separate host copy; no extra clone is needed.
    vid_to_pred_kpts[vid] = pred_kpts.cpu().numpy()

In [None]:
# Persist the {vid: predicted keypoints (numpy array)} dict built by the loop above.
torch.save(vid_to_pred_kpts, "inputs/RICH/eval_support/hybrik_pred_kpts.pth")

In [None]:
# Release cached GPU memory blocks that are no longer referenced by any tensor.
torch.cuda.empty_cache()

In [None]:
from hmr4d.utils.vis.vis_kpts import draw_kpts_cv2

# img_crops and pred_kpts still hold the LAST sequence processed by the
# prediction loop. Derive the pixel coordinates from those predictions here;
# the pred_joints24_xy left over from the single-image cell has only one
# entry and must not be indexed per frame.
pred_joints24_xy = pred_kpts[:, :, :2] * 256
joints_xy_np = pred_joints24_xy.cpu().numpy()

# Draw on copies so the raw crops stay untouched.
imgs_keypoints_overlay = [
    draw_kpts_cv2(img_crops[i].copy(), joints_xy_np[i]) for i in range(len(img_crops))
]

# imageio.mimsave("imgs_keypoints_overlay.mp4", imgs_keypoints_overlay, fps=30, quality=6)

### Check WHAM BBX behavior



In [None]:
import joblib

# NOTE: joblib.load unpickles the file; only load label files from a trusted source.
rich_test_vit_path = "inputs/RICH/eval_support/rich_test_vit.pth"
labels = joblib.load(rich_test_vit_path)
rich_video_dir = Path("inputs/RICH/hmr4d_support/video")

# Loop-invariant setup, hoisted out of the per-sequence loop.
decord.bridge.set_bridge("torch")  # decord returns torch tensors
dst_size = 256  # side length of the square crop
# Destination triangle: top-left corner, top-right corner and centre of the crop.
bbox_dst = np.array([[0, 0], [dst_size - 1, 0], [dst_size / 2 - 0.5, dst_size / 2 - 0.5]], dtype=np.float32)

# imageio does not create missing directories, so make sure the target exists.
out_dir = Path("tmp_wham_bbx_videos")
out_dir.mkdir(parents=True, exist_ok=True)

# This cell only writes crop videos. It deliberately does not reset
# vid_to_pred_kpts, so the predictions computed earlier stay available.
for index in tqdm(range(len(labels["vid"]))):
    # The first entry of each per-sequence array is dropped ([1:]).
    bbox = labels["bbox"][index][1:]  # (F, 3) # 3: (x, y, s)
    frame_id = labels["frame_id"][index][1:]  # (F, )
    vid = labels["vid"][index]

    vr = decord.VideoReader(str(rich_video_dir / vid / "video.mp4"))
    frames = vr.get_batch(list(frame_id.numpy()))  # (F, 752, 1024, 3)

    # Crop images according to bbox: source triangle is the top-left corner,
    # top-right corner and centre of each square box.
    half_size = bbox[:, 2] / 2
    bbox_src = torch.stack(
        [
            torch.stack([bbox[:, 0] - half_size, bbox[:, 1] - half_size], dim=-1),
            torch.stack([bbox[:, 0] + half_size, bbox[:, 1] - half_size], dim=-1),
            bbox[:, :2],
        ],
        dim=1,
    ).numpy()  # (F, 3, 2)
    # NOTE(review): the `/ 4` presumably maps full-resolution bbox coordinates
    # onto the downscaled video frames - confirm against the label generation.
    As = [cv2.getAffineTransform(src / 4, bbox_dst) for src in bbox_src]
    img_crops = [
        cv2.warpAffine(frames[i].numpy(), As[i], (dst_size, dst_size), flags=cv2.INTER_LINEAR) for i in range(len(As))
    ]

    # One video per sequence; '/' in the sequence id is flattened to '_'.
    out_fn = out_dir / f"{vid.replace('/', '_')}.mp4"
    imageio.mimsave(out_fn, img_crops, fps=30, quality=6)


### Check Kpt Results

In [8]:
import joblib

# Predicted keypoints, stored as a dict keyed by video id.
rich_dir = Path("inputs/RICH")
kpts = torch.load(rich_dir / "eval_support/hybrik_pred_kpts.pth")

# Bbx information
rich_test_vit_path = "inputs/RICH/eval_support/rich_test_vit.pth"
rich_video_dir = Path("inputs/RICH/hmr4d_support/video")
labels = joblib.load(rich_test_vit_path)

# Position in the label file -> video id.
index_to_vid = dict(enumerate(labels["vid"]))

In [11]:
def get_bbx_images(labels, index, rich_video_dir, dst_size=256):
    """Return the bbox crops of one labelled sequence.

    Args:
        labels: mapping with "bbox", "frame_id" and "vid" entries, each
            indexable by sequence position.
        index: position of the sequence inside ``labels``.
        rich_video_dir: directory containing ``<vid>/video.mp4``.
        dst_size: side length in pixels of each square output crop.

    Returns:
        A list with one ``dst_size`` x ``dst_size`` crop per frame.
    """
    # The first entry of each per-sequence array is dropped ([1:]).
    seq_bbox = labels["bbox"][index][1:]  # (F, 3) # 3: (x, y, s)
    seq_frame_ids = labels["frame_id"][index][1:]  # (F, )
    seq_vid = labels["vid"][index]

    decord.bridge.set_bridge("torch")
    reader = decord.VideoReader(str(rich_video_dir / seq_vid / "video.mp4"))
    frames = reader.get_batch(list(seq_frame_ids.numpy()))  # (F, 752, 1024, 3)

    # Source triangle per frame: top-left corner, top-right corner, centre.
    top_left = torch.stack(
        [seq_bbox[:, 0] - seq_bbox[:, 2] / 2, seq_bbox[:, 1] - seq_bbox[:, 2] / 2], dim=-1
    )
    top_right = torch.stack(
        [seq_bbox[:, 0] + seq_bbox[:, 2] / 2, seq_bbox[:, 1] - seq_bbox[:, 2] / 2], dim=-1
    )
    centre = seq_bbox[:, :2]
    src_triangles = torch.stack([top_left, top_right, centre], dim=1).numpy()  # (F, 3, 2)

    # Matching destination triangle inside the output crop.
    dst_triangle = np.array(
        [[0, 0], [dst_size - 1, 0], [dst_size / 2 - 0.5, dst_size / 2 - 0.5]], dtype=np.float32
    )

    # NOTE(review): the `/ 4` presumably maps full-resolution bbox coordinates
    # onto the downscaled video frames - confirm against the label generation.
    affines = [cv2.getAffineTransform(tri / 4, dst_triangle) for tri in src_triangles]

    img_crops = []
    for i, affine in enumerate(affines):
        crop = cv2.warpAffine(frames[i].numpy(), affine, (dst_size, dst_size), flags=cv2.INTER_LINEAR)
        img_crops.append(crop)
    return img_crops

In [14]:
# Sequence to inspect and the crop size used for visualisation.
dst_size, index = 256, 20
vid = index_to_vid[index]

# First 22 joints, first two channels, scaled by the crop size.
pred_kpts = kpts[vid][:, :22, :2] * dst_size  # [F, 22, 2]
print(pred_kpts.shape)

# Matching image crops for the same sequence.
img_crops = get_bbx_images(labels, index, rich_video_dir, dst_size=dst_size)

(300, 22, 2)


In [16]:
from hmr4d.utils.vis.vis_kpts import draw_kpts_cv2

# Draw each frame's keypoints on a copy of its crop so the raw crops stay untouched.
imgs_keypoints_overlay = [
    draw_kpts_cv2(crop.copy(), pred_kpts[i]) for i, crop in enumerate(img_crops)
]


In [18]:
# Write the overlay frames as a 30 fps video into the current working directory.
imageio.mimsave("imgs_keypoints_overlay.mp4", imgs_keypoints_overlay, fps=30, quality=6)

In [19]:
# Bundle the crops and their keypoints for later visualisation.
# np.save is given a dict here rather than an array.
# NOTE(review): reading it back presumably needs
# np.load(..., allow_pickle=True).item() - confirm.
vis_out = dict(img_crops=img_crops, pred_kpts=pred_kpts)
np.save("vis_out.npy", vis_out)