In [1]:
from data import *
from pretrained_models import *
from torch.utils.data import DataLoader
import os
from mediapipe.tasks.python.vision.hand_landmarker import HandLandmarkerResult
from tokenizer import *
# from train import *

### Sanity Check

In [10]:
# MediaPipeCFG = MediaPipeCfg("pretrained_model/hand_landmarker.task")
# options = MediaPipeCFG.create_options()
# MP_model = MediaPipeCFG.HandLandmarker.create_from_options(options)

In [11]:
# MMPoseCFG = MMPoseCfg(checkpoint_path='pretrained_model/checkpoint/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth',
#                       config_path='pretrained_model/mmpose_config/rtmpose_m_8xb256-420e_coco-256x192.py')
# body_model = MMPoseCFG.create_model()

In [12]:
# pre_train_dataset = ASLData(
#     video_dir="data/raw_videos",
#     MP_model=MP_model,
#     body_cfg=MMPoseCFG,
#     body_model=body_model,
#     labels_path="data/how2sign_realigned_train.csv",
#     min_frequency=1,
#     max_frames=300,
#     frame_subsample=2,
# )
#
# pre_train_loader = DataLoader(
#     pre_train_dataset,
#     batch_size=8,
#     shuffle=True,
#     num_workers=0,
#     collate_fn=lambda b: asl_collate_func(b, pad_id=pre_train_dataset.pad_id),
# )

In [13]:
# OneSample = next(iter(pre_train_loader))

### Write precomputed feature files (.pt) to local dir for training

In [2]:
# Build the hand-landmark model from the local MediaPipe .task file.
# `MediaPipeCfg` is not defined in this notebook (presumably it comes from the
# star-import of `pretrained_models` — confirm).
MediaPipeCFG = MediaPipeCfg("pretrained_model/hand_landmarker.task")
options = MediaPipeCFG.create_options()
# `MP_model` is later handed to ASLData via its `MP_model` argument.
MP_model = MediaPipeCFG.HandLandmarker.create_from_options(options)

In [3]:
# Build the body-pose model from an MMPose checkpoint + config pair.
# FIXME(review): the checkpoint filename says "rtmpose-s" while the config
# filename says "rtmpose_m", and the load log printed by this cell reports
# size mismatches (e.g. backbone.stem.0.conv.weight: 16 channels in the
# checkpoint vs 24 in the model). The mismatched parameters are presumably not
# loaded from the checkpoint — confirm, and pair the checkpoint with the config
# of the same model size before trusting the body-pose features.
MMPoseCFG = MMPoseCfg(checkpoint_path='pretrained_model/checkpoint/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth',
                      config_path='pretrained_model/mmpose_config/rtmpose_m_8xb256-420e_coco-256x192.py')
# `body_model` and `MMPoseCFG` are both passed to ASLData below.
body_model = MMPoseCFG.create_model()

Loads checkpoint by local backend from path: pretrained_model/checkpoint/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth
The model and loaded state dict do not match exactly

size mismatch for backbone.stem.0.conv.weight: copying a param with shape torch.Size([16, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([24, 3, 3, 3]).
size mismatch for backbone.stem.0.bn.weight: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([24]).
size mismatch for backbone.stem.0.bn.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([24]).
size mismatch for backbone.stem.0.bn.running_mean: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([24]).
size mismatch for backbone.stem.0.bn.running_var: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([24]).
size mis

In [4]:
# Dataset over the raw videos in data/raw_videos with labels read from
# data/how2sign_realigned_train.csv; it receives the hand model (MP_model) and
# the body-pose config/model built in the cells above.
# NOTE(review): the exact meaning of min_frequency / max_frames /
# frame_subsample is defined by ASLData, which is not visible here.
pre_train_dataset = ASLData(
    video_dir="data/raw_videos",
    MP_model=MP_model,
    body_cfg=MMPoseCFG,
    body_model=body_model,
    labels_path="data/how2sign_realigned_train.csv",
    min_frequency=1,
    max_frames=300,
    frame_subsample=2,
)

# Shuffled loader yielding batches of 8 samples, run in the main process
# (num_workers=0). Batches are assembled by asl_collate_func, which is given
# the dataset's pad_id.
pre_train_loader = DataLoader(
    pre_train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,
    collate_fn=lambda b: asl_collate_func(b, pad_id=pre_train_dataset.pad_id),
)

In [5]:
# Directory that receives the precomputed sample files and vocab metadata
# written by the export cell below.
save_dir = "precomputed_train"
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(save_dir, exist_ok=True)

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Subset

# Export every dataset sample to `save_dir` as sample_<index>.pt.
#
# The export is resumable: samples whose output file already exists are skipped,
# so an interrupted run can simply be re-executed. Set OVERWRITE_EXISTING to
# True to recompute everything (e.g. after changing the dataset arguments,
# since existing files are otherwise kept as they are).
OVERWRITE_EXISTING = False

# Built with the same arguments as the dataset in the earlier loader cell.
pre_train_dataset = ASLData(
    video_dir="data/raw_videos",
    MP_model=MP_model,
    body_cfg=MMPoseCFG,
    body_model=body_model,
    labels_path="data/how2sign_realigned_train.csv",
    min_frequency=1,
    max_frames=300,
    frame_subsample=2,
)


def _sample_path(idx):
    """Return the output path for the sample with dataset index `idx`."""
    return os.path.join(save_dir, f"sample_{idx:05d}.pt")


def _save_atomically(obj, path):
    """Save `obj` with torch.save via a temporary file, then move it into place.

    An interrupted write therefore never leaves a truncated file at `path`,
    which matters because existing files are treated as finished on re-run.
    """
    tmp_path = path + ".tmp"
    torch.save(obj, tmp_path)
    os.replace(tmp_path, path)


# Dataset indices that still need to be computed and written.
pending_indices = [
    idx
    for idx in range(len(pre_train_dataset))
    if OVERWRITE_EXISTING or not os.path.exists(_sample_path(idx))
]

# One sample at a time, in dataset order; the collate function unwraps the
# single-element batch so the loop receives the raw __getitem__ dict.
# Wrapping the dataset in Subset keeps skipped samples from being computed.
pre_train_loader = DataLoader(
    Subset(pre_train_dataset, pending_indices),
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=lambda b: b[0],
)

# Save vocab meta info once
_save_atomically(
    {
        "vocab": pre_train_dataset.vocab,
        "pad_id": pre_train_dataset.pad_id,
    },
    os.path.join(save_dir, "vocab_meta.pt"),
)

# pending_indices and the loader are in the same order, so zip pairs each
# sample with its original dataset index (used for the file name).
for idx, sample in zip(pending_indices, pre_train_loader):
    # sample is the dict from __getitem__; the keys read below are:
    # {
    #   "features": tensor (original author's note: [T', D] — unverified here),
    #   "feature_len": int-convertible,
    #   "label_ids": tensor (original author's note: [L] — unverified here),
    #   "label_len": int-convertible,
    #   "filename": str,
    #   "raw_label": str,
    # }

    # we move tensors to CPU just to be safe
    sample_to_save = {
        "features": sample["features"].cpu(),
        "feature_len": int(sample["feature_len"]),
        "label_ids": sample["label_ids"].cpu(),
        "label_len": int(sample["label_len"]),
        "filename": sample["filename"],
        "raw_label": sample["raw_label"],
    }

    _save_atomically(sample_to_save, _sample_path(idx))

    if (idx + 1) % 50 == 0:
        print(f"PROGRESS === Saved {idx+1} samples...")


PROGRESS === Saved 50 samples...
PROGRESS === Saved 100 samples...


### Testing model

In [3]:
from glob import glob

class PrecomputedHow2Sign(Dataset):
    """Dataset that reads samples previously written to disk with ``torch.save``.

    ``feature_dir`` must contain one ``sample_*.pt`` file per sample and a
    ``vocab_meta.pt`` file holding a dict with the keys ``"vocab"`` and
    ``"pad_id"``; both values are exposed as attributes of the same name.

    Raises:
        RuntimeError: if no ``sample_*.pt`` file is found in ``feature_dir``.
    """

    def __init__(self, feature_dir: str):
        sample_pattern = os.path.join(feature_dir, "sample_*.pt")
        # Sorted so that an index always refers to the same file.
        self.feature_paths = glob(sample_pattern)
        self.feature_paths.sort()
        if len(self.feature_paths) == 0:
            raise RuntimeError(f"No precomputed samples found in {feature_dir}")

        meta_path = os.path.join(feature_dir, "vocab_meta.pt")
        vocab_meta = torch.load(meta_path)
        self.vocab = vocab_meta["vocab"]
        self.pad_id = vocab_meta["pad_id"]

    def __len__(self):
        """Number of sample files found at construction time."""
        return len(self.feature_paths)

    def __getitem__(self, idx):
        """Load and return the stored object of the ``idx``-th sample file."""
        return torch.load(self.feature_paths[idx])

# Load the precomputed samples from disk.
# NOTE(review): the export cell writes to save_dir = "precomputed_train", but
# this cell reads from "data/pre_train_data" — presumably the files were moved
# after export; confirm the directory.
pre_ds = PrecomputedHow2Sign("data/pre_train_data")

# Vocabulary and padding id as stored in vocab_meta.pt.
vocab = pre_ds.vocab
pad_id = pre_ds.pad_id
# Inverse of `vocab` (presumably token -> id, giving id -> token here — confirm).
id_to_token = {idx: tok for tok, idx in vocab.items()}
vocab_size = len(vocab)

# Shuffled batches of 8, assembled by asl_collate_func with the stored pad_id.
loader = DataLoader(
    pre_ds,
    batch_size=8,
    shuffle=True,
    num_workers=0,
    collate_fn=lambda b: asl_collate_func(b, pad_id=pad_id),
)

# Pull one batch to read the size of the last axis of "features".
first_batch = next(iter(loader))
feature_dim = first_batch["features"].shape[-1]
# NOTE(review): the commented-out stub below uses `pose_dim` and `device`,
# neither of which is defined in the visible cells; `feature_dim` above is
# presumably the value intended for `pose_dim` — confirm before enabling.
#
# model = PoseToTextModel(
#     pose_dim=pose_dim,
#     enc_hidden=256,
#     vocab_size=vocab_size,
#     emb_dim=256,
#     pad_id=pad_id,
# ).to(device)