# Pose Extraction + Annotation

In [1]:
# Copyright (c) CIIS-Lab. All rights reserved.
import os.path as osp

import copy as cp
import tempfile

import cv2
import mmcv
import mmengine
import numpy as np
import torch

from mmaction.apis import (detection_inference,
                           # inference_recognizer, init_recognizer,
                           pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.utils import frame_extract

import moviepy.editor as mpy

In [None]:
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 1.25

THICKNESS = 2  # int
LINETYPE = 1

In [None]:
def hex2color(h):
    """Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
    return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))

PLATEBLUE = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
PLATEBLUE = PLATEBLUE.split('-')
PLATEBLUE = [hex2color(h) for h in PLATEBLUE]


def visualize(pose_config,
              frames,
              annotations,
              pose_data_samples,
              action_result,
              plate=PLATEBLUE,
              max_num=5):
    """Visualize frames with predicted annotations.

    Args:
        frames (list[np.ndarray]): Frames for visualization, note that
            len(frames) % len(annotations) should be 0.
        annotations (list[list[tuple]]): The predicted spatio-temporal
            detection results.
        pose_data_samples (list[list[PoseDataSample]): The pose results.
        action_result (str): The predicted action recognition results.
        pose_model (nn.Module): The constructed pose model.
        plate (str): The plate used for visualization. Default: PLATEBLUE.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5.

    Returns:
        list[np.ndarray]: Visualized frames.
    """

    assert max_num + 1 <= len(plate)
    frames_ = cp.deepcopy(frames)
    frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_]
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = len(frames) // len(annotations)
    anno = None
    h, w, _ = frames[0].shape
    scale_ratio = np.array([w, h, w, h])

    # add pose results
    if pose_data_samples:
        pose_config = mmengine.Config.fromfile(pose_config)
        visualizer = VISUALIZERS.build(pose_config.visualizer | {'line_width':5, 'bbox_color':(101,193,255), 'radius': 8})  # https://mmpose.readthedocs.io/en/latest/api.html#mmpose.visualization.PoseLocalVisualizer
        visualizer.set_dataset_meta(pose_data_samples[0].dataset_meta)
        for i, (d, f) in enumerate(zip(pose_data_samples, frames_)):
            visualizer.add_datasample(
                'result',
                f,
                data_sample=d,
                draw_gt=False,
                draw_heatmap=False,
                draw_bbox=True,
                draw_pred=True,
                show=False,
                wait_time=0,
                out_file=None,
                kpt_thr=0.3)
            frames_[i] = visualizer.get_image()

    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_[ind]

            # add spatio-temporal action detection results
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                if not pose_data_samples:
                    cv2.rectangle(frame, st, ed, plate[0], 2)

                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    text = ': '.join([text, f'{score[k]:.3f}'])
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    FONTCOLOR = (255, 0, 0)
                    cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                FONTCOLOR, THICKNESS, LINETYPE)

    return frames_

In [None]:
video = 'data/skeleton/raw/30d_1s1.mp4'
ann_filename = 'data/skeleton/to-anno/30d_1s1.csv'
pkl_filename = 'data/skeleton/to-anno/30d_1s1.pkl'
out_filename = 'data/skeleton/to-anno/30d_1s1_gt.mp4'

# human detection config
det_config = 'mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py'
det_checkpoint = 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'
det_score_thr = 0.9
#det_cat_id = 0

# pose estimation config
pose_config = 'mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'

# use skeleton-based method
use_skeleton_stdet = True
use_skeleton_recog = True

# skeleton-based spatio-temporal action classification config
label_map_stdet = "data/skeleton/ciis_label_map.txt"

predict_stepsize = 4  # must even int, give out a spatio-temporal detection prediction per n frames
output_stepsize = 1  # show one frame per n frames in the demo, we should have: predict_stepsize % output_stepsize == 0, speedUp/slowDown video output
output_fps = 12  # the fps of demo video output, will speedUp/slowDown video output, must equal to (video_input_fps/output_stepsize) to get normal speed

device = 'cuda:0'

In [None]:
# def load_label_map(file_path):
#     """Load Label Map.

#     Args:
#         file_path (str): The file path of label map.

#     Returns:
#         dict: The label map (int -> label name).
#     """
#     lines = open(file_path).readlines()
#     lines = [x.strip().split(': ') for x in lines]
#     return {int(x[0]): x[1] for x in lines}


def abbrev(name):
    """Get the abbreviation of label name:

    'take (an object) from (a person)' -> 'take ... from ...'
    """
    while name.find('(') != -1:
        st, ed = name.find('('), name.find(')')
        name = name[:st] + '...' + name[ed + 1:]
    return name

def pack_result(human_detection, result, img_h, img_w):
    """Short summary.

    Args:
        human_detection (np.ndarray): Human detection result.
        result (type): The predicted label of each human proposal.
        img_h (int): The image height.
        img_w (int): The image width.

    Returns:
        tuple: Tuple of human proposal, label name and label score.
    """
    human_detection[:, 0::2] /= img_w
    human_detection[:, 1::2] /= img_h
    results = []
    if result is None:
        return None
    for prop, res in zip(human_detection, result):
        res.sort(key=lambda x: -x[1])
        results.append(
            (prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
                                                            for x in res]))
    return results


def expand_bbox(bbox, h, w, ratio=1.25):
    x1, y1, x2, y2 = bbox
    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2
    width = x2 - x1
    height = y2 - y1

    square_l = max(width, height)
    new_width = new_height = square_l * ratio

    new_x1 = max(0, int(center_x - new_width / 2))
    new_x2 = min(int(center_x + new_width / 2), w)
    new_y1 = max(0, int(center_y - new_height / 2))
    new_y2 = min(int(center_y + new_height / 2), h)
    return (new_x1, new_y1, new_x2, new_y2)


def cal_iou(box1, box2):
    xmin1, ymin1, xmax1, ymax1 = box1
    xmin2, ymin2, xmax2, ymax2 = box2

    s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
    s2 = (xmax2 - xmin2) * (ymax2 - ymin2)

    xmin = max(xmin1, xmin2)
    ymin = max(ymin1, ymin2)
    xmax = min(xmax1, xmax2)
    ymax = min(ymax1, ymax2)

    w = max(0, xmax - xmin)
    h = max(0, ymax - ymin)
    intersect = w * h
    union = s1 + s2 - intersect
    iou = intersect / union

    return iou


# clip_pose_extraction
def skeleton_based_stdet(predict_stepsize, video,
                         # skeleton_config, skeleton_stdet_checkpoint, device, action_score_thr, label_map,
                         human_detections, pose_results, num_frame, clip_len, frame_interval, h, w):
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    # skeleton_config = mmengine.Config.fromfile(skeleton_config)
    # num_class = max(label_map.keys()) + 1  # for AVA dataset (81)
    # skeleton_config.model.cls_head.num_classes = num_class
    # skeleton_stdet_model = init_recognizer(skeleton_config,
    #                                        skeleton_stdet_checkpoint,
    #                                        device)

    skeleton_predictions = []
    skeleton_datasets = []

    print('Building skeleton datasets from existing keypoint data for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp in timestamps:  # iterate each clip
        proposal = human_detections[timestamp - 1] # get bboxes for persons in timestamp (first frame of clip)
        if proposal.shape[0] == 0:  # no people detected
            skeleton_predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        num_frame = len(frame_inds)  # 30

        pose_result = [pose_results[ind] for ind in frame_inds]  # grouping frames poses for each clip

        skeleton_prediction = []
        for i in range(proposal.shape[0]):  # num_person  # iterate each bbox in timestamp (first frame of clip)
            skeleton_prediction.append([])

            fake_anno = dict(
                frame_dir=osp.splitext(osp.basename(video))[0]+"_"+str(timestamp+(i+1)*0.001),
                label=-1,
                img_shape=(h, w),
                original_shape=(h, w),
                num_clips=1,
                total_frames=num_frame
            )
            num_person = 1

            num_keypoint = 17
            keypoint = np.zeros(
                (num_person, num_frame, num_keypoint, 2))  # M T V 2
            keypoint_score = np.zeros(
                (num_person, num_frame, num_keypoint))  # M T V

            # pose matching
            person_bbox = proposal[i][:4]  # get bbox for a person in timestamp (first frame of clip)
            area = expand_bbox(person_bbox, h, w)  # bbox expanded by 1.25 ratio with square shape

            for j, poses in enumerate(pose_result):  # num_frame  # iterate each frame of clip
                max_iou = float('-inf')
                index = -1
                if len(poses['keypoints']) == 0:
                    continue
                for k, bbox in enumerate(poses['bboxes']):  # iterate each bbox/pose in each frame
                    iou = cal_iou(bbox, area)  # compare each bbox in each frame with current area (calculate_intersect/union)
                    if max_iou < iou:
                        index = k  # pose from the biggest intersect/union (iou) will be considered
                        max_iou = iou
                keypoint[0, j] = poses['keypoints'][index]
                keypoint_score[0, j] = poses['keypoint_scores'][index]

            fake_anno['keypoint'] = keypoint
            fake_anno['keypoint_score'] = keypoint_score

            skeleton_datasets.append(fake_anno)
            # output = inference_recognizer(skeleton_stdet_model, fake_anno)
            # # for multi-label recognition
            # score = output.pred_score.tolist()
            # for k in range(len(score)):  # 81
            #     if k not in label_map:
            #         continue
            #     if score[k] > action_score_thr:
            #         skeleton_prediction[i].append((label_map[k], score[k]))
            skeleton_prediction[i].append(("annotate!", timestamp + (i+1)*0.001))

        skeleton_predictions.append(skeleton_prediction)
        prog_bar.update()

    return timestamps, skeleton_predictions, skeleton_datasets

In [None]:
#args = parse_args()
tmp_dir = tempfile.TemporaryDirectory()
frame_paths, original_frames = frame_extract(
    video, 720, out_dir=tmp_dir.name)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape

In [None]:
# get Human detection results
human_detections, _ = detection_inference(
    det_config,
    det_checkpoint,
    frame_paths,
    det_score_thr,
    device=device)
torch.cuda.empty_cache()

# get Pose estimation results
pose_datasample = None
pose_results, pose_datasample = pose_inference(
    pose_config,
    pose_checkpoint,
    frame_paths,
    human_detections,
    device=device)
torch.cuda.empty_cache()

In [None]:
stdet_preds = None

print('Use skeleton-based SpatioTemporal Action Detection')
# clip_len, frame_interval = 30, 1
clip_len, frame_interval = predict_stepsize, 1

# clip_pose_extraction
timestamps, stdet_preds, skeleton_datasets = skeleton_based_stdet(predict_stepsize, video,
                                                                  # skeleton_config,
                                                                  # skeleton_stdet_checkpoint,
                                                                  # device,
                                                                  # action_score_thr,
                                                                  # stdet_label_map,
                                                                  human_detections,
                                                                  pose_results, num_frame,
                                                                  clip_len,
                                                                  frame_interval, h, w)
for i in range(len(human_detections)):
    det = human_detections[i]
    # det[:, 0:4:2] *= w_ratio
    # det[:, 1:4:2] *= h_ratio
    det[:, 0:4:2] *= 1
    det[:, 1:4:2] *= 1
    human_detections[i] = torch.from_numpy(det[:, :4]).to(device)

## Annotation

In [None]:
anno = ""
for clip in stdet_preds:
    if clip == None:
        continue
    for person_attr in clip:
        anno += str(person_attr[0][0]) + "," + str(person_attr[0][1]) + "\n"

with open(ann_filename,'w') as data:
    data.write(anno)

mmengine.dump(skeleton_datasets, pkl_filename)

In [None]:
stdet_results = []
for timestamp, prediction in zip(timestamps, stdet_preds):
    human_detection = human_detections[timestamp - 1]
    stdet_results.append(
        pack_result(human_detection, prediction, h, w))

def dense_timestamps(timestamps, n):
    """Make it nx frames."""
    old_frame_interval = (timestamps[1] - timestamps[0])
    start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
    new_frame_inds = np.arange(
        len(timestamps) * n) * old_frame_interval / n + start
    return new_frame_inds.astype(np.int64)

dense_n = int(predict_stepsize / output_stepsize)
output_timestamps = dense_timestamps(timestamps, dense_n) + 1
frames = [
    cv2.imread(frame_paths[timestamp - 1])
    for timestamp in output_timestamps
]

pose_datasample = [
    pose_datasample[timestamp - 1] for timestamp in output_timestamps
]

In [None]:
vis_frames = visualize(pose_config, frames, stdet_results, pose_datasample,
                       None)
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile(out_filename)
tmp_dir.cleanup()

In [None]:
# frame_extract(
#     out_filename, out_dir="../extracted/")

# Add Label to Datasets

In [None]:
anned_filename = 'data/skeleton/to-anno/30d_1s1_annotated.csv'
pkl_final = 'data/skeleton/to-combine/30d_1s1.pkl'

In [None]:
import csv

def load_label_map(file_path):
    """Load Label Map.

    Args:
        file_path (str): The file path of label map.

    Returns:
        dict: The label map (label name -> int).
    """
    lines = open(file_path).readlines()
    lines = [x.strip().split(': ') for x in lines]
    return {x[1]: int(x[0]) for x in lines}

In [None]:
stdet_label_map = load_label_map(label_map_stdet)

stdet_label_map

In [None]:
custom_annos = []

with open(anned_filename, newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        if row[0] == 'none':
            continue
        label = stdet_label_map[row[0]]
        id = float(row[1])
        custom_annos.append([id, label])

In [None]:
skeleton_datasets = mmengine.load(pkl_filename)

custom_dataset = []

In [None]:
for index, ann in enumerate(custom_annos):

    fake_anno = dict(
        frame_dir=osp.splitext(osp.basename(pkl_filename))[0]+"_"+str(ann[0]))

    for j, data in enumerate(skeleton_datasets):
        if fake_anno['frame_dir'] == data['frame_dir']:
            fake_anno['frame_dir'] += '_' + str(index)
            fake_anno['label'] = ann[1]
            fake_anno['img_shape'] = data['img_shape']
            fake_anno['original_shape'] = data['original_shape']
            fake_anno['num_clips'] = 1
            fake_anno['total_frames'] = data['total_frames']
            fake_anno['clip_len'] = data['total_frames']
            fake_anno['keypoint'] = data['keypoint']
            fake_anno['keypoint_score'] = data['keypoint_score']

    custom_dataset.append(fake_anno)

In [None]:
mmengine.dump(custom_dataset, pkl_final)

# Visualize

In [None]:
import csv

def visualize(pose_config,
              frames,
              annotations,
              pose_data_samples,
              action_result,
              plate=PLATEBLUE,
              max_num=5):
    """Visualize frames with predicted annotations.

    Args:
        frames (list[np.ndarray]): Frames for visualization, note that
            len(frames) % len(annotations) should be 0.
        annotations (list[list[tuple]]): The predicted spatio-temporal
            detection results.
        pose_data_samples (list[list[PoseDataSample]): The pose results.
        action_result (str): The predicted action recognition results.
        pose_model (nn.Module): The constructed pose model.
        plate (str): The plate used for visualization. Default: PLATEBLUE.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5.

    Returns:
        list[np.ndarray]: Visualized frames.
    """

    assert max_num + 1 <= len(plate)
    frames_ = cp.deepcopy(frames)
    frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_]
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = len(frames) // len(annotations)
    anno = None
    h, w, _ = frames[0].shape
    scale_ratio = np.array([w, h, w, h])

    # add pose results
    if pose_data_samples:
        pose_config = mmengine.Config.fromfile(pose_config)
        visualizer = VISUALIZERS.build(pose_config.visualizer | {'line_width':5, 'bbox_color':(101,193,255), 'radius': 8})  # https://mmpose.readthedocs.io/en/latest/api.html#mmpose.visualization.PoseLocalVisualizer
        visualizer.set_dataset_meta(pose_data_samples[0].dataset_meta)
        for i, (d, f) in enumerate(zip(pose_data_samples, frames_)):
            visualizer.add_datasample(
                'result',
                f,
                data_sample=d,
                draw_gt=False,
                draw_heatmap=False,
                draw_bbox=True,
                draw_pred=True,
                show=False,
                wait_time=0,
                out_file=None,
                kpt_thr=0.3)
            frames_[i] = visualizer.get_image()

    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_[ind]

            # add spatio-temporal action detection results
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                if not pose_data_samples:
                    cv2.rectangle(frame, st, ed, plate[0], 2)

                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    # text = ': '.join([text, f'{(score[k]*100):.1f}%'])  # to add score
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    bahaya = ['melempar', 'membidik senapan', 'membidik pistol', 'memukul', 'menendang', 'menusuk']
                    FONTCOLOR = (255, 0, 0) if lb in bahaya else (255, 255, 255)
                    cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                FONTCOLOR, THICKNESS, LINETYPE)

    return frames_

In [None]:
stdet_preds_gt = list([])

# timestamps == ambil dari atas
# stdet_preds == ambil dari atas

for timestamp, stdet_pred in zip(timestamps, stdet_preds):
    timestmp_anno = list([])
    if stdet_pred != None:
        for i, object in enumerate(stdet_pred):
            object_anno = list([])
            with open(anned_filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',')
                for row in spamreader:
                    if row[0] == 'none':
                        continue
                    elif float(row[1]) == (float(timestamp) + (i+1)*0.001):
                        object_anno.append(tuple([row[0], np.random.uniform(0.4, 1.0)]))
            timestmp_anno.append(object_anno)
    stdet_preds_gt.append(timestmp_anno)

# stdet_preds = list[list[tuple]]
# [
#     [[(), ()],[()],[]],
#     [[],[]]
# ]

In [None]:
stdet_results = []

# human_detections == ambil dari atas
# new_h, new_w == ambil dari atas
# predict_stepsize == ambil dari atas
# output_stepsize == ambil dari atas
# frame_paths == ambil dari atas
# pose_datasample == ambil dari atas

for timestamp, prediction in zip(timestamps, stdet_preds_gt):
    human_detection = human_detections[timestamp - 1]
    stdet_results.append(
        pack_result(human_detection, prediction, h, w))

def dense_timestamps(timestamps, n):
    """Make it nx frames."""
    old_frame_interval = (timestamps[1] - timestamps[0])
    start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
    new_frame_inds = np.arange(
        len(timestamps) * n) * old_frame_interval / n + start
    return new_frame_inds.astype(np.int64)

dense_n = int(predict_stepsize / output_stepsize)
# output_timestamps = dense_timestamps(timestamps, dense_n)
output_timestamps = dense_timestamps(timestamps, dense_n) + 1
frames = [
    cv2.imread(frame_paths[timestamp - 1])
    # cv2.imread("../854x480-white-solid-color-background.jpg")
    for timestamp in output_timestamps
]

pose_datasample = [
    pose_datasample[timestamp - 1] for timestamp in output_timestamps
]

In [None]:
# pose_config == ambil dari atas
# frames == ambil dari atas

# pose_datasample == ambil dari atas

vis_frames = visualize(pose_config, frames, stdet_results, pose_datasample,
                       None)
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile(out_filename)
tmp_dir.cleanup()

# Combine PKL

In [2]:
import os

In [3]:
pickles_path = 'data/skeleton/to-combine'
split_ratio = 0.5
combined_pkl = 'data/skeleton/ciis_' + (str(split_ratio)).replace(".", "s") + '_v1.pkl'

In [4]:
custom_datasets = dict(split=dict(xsub_train=[],
                                  xsub_val=[],
                                  xview_train=[],
                                  xview_val=[]),
                       annotations=[])

In [5]:
for file in os.listdir(pickles_path):
    print(file)
    if not file.endswith('.pkl'):
        continue
    custom_dataset = mmengine.load(os.path.join(pickles_path, file))
    for i, data in enumerate(custom_dataset):
        custom_datasets['annotations'].append(data)
        if (i % 10) < (split_ratio * 10):
            custom_datasets['split']['xsub_train'].append(data['frame_dir'])
            custom_datasets['split']['xview_train'].append(data['frame_dir'])
        else:
            custom_datasets['split']['xsub_val'].append(data['frame_dir'])
            custom_datasets['split']['xview_val'].append(data['frame_dir'])

70d_1s2.pkl
50d_1s1.pkl
70d_1s3.pkl
50d_1s3.pkl
30d_1s3.pkl
50d_1s2.pkl
30d_1s2.pkl
30d_1s1.pkl
70d_1s1.pkl


In [6]:
mmengine.dump(custom_datasets, combined_pkl)