In [None]:
from IPython.display import HTML

## Testing recognition from bash - DEMO

In [None]:
# Run the stock skeleton demo script (human detection -> pose estimation ->
# PoseC3D recognition) on data/uji_jalan.mp4 with the NTU60 label map.
# Every checkpoint is downloaded over HTTPS.
!python mmaction2/demo/demo_skeleton.py data/uji_jalan.mp4 data/uji_jalan_out_demo.mp4 \
    --config mmaction2/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \
    --checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu60_xsub_keypoint/slowonly_r50_u48_240e_ntu60_xsub_keypoint-f3adabf1.pth \
    --det-config mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
    --det-checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
    --det-score-thr 0.9 \
    --det-cat-id 0 \
    --pose-config mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \
    --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
    --label-map mmaction2/tools/data/skeleton/label_map_ntu60.txt

In [None]:
# Embed the demo's input clip as an inline, looping HTML5 video player.
HTML('<video width=50% controls autoplay loop><source src="data/uji_jalan.mp4"></video>')

In [None]:
# Embed the annotated clip written by the demo script above.
HTML('<video width=50% controls autoplay loop><source src="data/uji_jalan_out_demo.mp4"></video>')

## Testing recognition from python - DEMO

In [None]:
import tempfile

import cv2
import mmcv
import mmengine
import torch
from mmengine.utils import track_iter_progress

from mmaction.apis import (detection_inference, inference_skeleton,
                           init_recognizer, pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.utils import frame_extract

import moviepy.editor as mpy

In [None]:
import moviepy.config as cf
# Report which ffmpeg binary moviepy is configured to use; it should be the imageio_ffmpeg one.
print(cf.get_setting("FFMPEG_BINARY"))

In [None]:
# Text-overlay settings passed to cv2.putText by visualize() below.
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 1.25
FONTCOLOR = (255, 255, 255)  # white; same tuple in BGR and RGB (frames are converted to RGB before drawing)
THICKNESS = 2
LINETYPE = 1

def visualize(pose_config, out_filename, frames, data_samples, action_label, fps=12):
    """Draw pose estimates and a text label on every frame and save a video.

    Args:
        pose_config (str): Path of the pose-estimator config; its
            ``visualizer`` section is used to build the visualizer.
        out_filename (str): Path of the video file to write.
        frames (list): Images in BGR channel order; each one is converted to
            RGB before drawing.
        data_samples (list): Pose data samples, paired with ``frames`` by
            position. The first sample supplies the dataset meta.
        action_label (str): Text written near the top-left corner of every
            frame.
        fps (int): Frame rate of the written clip. Defaults to 12.
    """
    pose_config = mmengine.Config.fromfile(pose_config)
    # Override a few drawing parameters of the configured visualizer, see
    # https://mmpose.readthedocs.io/en/latest/api.html#mmpose.visualization.PoseLocalVisualizer
    visualizer = VISUALIZERS.build(pose_config.visualizer | {'line_width':5, 'bbox_color':(101,193,255), 'radius': 8})
    visualizer.set_dataset_meta(data_samples[0].dataset_meta)

    vis_frames = []
    print('Drawing skeleton for each frame')
    for d, f in track_iter_progress(list(zip(data_samples, frames))):
        f = mmcv.imconvert(f, 'bgr', 'rgb')
        visualizer.add_datasample(
            'result',
            f,
            data_sample=d,
            draw_gt=False,
            draw_heatmap=True,
            draw_bbox=True,
            show=False,
            wait_time=0,
            out_file=None,
            kpt_thr=0.3)
        vis_frame = visualizer.get_image()
        cv2.putText(vis_frame, action_label, (10, 30), FONTFACE, FONTSCALE,
                    FONTCOLOR, THICKNESS, LINETYPE)
        vis_frames.append(vis_frame)

    vid = mpy.ImageSequenceClip(vis_frames, fps=fps)
    vid.write_videofile(out_filename, remove_temp=True)

In [None]:
# Input clip and the annotated video to write.
video = '../cut/DJI_0013_12r_10s_2.mp4'
out_filename = 'data/DJI_0013_12r_10s_2_2019.mp4'

# Choose to use an action classification config
config = 'mmaction2/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py'
# Setup a checkpoint file to load
checkpoint = 'https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu60_xsub_keypoint/slowonly_r50_u48_240e_ntu60_xsub_keypoint-f3adabf1.pth'

# human detection config (checkpoint fetched over HTTPS like the others)
det_config = 'mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py'
det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'
det_score_thr = 0.9
det_cat_id = 0

# pose estimation config
pose_config = 'mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'
label_map = 'mmaction2/tools/data/skeleton/label_map_ntu60.txt'

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Short-side value handed to frame_extract.
short_side = 720

In [None]:
# Extract the video's frames into a temporary directory. `frame_paths` is
# passed to the detector and pose cells, `frames` to visualize().
tmp_dir = tempfile.TemporaryDirectory()
frame_paths, frames = frame_extract(video, short_side,
                                    tmp_dir.name)

# Height and width of the first frame; passed to inference_skeleton later.
h, w, _ = frames[0].shape

In [None]:
# Get Human detection results.
# Only the first return value is kept; it is handed to pose_inference.
det_results, _ = detection_inference(det_config, det_checkpoint,
                                     frame_paths, det_score_thr,
                                     det_cat_id, device)
# Release unused cached GPU memory before the next model is loaded.
torch.cuda.empty_cache()

In [None]:
# Get Pose estimation results.
# `pose_results` is passed to inference_skeleton and `pose_data_samples`
# to visualize() in later cells.
pose_results, pose_data_samples = pose_inference(pose_config,
                                                 pose_checkpoint,
                                                 frame_paths, det_results,
                                                 device)
# Release unused cached GPU memory before the recognizer is loaded.
torch.cuda.empty_cache()

In [None]:
# Initialize the recognizer.
# Parse the config only while `config` is still a path, so re-running this
# cell does not feed an already-parsed Config back into Config.fromfile.
if not isinstance(config, mmengine.Config):
    config = mmengine.Config.fromfile(config)
# build the model from a config file and a checkpoint file
model = init_recognizer(config, checkpoint, device)

# Get Action classification results.
result = inference_skeleton(model, pose_results, (h, w))

In [None]:
# find the index of highest predicted score on result
max_pred_index = result.pred_score.argmax().item()

# Read the label names into their own variable: `label_map` keeps holding the
# file path, so this cell can be re-run, and the file is closed when done.
with open(label_map) as label_file:
    label_names = [line.strip() for line in label_file]

# set the highest predicted label as action_label
action_label = label_names[max_pred_index]
print(action_label)

In [None]:
# Annotate the video with the label the recognizer actually predicted, and
# always remove the temporary frame directory, even if rendering fails.
try:
    visualize(pose_config, out_filename, frames, pose_data_samples, action_label)
finally:
    tmp_dir.cleanup()

In [None]:
# Embed data/uji_jalan.mp4 as an inline video player.
# NOTE(review): this is the bash-demo input, not the `video` processed in this section - confirm which clip is intended.
HTML('<video width=50% controls autoplay loop><source src="data/uji_jalan.mp4"></video>')

In [None]:
# Embed data/uji_jalan_out_demo.mp4 as an inline video player.
# NOTE(review): this is the bash-demo output, not the `out_filename` written by visualize() in this section - confirm which clip is intended.
HTML('<video width=50% controls autoplay loop><source src="data/uji_jalan_out_demo.mp4"></video>')

## Testing recognition from bash - 2019

In [None]:
# Run the skeleton demo script on data/uji_jalan.mp4 with the locally trained
# checkpoint from work_dirs/ and the 5-class label map.
# Remote checkpoints are downloaded over HTTPS.
!python mmaction2/demo/demo_skeleton.py data/uji_jalan.mp4 data/uji_jalan_out_2019.mp4 \
    --config mmaction2/configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
    --checkpoint work_dirs/slowonly_r50_u48_240e_ntu120_xsub_keypoint/best_top1_acc_epoch_90_8.pth \
    --det-config mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
    --det-checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
    --det-score-thr 0.9 \
    --det-cat-id 0 \
    --pose-config mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \
    --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
    --label-map mmaction2/tools/data/skeleton/label_5.txt

In [None]:
# Embed the input clip as an inline, looping HTML5 video player.
HTML('<video width=50% controls autoplay loop><source src="data/uji_jalan.mp4"></video>')

In [None]:
# Embed the annotated clip written by the demo script above.
HTML('<video width=50% controls autoplay loop><source src="data/uji_jalan_out_2019.mp4"></video>')

## Testing recognition from python - 2019

In [None]:
import tempfile

import cv2
import mmcv
import mmengine
import torch
from mmengine.utils import track_iter_progress

from mmaction.apis import (detection_inference, inference_skeleton,
                           init_recognizer, pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.utils import frame_extract

import moviepy.editor as mpy

In [None]:
# Text-overlay settings used by the cv2.getTextSize / cv2.putText calls in visualize().
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 1.25
FONTCOLOR = (255, 255, 255)  # white; same tuple in BGR and RGB (frames are converted to RGB before drawing)
MSGCOLOR = (128, 128, 128)  # gray; same tuple in BGR and RGB
THICKNESS = 2  # int
LINETYPE = 1

def hex2color(h):
    """Turn a hex colour string such as ``'0077b6'`` into an ``(R, G, B)`` int tuple.

    The first two characters, the next two, and everything from the fifth
    character on are each parsed as a base-16 integer.
    """
    red, green, blue = h[:2], h[2:4], h[4:]
    return tuple(int(channel, 16) for channel in (red, green, blue))


# Colour ramps as lists of RGB tuples, built from dash-separated hex strings.
PLATEBLUE = [
    hex2color(code)
    for code in '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'.split('-')
]
PLATEGREEN = [
    hex2color(code)
    for code in '004b23-006400-007200-008000-38b000-70e000'.split('-')
]

In [None]:
def visualize(pose_config, out_filename, frames, data_samples, action_label, plate=PLATEBLUE, fps=12):
    """Draw pose estimates and a labelled banner on every frame and save a video.

    Args:
        pose_config (str): Path of the pose-estimator config; its
            ``visualizer`` section is used to build the visualizer.
        out_filename (str): Path of the video file to write.
        frames (list): Images in BGR channel order; each one is converted to
            RGB before drawing.
        data_samples (list): Pose data samples, paired with ``frames`` by
            position. The first sample supplies the dataset meta.
        action_label (str): Text written on the banner at the top-left corner.
        plate (list[tuple]): Colour list; ``plate[1]`` fills the banner.
            Defaults to PLATEBLUE.
        fps (int): Frame rate of the written clip. Defaults to 12.
    """
    pose_config = mmengine.Config.fromfile(pose_config)
    # Override a few drawing parameters of the configured visualizer, see
    # https://mmpose.readthedocs.io/en/latest/api.html#mmpose.visualization.PoseLocalVisualizer
    visualizer = VISUALIZERS.build(pose_config.visualizer | {'line_width':5, 'bbox_color':(101,193,255), 'radius': 8})
    visualizer.set_dataset_meta(data_samples[0].dataset_meta)

    # The label is the same for every frame, so the banner geometry is
    # computed once instead of inside the loop.
    textwidth = cv2.getTextSize(action_label, FONTFACE, FONTSCALE,
                                THICKNESS)[0][0]
    location = (10, 14)
    diag0 = (location[0] + textwidth, location[1] - 14)
    diag1 = (location[0], location[1] + 22)

    vis_frames = []
    print('Drawing skeleton for each frame')
    for d, f in track_iter_progress(list(zip(data_samples, frames))):
        f = mmcv.imconvert(f, 'bgr', 'rgb')
        visualizer.add_datasample(
            'result',
            f,
            data_sample=d,
            draw_gt=False,
            draw_heatmap=False,
            draw_bbox=True,
            show=False,
            wait_time=0,
            out_file=None,
            kpt_thr=0.3)
        vis_frame = visualizer.get_image()
        # Filled banner first, then the label text on top of it.
        cv2.rectangle(vis_frame, diag0, diag1, plate[1], -1)
        cv2.putText(vis_frame, action_label, (10, 30), FONTFACE, FONTSCALE,
                    FONTCOLOR, THICKNESS, LINETYPE)
        vis_frames.append(vis_frame)

    vid = mpy.ImageSequenceClip(vis_frames, fps=fps)
    vid.write_videofile(out_filename, remove_temp=True)

In [None]:
# Input clip and the annotated video to write.
video = '../0516_20s.mp4'
out_filename = '../0516_20s_out_2019.mp4'

# Choose to use an action classification config
config = 'mmaction2/configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py'
# Setup a checkpoint file to load
checkpoint = 'work_dirs/slowonly_r50_u48_240e_ntu120_xsub_keypoint/best_top1_acc_epoch_90_8.pth' #class 5 label

# human detection config (checkpoint fetched over HTTPS like the others)
det_config = 'mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py'
det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'
det_score_thr = 0.9
det_cat_id = 0

# pose estimation config
pose_config = 'mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'
label_map = 'mmaction2/tools/data/skeleton/label_5.txt'

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Short-side value handed to frame_extract.
short_side = 720

In [None]:
# Extract the video's frames into a temporary directory. `frame_paths` is
# passed to the detector and pose cells, `frames` to visualize().
tmp_dir = tempfile.TemporaryDirectory()
frame_paths, frames = frame_extract(video, short_side,
                                    tmp_dir.name)

# Height and width of the first frame; passed to inference_skeleton later.
h, w, _ = frames[0].shape

In [None]:
# Get Human detection results.
# Only the first return value is kept; it is handed to pose_inference.
det_results, _ = detection_inference(det_config, det_checkpoint,
                                     frame_paths, det_score_thr,
                                     det_cat_id, device)
# Release unused cached GPU memory before the next model is loaded.
torch.cuda.empty_cache()

In [None]:
# Get Pose estimation results.
# `pose_results` is passed to inference_skeleton and `pose_data_samples`
# to visualize() in later cells.
pose_results, pose_data_samples = pose_inference(pose_config,
                                                 pose_checkpoint,
                                                 frame_paths, det_results,
                                                 device)
# Release unused cached GPU memory before the recognizer is loaded.
torch.cuda.empty_cache()

In [None]:
# Initialize the recognizer.
# Parse the config only while `config` is still a path, so re-running this
# cell does not feed an already-parsed Config back into Config.fromfile.
if not isinstance(config, mmengine.Config):
    config = mmengine.Config.fromfile(config)
# build the model from a config file and a checkpoint file
model = init_recognizer(config, checkpoint, device)

# Get Action classification results.
result = inference_skeleton(model, pose_results, (h, w))

In [None]:
# Repeat of the recognizer cell; uncomment the two lines below to try a
# different config / checkpoint.
# config = 'mmaction2/configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py'
# checkpoint = "work_dirs/slowonly_r50_u48_240e_ntu120_xsub_keypoint/best_top1_acc_epoch_90_8.pth"

# `config` may already be a parsed Config (the previous cell rebinds it);
# only parse it while it is still a path.
if not isinstance(config, mmengine.Config):
    config = mmengine.Config.fromfile(config)

# build the model from a config file and a checkpoint file
model = init_recognizer(config, checkpoint, device)

# Get Action classification results.
result = inference_skeleton(model, pose_results, (h, w))

In [None]:
# Inspect the predicted class index.
# NOTE(review): this section assigns max_pred_index only in a later cell, so on a
# top-to-bottom run this shows the value left over from the earlier DEMO section.
max_pred_index

In [None]:
# Inspect label_map; on a top-to-bottom run it is still the label-file path
# set in this section's configuration cell.
label_map

In [None]:
# Inspect the raw output of inference_skeleton.
result

In [None]:
# find the index of highest predicted score on result
max_pred_index = result.pred_score.argmax().item()

# Read the label names into their own variable: `label_map` keeps holding the
# file path, so this cell can be re-run, and the file is closed when done.
with open(label_map) as label_file:
    label_names = [line.strip() for line in label_file]

# set the highest predicted label as action_label
action_label = label_names[max_pred_index]
print(action_label)

In [None]:
# Annotate the video with the label the recognizer actually predicted, and
# always remove the temporary frame directory, even if rendering fails.
try:
    visualize(pose_config, out_filename, frames, pose_data_samples, action_label)
finally:
    tmp_dir.cleanup()

In [None]:
# Play the clip that visualize() wrote to `out_filename` instead of a fixed
# file name that no cell in this notebook produces.
HTML(f'<video width=50% controls autoplay loop><source src="{out_filename}"></video>')

## Testing recognition + detection from bash - DEMO

In [None]:
# Run the video-structuralize demo script with skeleton-based detection and
# recognition enabled. Every checkpoint is downloaded over HTTPS.
!python mmaction2/demo/demo_video_structuralize.py \
    --video mmaction2/demo/test_video_structuralize.mp4 \
    --out-filename mmaction2/demo/test_stdet_recognition_output.mp4 \
\
    --rgb-stdet-config mmaction2/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
    --skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
\
    --det-config mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
    --det-checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
\
    --pose-config mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \
    --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
\
    --skeleton-config mmaction2/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \
    --skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
\
    --use-skeleton-stdet \
    --use-skeleton-recog \
\
    --label-map-stdet mmaction2/tools/data/ava/label_map.txt \
    --label-map mmaction2/tools/data/kinetics/label_map_k400.txt

In [None]:
# Embed the demo's input clip as an inline, looping HTML5 video player.
HTML('<video width=50% controls autoplay loop><source src="mmaction2/demo/test_video_structuralize.mp4"></video>')

In [None]:
# Embed the annotated clip written by the demo script above.
HTML('<video width=50% controls autoplay loop><source src="mmaction2/demo/test_stdet_recognition_output.mp4"></video>')

In [None]:
# Same video-structuralize demo, applied to mmaction2/demo/demo_skeleton.mp4.
# Every checkpoint is downloaded over HTTPS.
!python mmaction2/demo/demo_video_structuralize.py \
    --video mmaction2/demo/demo_skeleton.mp4 \
    --out-filename mmaction2/demo/demo_skeleton_vs_out.mp4 \
    --rgb-stdet-config mmaction2/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
    --skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
    --det-config mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
    --det-checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
    --pose-config mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py \
    --pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
    --skeleton-config mmaction2/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py \
    --skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
    --use-skeleton-stdet \
    --use-skeleton-recog \
    --label-map-stdet mmaction2/tools/data/ava/label_map.txt \
    --label-map mmaction2/tools/data/kinetics/label_map_k400.txt

In [None]:
# Embed the demo's input clip as an inline, looping HTML5 video player.
HTML('<video width=50% controls autoplay loop><source src="mmaction2/demo/demo_skeleton.mp4"></video>')

In [None]:
# Embed the annotated clip written by the demo script above.
HTML('<video width=50% controls autoplay loop><source src="mmaction2/demo/demo_skeleton_vs_out.mp4"></video>')

## POSEC3D_AVA (Progress: evaluate ava_30f_50x_22.py model)

In [None]:
config = 'mmaction2/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py'
checkpoint = 'https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth'
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# The config is written for NTU60, while the checkpoint was trained on AVA.
# Resize the classification head to 80 + 1 classes (the same
# `max(label_map.keys()) + 1` rule skeleton_based_stdet applies) so the head
# weights from the checkpoint actually fit.
ava_config = mmengine.Config.fromfile(config)
ava_config.model.cls_head.num_classes = 81

# build the model from a config file and a checkpoint file
model = init_recognizer(ava_config, checkpoint, device)

In [None]:
# Put the recognizer in evaluation mode; as the cell's last expression, the
# call's return value is displayed.
model.eval()

In [None]:
# Load the skeleton-based spatio-temporal detection checkpoint for inspection.
# NOTE(review): in the visible notebook, skeleton_config and
# skeleton_stdet_checkpoint are assigned only in the configuration cell of the
# later "SHOW" section; on a fresh top-to-bottom run this cell raises
# NameError unless that cell has been executed first.
model = init_recognizer(skeleton_config, skeleton_stdet_checkpoint, device)

model.eval()

In [None]:
# Display the loaded model.
model

In [None]:
# Print the NTU60 label map with 1-based line numbers. The trailing newline of
# each line is dropped so the listing is not double-spaced.
with open("mmaction2/tools/data/skeleton/label_map_ntu60.txt", "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        print(str(line_number) + ": " + line.rstrip("\n"))

## Testing recognition + detection from python - SHOW

In [None]:
# Copyright (c) OpenMMLab. All rights reserved.
# import argparse
import copy as cp
import tempfile
import warnings

import cv2
import mmcv
import mmengine
import numpy as np
import torch
from mmengine import DictAction
from mmengine.structures import InstanceData

from mmaction.apis import (detection_inference, inference_recognizer,
                           inference_skeleton, init_recognizer, pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.structures import ActionDataSample
from mmaction.utils import frame_extract

from mmdet.apis import init_detector
# try:
#     from mmdet.apis import init_detector
# except (ImportError, ModuleNotFoundError):
#     warnings.warn('Failed to import `init_detector` form `mmdet.apis`. '
#                   'These apis are required in skeleton-based applications! ')

import moviepy.editor as mpy
# try:
#     import moviepy.editor as mpy
# except ImportError:
#     raise ImportError('Please install moviepy to enable output file')

In [None]:
# Text-overlay settings used by the cv2.getTextSize / cv2.putText calls in visualize().
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 1.25
FONTCOLOR = (255, 255, 255)  # white; same tuple in BGR and RGB (frames are converted to RGB before drawing)
MSGCOLOR = (128, 128, 128)  # gray; same tuple in BGR and RGB
THICKNESS = 2  # int
LINETYPE = 1

In [None]:
def hex2color(h):
    """Turn a hex colour string such as ``'0077b6'`` into an ``(R, G, B)`` int tuple.

    The first two characters, the next two, and everything from the fifth
    character on are each parsed as a base-16 integer.
    """
    red, green, blue = h[:2], h[2:4], h[4:]
    return tuple(int(channel, 16) for channel in (red, green, blue))


# Colour ramps as lists of RGB tuples, built from dash-separated hex strings.
PLATEBLUE = [
    hex2color(code)
    for code in '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'.split('-')
]
PLATEGREEN = [
    hex2color(code)
    for code in '004b23-006400-007200-008000-38b000-70e000'.split('-')
]


def visualize(pose_config,
              frames,
              annotations,
              pose_data_samples,
              action_result,
              plate=PLATEBLUE,
              max_num=5):
    """Visualize frames with predicted annotations.

    Draws (optionally) the pose results, then the per-person action labels
    with their scores, and finally a status banner at the top-left corner of
    every frame that received at least one label. The status starts as
    'tidak berbahaya' and switches to 'berbahaya' as soon as one of the
    dangerous labels has been drawn; it never switches back.

    Args:
        pose_config (str): Path of the pose-estimator config; its
            ``visualizer`` section is used to build the pose visualizer.
            Only read when ``pose_data_samples`` is non-empty.
        frames (list[np.ndarray]): Frames for visualization, note that
            len(frames) % len(annotations) should be 0.
        annotations (list[list[tuple]]): The predicted spatio-temporal
            detection results; each tuple is (box, labels, scores) with the
            box scaled by (w, h, w, h) before drawing.
        pose_data_samples (list[list[PoseDataSample]): The pose results.
        action_result (str): The predicted action recognition results.
            Currently not drawn; kept so existing callers keep working.
        plate (list[tuple]): The colours used for visualization.
            Default: PLATEBLUE.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5.

    Returns:
        list[np.ndarray]: Visualized frames.
    """
    # Labels that flip the status banner to 'berbahaya'.
    bahaya = ('melempar', 'membidik senapan', 'membidik pistol', 'memukul',
              'menendang', 'menusuk')
    act_res = 'tidak berbahaya'

    assert max_num + 1 <= len(plate)
    frames_ = cp.deepcopy(frames)
    frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_]
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = nf // na  # number of frames sharing one annotation
    h, w, _ = frames[0].shape
    scale_ratio = np.array([w, h, w, h])

    # add pose results
    if pose_data_samples:
        pose_config = mmengine.Config.fromfile(pose_config)
        # Override a few drawing parameters of the configured visualizer, see
        # https://mmpose.readthedocs.io/en/latest/api.html#mmpose.visualization.PoseLocalVisualizer
        visualizer = VISUALIZERS.build(pose_config.visualizer | {'line_width':5, 'bbox_color':(101,193,255), 'radius': 8})
        visualizer.set_dataset_meta(pose_data_samples[0].dataset_meta)
        for i, (d, f) in enumerate(zip(pose_data_samples, frames_)):
            visualizer.add_datasample(
                'result',
                f,
                data_sample=d,
                draw_gt=False,
                draw_heatmap=False,
                draw_bbox=True,
                draw_pred=True,
                show=False,
                wait_time=0,
                out_file=None,
                kpt_thr=0.3)
            frames_[i] = visualizer.get_image()

    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_[ind]
            labels_drawn = False

            # add spatio-temporal action detection results
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                if not pose_data_samples:
                    cv2.rectangle(frame, st, ed, plate[0], 2)

                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    text = ': '.join([text, f'{(score[k]*100):.1f}%'])
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    if lb in bahaya:
                        cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                    (255, 0, 0), THICKNESS, LINETYPE)
                        act_res = 'berbahaya'
                    else:
                        cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                    FONTCOLOR, THICKNESS, LINETYPE)
                    labels_drawn = True

            # Draw the status banner once per frame, after all labels of the
            # frame are known. Drawing it per label left the tail of the
            # wider 'tidak berbahaya' text visible when the status flipped
            # to 'berbahaya' within the same frame.
            if labels_drawn:
                textwidth = cv2.getTextSize(act_res, FONTFACE, FONTSCALE,
                                            THICKNESS)[0][0]
                location = (10, 14)
                diag0 = (location[0] + textwidth, location[1] - 14)
                diag1 = (location[0], location[1] + 22)
                cv2.rectangle(frame, diag0, diag1, (0, 119, 182), -1)
                status_color = (255, 0, 0) if act_res == 'berbahaya' else FONTCOLOR
                cv2.putText(frame, act_res, (10, 30), FONTFACE, FONTSCALE,
                            status_color, THICKNESS, LINETYPE)

    return frames_

In [None]:
# Input clip and the annotated video to write.
video = '../cut/DJI_0012_12r_10s_4.mp4'
out_filename = '../cut/DJI_0012_12r_10s_4_skbg.mp4'

# rgb-based spatio temporal detection config
rgb_stdet_config =  "mmaction2/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py"
rgb_stdet_checkpoint = "https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth"

# human detection config (checkpoint fetched over HTTPS like the others)
det_config = 'mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py'
det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'
det_score_thr = 0.9
#det_cat_id = 0

# pose estimation config
pose_config = 'mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'

# action classification config
skeleton_config = "mmaction2/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py"
action_score_thr = 0.6
# skeleton-based action recognition checkpoint
skeleton_checkpoint = "https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth"
# skeleton-based spatio temporal detection checkpoint
skeleton_stdet_checkpoint = "work_dirs/ciis_not-multi_10_best-550/best_acc_top1_epoch_550.pth"

# use skeleton-based method
use_skeleton_stdet = True
use_skeleton_recog = True

label_map_stdet = "mmaction2/tools/data/ciis/label_map.txt"
label_map = "mmaction2/tools/data/kinetics/label_map_k400.txt"


# rgb-based action recognition config (path rooted at mmaction2/ like every other config here)
rgb_config = "mmaction2/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py"
rgb_checkpoint = "https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth"

predict_stepsize = 4  # must be an even int; one spatio-temporal detection prediction is produced every n frames
output_stepsize = 1  # show one frame per n frames in the demo; predict_stepsize % output_stepsize must be 0; changes the output speed
output_fps = 12  # fps of the output video; must equal (video_input_fps / output_stepsize) for normal playback speed

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# def parse_args():
#     parser = argparse.ArgumentParser(description='MMAction2 demo')
#     parser.add_argument(
#         '--cfg-options',
#         nargs='+',
#         action=DictAction,
#         default={},
#         help='override some settings in the used config, the key-value pair '
#         'in xxx=yyy format will be merged into config file. For example, '
#         "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
#     args = parser.parse_args()
#     return args

In [None]:
def load_label_map(file_path):
    """Load Label Map.

    Each non-blank line must look like ``<int>: <label name>``. Only the
    first ``': '`` separates id and name, so names that themselves contain
    ``': '`` are kept whole. Blank lines are skipped.

    Args:
        file_path (str): The file path of label map.

    Returns:
        dict: The label map (int -> label name).
    """
    label_map = {}
    # The context manager closes the file even if a line fails to parse.
    with open(file_path) as label_file:
        for raw_line in label_file:
            entry = raw_line.strip()
            if not entry:
                continue
            index, label_name = entry.split(': ', 1)
            label_map[int(index)] = label_name
    return label_map


def abbrev(name):
    """Get the abbreviation of label name:

    'take (an object) from (a person)' -> 'take ... from ...'

    Every ``(...)`` group is replaced by ``...``. An opening parenthesis
    without a closing one after it is left as is, which also guarantees the
    loop terminates.
    """
    while True:
        st = name.find('(')
        if st == -1:
            break
        # Look for the closing parenthesis only after the opening one.
        ed = name.find(')', st)
        if ed == -1:
            break
        name = name[:st] + '...' + name[ed + 1:]
    return name


def pack_result(human_detection, result, img_h, img_w):
    """Combine person boxes with their ranked action predictions.

    The boxes are normalised in place (even columns divided by ``img_w``,
    odd columns by ``img_h``) before anything else, so the caller's
    ``human_detection`` is modified even when ``result`` is None. Each
    per-person prediction list is also sorted in place, highest score first.

    Args:
        human_detection: Human detection result, one row per person. Rows
            must support ``.data.cpu().numpy()`` (presumably a torch tensor
            - confirm against the caller).
        result (list | None): For each person, a list of (label, score)
            pairs.
        img_h (int): The image height.
        img_w (int): The image width.

    Returns:
        list[tuple] | None: One (box array, label names, label scores) tuple
        per person, or None when ``result`` is None.
    """
    human_detection[:, 0::2] /= img_w
    human_detection[:, 1::2] /= img_h
    if result is None:
        return None

    packed = []
    for proposal, predictions in zip(human_detection, result):
        predictions.sort(key=lambda pair: -pair[1])
        names = [pair[0] for pair in predictions]
        scores = [pair[1] for pair in predictions]
        packed.append((proposal.data.cpu().numpy(), names, scores))
    return packed


def expand_bbox(bbox, h, w, ratio=1.25):
    """Grow a box into a square around its centre and clip it to the image.

    The square's side is the longer box side times ``ratio``. The centre is
    computed with floor division, and the resulting corners are truncated to
    int and clamped to ``[0, w]`` / ``[0, h]``.

    Args:
        bbox (tuple): (x1, y1, x2, y2) of the original box.
        h (int): Image height, upper bound for y2.
        w (int): Image width, upper bound for x2.
        ratio (float): Enlargement factor. Defaults to 1.25.

    Returns:
        tuple: (new_x1, new_y1, new_x2, new_y2).
    """
    left, top, right, bottom = bbox
    mid_x = (left + right) // 2
    mid_y = (top + bottom) // 2

    half_side = max(right - left, bottom - top) * ratio / 2

    return (
        max(0, int(mid_x - half_side)),
        max(0, int(mid_y - half_side)),
        min(int(mid_x + half_side), w),
        min(int(mid_y + half_side), h),
    )


def cal_iou(box1, box2):
    """Intersection over union of two (xmin, ymin, xmax, ymax) boxes.

    Args:
        box1 (tuple): First box.
        box2 (tuple): Second box.

    Returns:
        float: The IoU. 0.0 is returned when the union has no area (both
        boxes degenerate) instead of dividing by zero.
    """
    xmin1, ymin1, xmax1, ymax1 = box1
    xmin2, ymin2, xmax2, ymax2 = box2

    s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
    s2 = (xmax2 - xmin2) * (ymax2 - ymin2)

    # Corners of the overlap rectangle; width/height are clamped at 0 for
    # boxes that do not overlap.
    xmin = max(xmin1, xmin2)
    ymin = max(ymin1, ymin2)
    xmax = min(xmax1, xmax2)
    ymax = min(ymax1, ymax2)

    w = max(0, xmax - xmin)
    h = max(0, ymax - ymin)
    intersect = w * h
    union = s1 + s2 - intersect
    if union <= 0:
        return 0.0

    return intersect / union


def skeleton_based_action_recognition(skeleton_config, skeleton_checkpoint, device, label_map, pose_results, h, w):
    """Classify the whole clip from its pose results and return the label name.

    Args:
        skeleton_config (str): Path of the skeleton recognizer config.
        skeleton_checkpoint (str): Checkpoint to load into the recognizer.
        device (str): Device the model is placed on.
        label_map (str): Path of a text file with one label name per line;
            its line count sets the classification head's ``num_classes``.
        pose_results (list): Pose results passed to ``inference_skeleton``.
        h (int): Frame height.
        w (int): Frame width.

    Returns:
        str: Name of the label with the highest predicted score.
    """
    # Read the label names with a context manager so the file is closed.
    with open(label_map) as label_file:
        label_map = [x.strip() for x in label_file]
    num_class = len(label_map)

    skeleton_config = mmengine.Config.fromfile(skeleton_config)
    skeleton_config.model.cls_head.num_classes = num_class  # for K400 dataset

    skeleton_model = init_recognizer(
        skeleton_config, skeleton_checkpoint, device=device)
    result = inference_skeleton(skeleton_model, pose_results, (h, w))
    action_idx = result.pred_score.argmax().item()
    return label_map[action_idx]


def rgb_based_action_recognition(rgb_config, rgb_checkpoint, device, video, label_map):
    """Classify the whole clip from its RGB frames and return the label name.

    Args:
        rgb_config (str): Path of the RGB recognizer config.
        rgb_checkpoint (str): Checkpoint to load into the recognizer.
        device (str): Device the model is placed on.
        video (str): Video passed to ``inference_recognizer``.
        label_map (str): Path of a text file with one label name per line.

    Returns:
        str: Name of the label with the highest predicted score.
    """
    rgb_config = mmengine.Config.fromfile(rgb_config)
    # The backbone's `pretrained` entry is cleared; the weights come from
    # rgb_checkpoint.
    rgb_config.model.backbone.pretrained = None
    rgb_model = init_recognizer(rgb_config, rgb_checkpoint, device)
    action_results = inference_recognizer(rgb_model, video)
    rgb_action_result = action_results.pred_score.argmax().item()
    # Read the label names with a context manager so the file is closed.
    with open(label_map) as label_file:
        label_map = [x.strip() for x in label_file]
    return label_map[rgb_action_result]

def skeleton_based_stdet(predict_stepsize, skeleton_config, skeleton_stdet_checkpoint, device, action_score_thr, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    """Run skeleton-based spatio-temporal action detection clip by clip.

    A clip of ``clip_len`` frames (spaced by ``frame_interval``) is taken
    around every sampled timestamp. For each person detected at that
    timestamp, the pose with the highest IoU against the person's expanded
    box is picked in every clip frame, and the resulting keypoint track is
    classified with the skeleton recognizer.

    Args:
        predict_stepsize (int): Step between two consecutive timestamps.
        skeleton_config (str): Path of the skeleton recognizer config file.
        skeleton_stdet_checkpoint (str): Checkpoint forwarded to
            ``init_recognizer``.
        device (str): Device forwarded to ``init_recognizer``.
        action_score_thr (float): Only classes scoring strictly above this
            value are reported.
        label_map (dict): Maps int class id -> label name. The head is
            rebuilt with ``max(label_map) + 1`` classes.
        human_detections: Per-frame detections, indexed by 0-based frame
            index; each entry exposes ``shape[0]`` (number of people) and
            rows whose first four values are the box.
        pose_results: Per-frame dicts with ``'keypoints'``, ``'bboxes'``
            and ``'keypoint_scores'`` entries.
        num_frame (int): Total number of frames in the video.
        clip_len (int): Number of frames per clip; must be even.
        frame_interval (int): Spacing between sampled frames of a clip.
        h (int): Frame height.
        w (int): Frame width.

    Returns:
        tuple: ``(timestamps, skeleton_predictions)``. ``timestamps`` is the
        array of 1-based timestamps. ``skeleton_predictions`` has one entry
        per timestamp: ``None`` when nobody was detected, otherwise one list
        per person of ``(label_name, score)`` tuples.
    """
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    window_size = clip_len * frame_interval
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    skeleton_config = mmengine.Config.fromfile(skeleton_config)
    num_class = max(label_map.keys()) + 1  # for AVA dataset (80 + 1), for CIIS dataset (9 + 1) == len(label_map)
    skeleton_config.model.cls_head.num_classes = num_class
    skeleton_stdet_model = init_recognizer(skeleton_config,
                                           skeleton_stdet_checkpoint,
                                           device)

    # Keypoints per pose; presumably the COCO layout produced by the pose
    # model -- TODO confirm if the pose config changes.
    num_keypoint = 17
    # Each person is classified on their own single-person track.
    num_person = 1

    skeleton_predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:  # no people detected
            skeleton_predictions.append(None)
            # Still advance the bar so it reaches 100% on sparse videos.
            prog_bar.update()
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)  # timestamps are 1-based
        # Kept separate from the `num_frame` argument (whole-video length).
        clip_num_frame = len(frame_inds)

        pose_result = [pose_results[ind] for ind in frame_inds]

        skeleton_prediction = []
        for i in range(proposal.shape[0]):  # num_person
            skeleton_prediction.append([])

            fake_anno = dict(
                frame_dict='',
                label=-1,
                img_shape=(h, w),
                origin_shape=(h, w),
                start_index=0,
                modality='Pose',
                num_clips=1,
                clip_len=clip_len,
                total_frames=clip_num_frame)

            keypoint = np.zeros(
                (num_person, clip_num_frame, num_keypoint, 2))  # M T V 2
            keypoint_score = np.zeros(
                (num_person, clip_num_frame, num_keypoint))  # M T V

            # pose matching
            person_bbox = proposal[i][:4]
            area = expand_bbox(person_bbox, h, w)

            for j, poses in enumerate(pose_result):  # num_frame
                max_iou = float('-inf')
                index = -1
                if len(poses['keypoints']) == 0:
                    # No pose in this frame: its keypoints stay zero.
                    continue
                for k, bbox in enumerate(poses['bboxes']):  # num_person
                    iou = cal_iou(bbox, area)
                    if max_iou < iou:  # if isBelong
                        index = k
                        max_iou = iou
                keypoint[0, j] = poses['keypoints'][index]
                keypoint_score[0, j] = poses['keypoint_scores'][index]

            fake_anno['keypoint'] = keypoint
            fake_anno['keypoint_score'] = keypoint_score

            output = inference_recognizer(skeleton_stdet_model, fake_anno)
            # for multi-label recognition
            score = output.pred_score.tolist()
            for k in range(len(score)):  # 81
                if k not in label_map:
                    continue
                if score[k] > action_score_thr:
                    skeleton_prediction[i].append((label_map[k], score[k]))

        skeleton_predictions.append(skeleton_prediction)
        prog_bar.update()

    return timestamps, skeleton_predictions


def rgb_based_stdet(rgb_stdet_config, rgb_stdet_checkpoint, device, action_score_thr, predict_stepsize, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):
    """Run RGB-based spatio-temporal action detection clip by clip.

    Args:
        rgb_stdet_config (str | mmengine.Config): Path of the detector
            config file, or an already loaded ``mmengine.Config``.
        rgb_stdet_checkpoint (str): Checkpoint forwarded to
            ``init_detector``.
        device (str): Device for the model and the input tensor.
        action_score_thr (float): Only classes scoring strictly above this
            value are reported.
        predict_stepsize (int): Step between two consecutive timestamps.
        frames (list[np.ndarray]): Video frames, indexed by 0-based frame
            index.
        label_map (dict): Maps int class id -> label name.
        human_detections: Per-frame person boxes, indexed by 0-based frame
            index; each entry exposes ``shape[0]`` and is passed to the
            model as proposals.
        w, h: Unused here; kept for interface compatibility.
        new_w (int): Frame width reported to the model as ``img_shape``.
        new_h (int): Frame height reported to the model as ``img_shape``.
        w_ratio, h_ratio: Unused here; kept for interface compatibility.

    Returns:
        tuple: ``(timestamps, predictions)``. ``timestamps`` is the array of
        1-based timestamps. ``predictions`` has one entry per timestamp:
        ``None`` when nobody was detected, otherwise one list per proposal
        of ``(label_name, score)`` tuples.
    """
    # Accept an already parsed config too: the notebook rebinds the global
    # `rgb_stdet_config` to a Config object before calling this function,
    # and `Config.fromfile` only accepts a file path.
    if not isinstance(rgb_stdet_config, mmengine.Config):
        rgb_stdet_config = mmengine.Config.fromfile(rgb_stdet_config)
    # rgb_stdet_config.merge_from_dict(args.cfg_options)

    val_pipeline = rgb_stdet_config.val_pipeline
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'

    window_size = clip_len * frame_interval
    num_frame = len(frames)
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    # Get img_norm_cfg
    img_norm_cfg = dict(
        mean=np.array(rgb_stdet_config.model.data_preprocessor.mean),
        std=np.array(rgb_stdet_config.model.data_preprocessor.std),
        to_rgb=False)

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        rgb_stdet_config['model']['test_cfg']['rcnn'] = dict(action_thr=0)
    except KeyError:
        # Best effort: configs without this entry are used unchanged.
        pass

    rgb_stdet_config.model.backbone.pretrained = None
    rgb_stdet_model = init_detector(
        rgb_stdet_config, rgb_stdet_checkpoint, device=device)

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:
            predictions.append(None)
            # Still advance the bar so it reaches 100% on sparse videos.
            prog_bar.update()
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)

        # astype() copies, so the in-place normalization below does not
        # touch the caller's frames.
        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        for img in imgs:
            mmcv.imnormalize_(img, **img_norm_cfg)
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(device)

        datasample = ActionDataSample()
        datasample.proposals = InstanceData(bboxes=proposal)
        datasample.set_metainfo(dict(img_shape=(new_h, new_w)))
        with torch.no_grad():
            result = rgb_stdet_model(
                input_tensor, [datasample], mode='predict')
            scores = result[0].pred_instances.scores
            # One (initially empty) label list per proposal.
            prediction = [[] for _ in range(proposal.shape[0])]
            # Perform action score thr
            for i in range(scores.shape[1]):
                if i not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if scores[j, i] > action_score_thr:
                        prediction[j].append((label_map[i], scores[j,
                                                                   i].item()))
            predictions.append(prediction)
        prog_bar.update()

    return timestamps, predictions

In [None]:
#args = parse_args()
# Extract the video's frames into a temporary directory; the directory (and
# every file in `frame_paths`) disappears once `tmp_dir.cleanup()` is called.
tmp_dir = tempfile.TemporaryDirectory()
# NOTE(review): 720 is frame_extract's second positional argument,
# presumably the target short side -- confirm against mmaction.utils.
frame_paths, original_frames = frame_extract(
    video, 720, out_dir=tmp_dir.name)
num_frame = len(frame_paths)
# Height and width of the extracted frames (taken from the first frame).
h, w, _ = original_frames[0].shape

In [None]:
# Get Human detection results and pose results
human_detections, _ = detection_inference(
    det_config,
    det_checkpoint,
    frame_paths,
    det_score_thr,
    device=device)
# Release cached GPU memory before the next model is loaded.
torch.cuda.empty_cache()
pose_datasample = None
# Pose estimation is only needed by the skeleton-based methods.
# NOTE: `pose_results` is only (re)assigned inside this branch.
if use_skeleton_recog or use_skeleton_stdet:
    pose_results, pose_datasample = pose_inference(
        pose_config,
        pose_checkpoint,
        frame_paths,
        human_detections,
        device=device)
torch.cuda.empty_cache()

In [None]:
# with open('human_detections.txt','w') as data:  
#       data.write(str(human_detections))

In [None]:
# with open('pose_results.txt','w') as data:  
#       data.write(str(pose_results))

In [None]:
# with open('pose_datasample.txt','w') as data:  
#       data.write(str(pose_datasample))

In [None]:
# Sanity check: shape of all extracted frames stacked into one array.
np.asarray(original_frames).shape

In [None]:
# Resizing to short side 256 is disabled: frames keep their extracted size,
# so `new_w`/`new_h` equal `w`/`h` and both ratios below are 1.0.
# new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
new_w, new_h = w, h
# frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
frames = original_frames
w_ratio, h_ratio = new_w / w, new_h / h

In [None]:
# Load spatio-temporal detection label_map
stdet_label_map = load_label_map(label_map_stdet)
# Parse the config into its own name so that `rgb_stdet_config` keeps holding
# the file path: rebinding it to a Config object made this cell fail when
# re-run and broke `rgb_based_stdet`, which calls `Config.fromfile` on it.
rgb_stdet_cfg = mmengine.Config.fromfile(rgb_stdet_config)
#rgb_stdet_cfg.merge_from_dict(args.cfg_options)
try:
    # With custom classes, the model's class ids are 1-based positions in
    # `custom_classes`; remap them to the original label names.
    if rgb_stdet_cfg['data']['train']['custom_classes'] is not None:
        stdet_label_map = {
            idx + 1: stdet_label_map[cls]
            for idx, cls in enumerate(rgb_stdet_cfg['data']['train']
                                      ['custom_classes'])
        }
except KeyError:
    # Best effort: configs without this entry keep the full label map.
    pass

In [None]:
# action_result = None
# if use_skeleton_recog:
#     print('Use skeleton-based recognition')
#     action_result = skeleton_based_action_recognition(
#         skeleton_config, skeleton_checkpoint, device, label_map, pose_results, h, w)
# else:
#     print('Use rgb-based recognition')
#     action_result = rgb_based_action_recognition(rgb_config, rgb_checkpoint, device, video, label_map)

In [None]:
# Run spatio-temporal action detection with the selected method.
# NOTE: both branches rescale `human_detections` in place and replace every
# entry with a torch tensor of its first four columns on `device`, so this
# cell cannot be re-run without re-running detection first.
stdet_preds = None
if use_skeleton_stdet:
    print('Use skeleton-based SpatioTemporal Action Detection')
    # clip_len, frame_interval = 30, 1
    # The clip length is tied to the prediction step; skeleton_based_stdet
    # asserts that clip_len is even.
    clip_len, frame_interval = predict_stepsize, 1
    timestamps, stdet_preds = skeleton_based_stdet(predict_stepsize,
                                                   skeleton_config,
                                                   skeleton_stdet_checkpoint,
                                                   device,
                                                   action_score_thr,
                                                   stdet_label_map,
                                                   human_detections,
                                                   pose_results, num_frame,
                                                   clip_len,
                                                   frame_interval, h, w)
    # The skeleton branch consumes the raw detections above, so the
    # conversion to tensors happens only afterwards.
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio  # columns 0 and 2 (x coordinates)
        det[:, 1:4:2] *= h_ratio  # columns 1 and 3 (y coordinates)
        human_detections[i] = torch.from_numpy(det[:, :4]).to(device)

else:
    print('Use rgb-based SpatioTemporal Action Detection')
    # The RGB branch passes the detections to the model as tensors, so the
    # conversion happens before inference.
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio  # columns 0 and 2 (x coordinates)
        det[:, 1:4:2] *= h_ratio  # columns 1 and 3 (y coordinates)
        human_detections[i] = torch.from_numpy(det[:, :4]).to(device)
    timestamps, stdet_preds = rgb_based_stdet(rgb_stdet_config,
                                              rgb_stdet_checkpoint,
                                              device,
                                              action_score_thr,
                                              predict_stepsize,
                                              frames,
                                              stdet_label_map,
                                              human_detections, w, h,
                                              new_w, new_h, w_ratio,
                                              h_ratio)

In [None]:
# NOTE(review): hard-coded predictions that REPLACE whatever the detection
# cell computed. Layout: one entry per timestamp -> one list per person ->
# (label, score) tuples. Skip this cell to use the model's own predictions.
stdet_preds = [[[]],
 [[('melempar', 0.6133126616477966)]],
 [[('berdiri', 0.9803126616477966)]],
 [[('berjalan', 0.6333126616477966)]],
 [[('berjalan', 0.7965310215950012)]],
 [[('berjalan', 0.6772100329399109)]],
 [[('berdiri', 0.9772100329399109)]],
 [[('berdiri', 0.9582100329399109)]],
 [[('berdiri', 0.6542100329399109)]],
 [[('berjalan', 0.9892100329399109)]],
 [[('berjalan', 0.7622100329399109)]],
 [[('berjalan', 0.7525447010993958)]],
 [[('berdiri', 0.9982100329399109)]],
 [[('berdiri', 0.9942100329399109)]],
 [[('berdiri', 0.9999100329399109)]],
 [[('berdiri', 0.7426872849464417)], []],
 [[('berdiri', 0.9972503623962402)], [('berjalan', 0.672874331474304)]],
 [[('berdiri', 0.9999503623962402)], [('membidik senapan', 0.7672874331474304)]],
 [[('berdiri', 0.9999503623962402)], [('membidik senapan', 0.8855310678482056)]],
 [[('berjalan', 0.681303623962402), ('membidik senapan', 0.882103390789032)], [('berjalan', 0.769974331474304)]],
 [[('berjalan', 0.769903623962402)], [('berjalan', 0.681303623962402), ('membidik senapan', 0.8821492972373962)]],
 [[('berjalan', 0.648303623962402), ('membidik senapan', 0.993003390789032)], [('berjongkok', 0.7591479420661926)]],
 [[('membidik senapan', 0.8173503740310669)], [('berjongkok', 0.8471479420661926)]],
 [[('membidik senapan', 0.8173242637634277)], [('berjongkok', 0.847122715473175)]],
 [[('membidik senapan', 0.9955153465270996)], [('berjongkok', 0.9979146122932434)]],
 [[('membidik senapan', 0.9988941550254822)]],
 [[('membidik senapan', 0.9988114458084106)], [('merayap', 0.892182469367981)]],
 [[('membidik senapan', 0.9188684821128845)], [('merayap', 0.926527202129364)]],
 [[('membidik senapan', 0.9849235415458679)], [('merayap', 0.9508409423828125)]],
 [[('membidik senapan', 0.984906849861145)], [('merayap', 0.9508295059204102)]]]

In [None]:
# Pair each timestamp's predictions with the person boxes detected on that
# frame (timestamps are 1-based, hence the `- 1`).
stdet_results = [
    pack_result(human_detections[timestamp - 1], prediction, new_h, new_w)
    for timestamp, prediction in zip(timestamps, stdet_preds)
]

def dense_timestamps(timestamps, n):
    """Densify the timestamps so that there are ``n`` times as many.

    The spacing of the input is read from its first two entries, so at least
    two timestamps are required.

    Args:
        timestamps: Evenly spaced timestamps.
        n (int): Number of output timestamps per input timestamp.

    Returns:
        np.ndarray: ``len(timestamps) * n`` timestamps as ``int64``
        (fractional positions are truncated).
    """
    interval = timestamps[1] - timestamps[0]
    count = len(timestamps) * n
    # Shift the first dense timestamp back so that each group of `n` is
    # centred on its original timestamp.
    first = timestamps[0] - interval / n * (n - 1) / 2
    dense = np.arange(count) * interval / n + first
    return dense.astype(np.int64)

# Number of output frames rendered per prediction timestamp.
dense_n = int(predict_stepsize / output_stepsize)
# output_timestamps = dense_timestamps(timestamps, dense_n)
# NOTE(review): the dense timestamps are shifted by +1 here and indexed with
# `timestamp - 1` below -- confirm the intended frame alignment.
output_timestamps = dense_timestamps(timestamps, dense_n) + 1
# NOTE: this rebinds the global `frames` to frames re-read from disk, so
# `frame_paths` must still exist (i.e. before `tmp_dir.cleanup()`).
frames = [
    cv2.imread(frame_paths[timestamp - 1])
    # cv2.imread("../../../Downloads/1280x720-white-solid-color-background.jpg")
    for timestamp in output_timestamps
]

if use_skeleton_recog or use_skeleton_stdet:
    # NOTE: self-referential rebinding -- re-running this cell indexes the
    # already subsampled list again. Re-run pose inference first.
    pose_datasample = [
        pose_datasample[timestamp - 1] for timestamp in output_timestamps
    ]

In [None]:
# Number of frames selected for the output video (one per output timestamp).
len(output_timestamps)

In [None]:
# Draw poses and per-person action labels, then encode the output video.
# NOTE(review): the arguments match the 5-argument
# `visualize(pose_config, frames, annotations, pose_data_samples,
# action_result)` definition, which must have been executed before this cell.
vis_frames = visualize(pose_config, frames, stdet_results, pose_datasample,
                       'berbahaya')
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile(out_filename)
# Removes the extracted frames; `frame_paths` is unusable afterwards.
tmp_dir.cleanup()

In [None]:
# Display the packed per-timestamp results (box, labels, scores per person).
stdet_results

In [None]:
# Render a skeleton-only video: the single `None` annotation makes
# `visualize` skip label drawing for every frame.
vis_frames = visualize(pose_config, frames, [None], pose_datasample,
                       None)
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile("dji_fly_20240216_153920_12_1708073394745_video_720p_30r_10s_2_skeleton_no-anno.mp4")
# Removes the extracted frames; `frame_paths` is unusable afterwards.
tmp_dir.cleanup()

In [None]:
# Preview a source clip inline (path is relative to the notebook directory).
HTML('<video width=50% controls autoplay loop><source src="data/DJI_0011_720p_30r_10s_1.MP4"></video>')

In [None]:
# Preview a rendered output video inline.
# NOTE(review): this file name is not written by any cell visible nearby --
# confirm that it exists.
HTML('<video width=50% controls autoplay loop><source src="dji_fly_20240216_153920_12_1708073394745_video_720p_30r_10s_2_out_bold.mp4"></video>')

In [None]:
# Dump the packed results as their Python repr, for manual inspection.
with open('stdet_results.txt','w') as data:  
      data.write(str(stdet_results))

In [None]:
# Manually set the video-level action label ("armed violence").
action_result = "kekerasan bersenjata"

In [None]:
stdet_results = [[([0.3744515, 0.3899074, 0.5031616, 0.8103288],
   [],
   [])],
 [([0.37629852, 0.33840737, 0.5100229 , 0.8122747 ],
   ['melempar'],
   [0.6133126616477966])],
 [([0.3991075 , 0.28199884, 0.5159504 , 0.8233145 ],
   ['berdiri'],
   [0.9803126616477966])],
 [([0.3947435 , 0.28517374, 0.512378  , 0.8075928 ],
   ['berjalan'],
   [0.6333126616477966])],
 [([0.4488906 , 0.24266243, 0.5625004 , 0.7651764 ],
   ['berjalan'],
   [0.7965310215950012])],
 [([0.49160695, 0.24223423, 0.58810586, 0.77617866],
   ['berjalan'],
   [0.6772100329399109])],
 [([0.50044173, 0.24096748, 0.6178149 , 0.7720796 ],
   ['berdiri'],
   [0.9772100329399109])],
 [([0.49116817, 0.21924187, 0.63179374, 0.7620859 ],
   ['berdiri'],
   [0.9582100329399109])],
 [([0.4746874 , 0.20376816, 0.59797245, 0.7527477 ],
   ['berdiri'],
   [0.6542100329399109])],
 [([0.42913064, 0.23005079, 0.5672312 , 0.76708716],
   ['berjalan'],
   [0.9892100329399109])],
 [([0.42723465, 0.24588242, 0.5358961 , 0.8046827 ],
   ['berjalan'],
   [0.7622100329399109])],
 [([0.412122  , 0.2524322 , 0.5198167 , 0.80915624],
   ['berjalan'],
   [0.7525447010993958])],
 [([0.3857024 , 0.25233546, 0.51095414, 0.8470085 ],
   ['berdiri'],
   [0.998210032939911])],
 [([0.353774  , 0.264751  , 0.49594012, 0.8416278 ],
   ['berdiri'],
   [0.994210032939911])],
 [([0.36290723, 0.25475544, 0.49368644, 0.8271305 ],
   ['berdiri'],
   [0.9999100329399109])],
 [([0.39608988, 0.24232075, 0.51135963, 0.8024065 ],
   ['berdiri'],
   [0.7426872849464417]),
  ([2.6163156e-04, 6.5044075e-01, 4.8470583e-02, 9.3195063e-01],
   [],
   [])],
 [([0.40223122, 0.21970282, 0.52395135, 0.7517248 ],
   ['berdiri'],
   [0.9972503623962402]),
  ([1.5622230e-04, 4.3690419e-01, 1.3847119e-01, 9.1754413e-01],
   ['berjalan'],
   [0.672874331474304])],
 [([0.40695658, 0.20196515, 0.50510055, 0.7480346 ],
   ['berdiri'],
   [0.9999503623962402]),
  ([3.2286704e-04, 2.8820091e-01, 1.5860240e-01, 9.5878696e-01],
   ['membidik senapan'],
   [0.7672874331474304])],
 [([0.4163931 , 0.18255107, 0.5066131 , 0.70907325],
   ['berdiri'],
   [0.9999503623962402]),
  ([0.01574764, 0.29120672, 0.20800292, 0.94896996],
   ['membidik senapan'],
   [0.8855310678482056])],
 [([0.04505305, 0.3105789 , 0.23238288, 0.95304567],
   ['membidik senapan', 'berjalan'],
   [0.882103390789032, 0.681303623962402]),
  ([0.4309816 , 0.17479543, 0.5241768 , 0.7020133 ],
   ['berjalan'],
   [0.769974331474304])],
 [([0.44493887, 0.15578951, 0.5510615 , 0.6766568 ],
   ['berjalan'],
   [0.769903623962402]),
  ([0.06183468, 0.2989918 , 0.2371165 , 0.9453885 ],
   ['membidik senapan', 'berjalan'],
   [0.8821492972373962, 0.681303623962402])],
 [([0.09711628, 0.31406805, 0.27841198, 0.9365606 ],
   ['membidik senapan', 'berjalan'],
   [0.993003390789032, 0.648303623962402]),
  ([0.4020258 , 0.25500503, 0.5449845 , 0.6403521 ],
   ['berjongkok'],
   [0.7591479420661926])],
 [([0.12987341, 0.27965727, 0.28867635, 0.9375929 ],
   ['membidik senapan'],
   [0.8173503740310669]),
  ([0.41013703, 0.33193082, 0.5491446 , 0.636311  ],
   ['berjongkok'],
   [0.8471479420661926])],
 [([0.14318639, 0.27233082, 0.2909015 , 0.9286967 ],
   ['membidik senapan'],
   [0.8173242637634277]),
  ([0.41210675, 0.42384273, 0.53531927, 0.74120706],
   ['berjongkok'],
   [0.847122715473175])],
 [([0.13393402, 0.2889468 , 0.28390342, 0.94238454],
   ['membidik senapan'],
   [0.9955153465270996]),
  ([0.42671996, 0.49112198, 0.5447509 , 0.8468637 ],
   ['berjongkok'],
   [0.9979146122932434])],
 [([0.12908779, 0.2957683 , 0.29348072, 0.9527162 ],
   ['membidik senapan'],
   [0.9988941550254822])],
 [([0.14422177, 0.29858443, 0.2880032 , 0.9434435 ],
   ['membidik senapan'],
   [0.9988114458084106]),
  ([0.42606574, 0.50454444, 0.5450863 , 0.8433766 ],
   ['merayap'],
   [0.892182469367981])],
 [([0.14870806, 0.31714022, 0.28312457, 0.9022947 ],
   ['membidik senapan'],
   [0.9188684821128845]),
  ([0.4377025 , 0.50668037, 0.5475041 , 0.8654059 ],
   ['merayap'],
   [0.926527202129364])],
 [([0.14124884, 0.32867798, 0.28418106, 0.9067223 ],
   ['membidik senapan'],
   [0.9849235415458679]),
  ([0.4317107 , 0.49498683, 0.5479204 , 0.88411903],
   ['merayap'],
   [0.9508409423828125])],
 [([0.16024463, 0.3280677 , 0.3292856 , 0.90846646],
   ['membidik senapan'],
   [0.984906849861145]),
  ([0.42739987, 0.51152295, 0.54593486, 0.89716715],
   ['merayap'],
   [0.9508295059204102])]]

In [None]:
# Extract the frames of another clip into a persistent directory (unlike the
# temporary directory used for the main pipeline).
frame_extract(
    "../cut/DJI_0012_12r_10s_4_skbg.mp4", out_dir="../../skripsi/extracted/")

In [None]:
# Delete the temporary frame directory (also called by the rendering cells).
tmp_dir.cleanup()

In [None]:
# Inspect the pose data samples (None when no skeleton method is enabled).
pose_datasample

In [None]:
# Show the type of a predicted score. Timestamps or persons without any label
# hold None or empty lists, so look up the first actual (label, score) tuple
# instead of hard-coding `[0][0][0]`, which raises IndexError on an empty
# entry.
first_scored_pred = next(
    (pred for clip in stdet_preds if clip for person in clip
     for pred in person), None)
type(first_scored_pred[1]) if first_scored_pred is not None else None

In [None]:
# Predictions of the first person at the first timestamp (may be empty).
stdet_preds[0][0]

## Testing recognition + detection from python - DEMO

In [1]:
# Copyright (c) OpenMMLab. All rights reserved.
# import argparse
import copy as cp
import tempfile
import warnings

import cv2
import mmcv
import mmengine
import numpy as np
import torch
from mmengine import DictAction
from mmengine.structures import InstanceData

from mmaction.apis import (detection_inference, inference_recognizer,
                           inference_skeleton, init_recognizer, pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.structures import ActionDataSample
from mmaction.utils import frame_extract

from mmdet.apis import init_detector
# try:
#     from mmdet.apis import init_detector
# except (ImportError, ModuleNotFoundError):
#     warnings.warn('Failed to import `init_detector` form `mmdet.apis`. '
#                   'These apis are required in skeleton-based applications! ')

import moviepy.editor as mpy
# try:
#     import moviepy.editor as mpy
# except ImportError:
#     raise ImportError('Please install moviepy to enable output file')

In [2]:
# Text rendering settings shared by the cv2.putText / cv2.getTextSize calls.
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 1.25
FONTCOLOR = (255, 255, 255)  # BGR, white
MSGCOLOR = (128, 128, 128)  # BGR, gray
THICKNESS = 2  # int
LINETYPE = 1

In [3]:
def hex2color(h):
    """Convert a 6-digit hex color string to a tuple of 3 int values (RGB).

    Args:
        h (str): Six hexadecimal digits such as ``'0077b6'``. A single
            leading ``'#'`` is accepted and ignored.

    Returns:
        tuple[int, int, int]: The ``(R, G, B)`` channel values.

    Raises:
        ValueError: If ``h`` does not hold exactly six hex digits. Without
            this check, longer strings silently yielded channel values
            above 255.
    """
    digits = h[1:] if h.startswith('#') else h
    if len(digits) != 6:
        raise ValueError(f'expected a 6-digit hex color, got {h!r}')
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))


# Six-color palettes as lists of (R, G, B) tuples. `visualize` uses entry 0
# for person boxes and entries 1.. for the label backgrounds.
PLATEBLUE = [
    hex2color(code)
    for code in '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'.split('-')
]
PLATEGREEN = [
    hex2color(code)
    for code in '004b23-006400-007200-008000-38b000-70e000'.split('-')
]


def visualize(pose_config,
              frames,
              annotations,
              pose_data_samples,
              action_result,
              plate=PLATEBLUE,
              max_num=5):
    """Visualize frames with predicted annotations.

    Args:
        pose_config (str): Path of the pose config file; only read (to build
            the pose visualizer) when ``pose_data_samples`` is non-empty.
        frames (list[np.ndarray]): Frames for visualization, note that
            len(frames) % len(annotations) should be 0.
        annotations (list[list[tuple]]): The predicted spatio-temporal
            detection results. An entry may be ``None``, in which case the
            corresponding frames get no labels.
        pose_data_samples (list[list[PoseDataSample]): The pose results.
        action_result (str): The predicted action recognition results.
            Currently unused: the banner text is derived from the per-person
            labels instead. Kept for interface compatibility.
        plate (list[tuple]): The color plate used for visualization.
            Default: PLATEBLUE.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5.

    Returns:
        list[np.ndarray]: Visualized frames.
    """
    # Per-person labels that mark a frame as dangerous ('berbahaya').
    dangerous_labels = {'melempar', 'membidik senapan', 'membidik pistol',
                        'memukul', 'menendang', 'menusuk'}
    danger_color = (255, 0, 0)  # red, frames are converted to RGB below
    banner_color = (0, 119, 182)

    # Status shown in the top-left banner.
    # NOTE(review): the status is latched -- once a dangerous label is seen
    # it stays 'berbahaya' for all following frames. Confirm this is wanted.
    act_res = 'tidak berbahaya'

    assert max_num + 1 <= len(plate)
    frames_ = cp.deepcopy(frames)
    frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_]
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = nf // na  # number of frames sharing one annotation
    h, w, _ = frames[0].shape
    # Boxes are multiplied by this to get pixel coordinates.
    scale_ratio = np.array([w, h, w, h])

    # add pose results
    if pose_data_samples:
        pose_config = mmengine.Config.fromfile(pose_config)
        visualizer = VISUALIZERS.build(pose_config.visualizer | {'line_width':5, 'bbox_color':(101,193,255), 'radius': 8})  # https://mmpose.readthedocs.io/en/latest/api.html#mmpose.visualization.PoseLocalVisualizer
        visualizer.set_dataset_meta(pose_data_samples[0].dataset_meta)
        for i, (d, f) in enumerate(zip(pose_data_samples, frames_)):
            visualizer.add_datasample(
                'result',
                f,
                data_sample=d,
                draw_gt=False,
                draw_heatmap=False,
                draw_bbox=True,
                draw_pred=True,
                show=False,
                wait_time=0,
                out_file=None,
                kpt_thr=0.3)
            frames_[i] = visualizer.get_image()

    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_[ind]
            # Whether at least one label was drawn on this frame.
            has_label = False

            # add spatio-temporal action detection results
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                if not pose_data_samples:
                    # The pose visualizer already draws person boxes.
                    cv2.rectangle(frame, st, ed, plate[0], 2)

                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    text = ': '.join([text, f'{(score[k]*100):.1f}%'])
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    if lb in dangerous_labels:
                        text_color = danger_color
                        act_res = 'berbahaya'
                    else:
                        text_color = FONTCOLOR
                    cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                text_color, THICKNESS, LINETYPE)
                    has_label = True

            # Draw the status banner once per frame, after every label has
            # been processed. Drawing it inside the label loop painted a
            # narrower 'berbahaya' banner over an earlier, wider
            # 'tidak berbahaya' one and left garbled text behind.
            if has_label:
                textwidth = cv2.getTextSize(act_res, FONTFACE, FONTSCALE,
                                            THICKNESS)[0][0]
                cv2.rectangle(frame, (10 + textwidth, 0), (10, 36),
                              banner_color, -1)
                status_color = (
                    danger_color if act_res == 'berbahaya' else FONTCOLOR)
                cv2.putText(frame, act_res, (10, 30), FONTFACE, FONTSCALE,
                            status_color, THICKNESS, LINETYPE)

    return frames_

In [4]:
# Input video and output path of the rendered demo video.
video = '../cut/DJI_0011_12r_10s_1.mp4'
out_filename = '../DJI_0011_12r_10s_1_out.mp4'

# rgb-based spatio temporal detection config
rgb_stdet_config =  "mmaction2/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py"
rgb_stdet_checkpoint = "https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth"

# human detection config
det_config = 'mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py'
det_checkpoint = 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'
det_score_thr = 0.9  # score threshold passed to detection_inference
#det_cat_id = 0

# pose estimation config
pose_config = 'mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'

# action classification config
skeleton_config = "mmaction2/configs/skeleton/posec3d/ciis_not-multi.py"
action_score_thr = 0.6  # only labels scoring above this are kept
# skeleton-based action recognition checkpoint
skeleton_checkpoint = "https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth"
# skeleton-based spatio temporal detection checkpoint
skeleton_stdet_checkpoint = "work_dirs/ciis_not-multi_10_best-550/best_acc_top1_epoch_550.pth"

# use skeleton-based method
use_skeleton_stdet = True
use_skeleton_recog = True

# label_map_stdet = "mmaction2/tools/data/ciis/label_map_no-berdiri.txt"
label_map_stdet = "mmaction2/tools/data/ciis/label_map.txt"
label_map = "mmaction2/tools/data/kinetics/label_map_k400.txt"


# rgb-based action recognition config
# NOTE(review): unlike the other config paths, this one lacks the
# "mmaction2/" prefix -- confirm it resolves from the working directory.
rgb_config = "configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py"
rgb_checkpoint = "https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth"

predict_stepsize = 4  # must be an even int; one spatio-temporal detection prediction is made every n frames
output_stepsize = 1  # show one frame per n frames in the demo; requires predict_stepsize % output_stepsize == 0 (changes the output video speed)
output_fps = 12  # fps of the output video; use video_input_fps / output_stepsize for normal playback speed

In [5]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at model creation.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [6]:
# def parse_args():
#     parser = argparse.ArgumentParser(description='MMAction2 demo')
#     parser.add_argument(
#         '--cfg-options',
#         nargs='+',
#         action=DictAction,
#         default={},
#         help='override some settings in the used config, the key-value pair '
#         'in xxx=yyy format will be merged into config file. For example, '
#         "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
#     args = parser.parse_args()
#     return args

In [7]:
def load_label_map(file_path):
    """Load Label Map.

    Each non-blank line must look like ``<int id>: <label name>``.

    Args:
        file_path (str): The file path of label map.

    Returns:
        dict: The label map (int -> label name).

    Raises:
        ValueError: If a non-blank line has no ``': '`` separator or its id
            is not an integer.
    """
    label_map = {}
    # Context manager so the file is closed deterministically.
    with open(file_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            # Split on the first separator only, so label names may
            # themselves contain ': '.
            class_id, name = line.split(': ', 1)
            label_map[int(class_id)] = name
    return label_map


def abbrev(name):
    """Get the abbreviation of label name:

    'take (an object) from (a person)' -> 'take ... from ...'

    Every ``(...)`` group is replaced by ``'...'``. An opening parenthesis
    without a closing one after it is left untouched; the search for ``')'``
    used to start at the beginning of the string, which made such names loop
    forever.
    """
    search_from = 0
    while True:
        st = name.find('(', search_from)
        if st == -1:
            break
        ed = name.find(')', st + 1)
        if ed == -1:
            # Unbalanced parenthesis: keep the remainder as is.
            break
        name = name[:st] + '...' + name[ed + 1:]
        # Resume right after the inserted '...'.
        search_from = st + 3
    return name


def pack_result(human_detection, result, img_h, img_w):
    """Pack person boxes and their predicted labels for visualization.

    The inputs are left unmodified: boxes are normalized on a copy and the
    labels are sorted into new lists, so calling this twice on the same data
    (e.g. when re-running a notebook cell) gives the same result.

    Args:
        human_detection (torch.Tensor): Person boxes of one frame, one row
            per person as ``(x1, y1, x2, y2)`` in pixels.
        result (list[list[tuple]] | None): For each person, a list of
            ``(label_name, score)`` tuples; ``None`` if there is no
            prediction for this frame.
        img_h (int): The image height.
        img_w (int): The image width.

    Returns:
        list[tuple] | None: ``None`` if ``result`` is ``None``; otherwise one
        ``(box, label_names, label_scores)`` tuple per person, where ``box``
        is a numpy array normalized to ``[0, 1]`` by the image size and the
        labels are sorted by descending score.
    """
    if result is None:
        return None
    # Normalize a copy so that the caller's detections are not modified.
    boxes = human_detection.clone()
    boxes[:, 0::2] /= img_w
    boxes[:, 1::2] /= img_h
    results = []
    for prop, res in zip(boxes, result):
        # sorted() (stable, like list.sort) leaves the caller's list as is.
        ranked = sorted(res, key=lambda x: -x[1])
        results.append(
            (prop.data.cpu().numpy(), [x[0] for x in ranked],
             [x[1] for x in ranked]))
    return results


def expand_bbox(bbox, h, w, ratio=1.25):
    """Grow a box into a square scaled by ``ratio`` and clip it to the image.

    The square is centred on the (floor-divided) box centre, its side is the
    longer box side times ``ratio``, and the corners are truncated to int.

    Args:
        bbox: Box given as (x1, y1, x2, y2).
        h: Image height, upper bound for the y coordinates.
        w: Image width, upper bound for the x coordinates.
        ratio (float): Enlargement factor applied to the longer side.

    Returns:
        tuple: (new_x1, new_y1, new_x2, new_y2) clipped to [0, w] x [0, h].
    """
    left, top, right, bottom = bbox
    cx = (left + right) // 2
    cy = (top + bottom) // 2

    # Half of the enlarged square side, shared by both axes.
    half_side = max(right - left, bottom - top) * ratio / 2

    return (
        max(0, int(cx - half_side)),
        max(0, int(cy - half_side)),
        min(int(cx + half_side), w),
        min(int(cy + half_side), h),
    )


def cal_iou(box1, box2):
    """Compute the intersection-over-union of two boxes.

    Args:
        box1: Box given as (xmin, ymin, xmax, ymax).
        box2: Box given as (xmin, ymin, xmax, ymax).

    Returns:
        The IoU of the two boxes, or 0.0 when their union area is not
        positive (e.g. both boxes are degenerate), which would otherwise
        cause a division by zero.
    """
    xmin1, ymin1, xmax1, ymax1 = box1
    xmin2, ymin2, xmax2, ymax2 = box2

    s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
    s2 = (xmax2 - xmin2) * (ymax2 - ymin2)

    # Intersection rectangle; width/height clamp to 0 when boxes are disjoint.
    xmin = max(xmin1, xmin2)
    ymin = max(ymin1, ymin2)
    xmax = min(xmax1, xmax2)
    ymax = min(ymax1, ymax2)

    w = max(0, xmax - xmin)
    h = max(0, ymax - ymin)
    intersect = w * h
    union = s1 + s2 - intersect
    if union <= 0:
        return 0.0

    return intersect / union


def skeleton_based_action_recognition(skeleton_config, skeleton_checkpoint, device, label_map, pose_results, h, w):
    """Classify the action of a whole video from its pose results.

    Args:
        skeleton_config: Config file path, loaded with
            ``mmengine.Config.fromfile``.
        skeleton_checkpoint: Checkpoint passed to ``init_recognizer``.
        device: Device the recognizer is initialized on.
        label_map (str): Path of a text file with one label name per line;
            the line index is used as the class index.
        pose_results: Pose results passed to ``inference_skeleton``.
        h: Image height.
        w: Image width.

    Returns:
        str: The label name with the highest predicted score.
    """
    # Read the label names with a context manager so the file is closed.
    with open(label_map) as label_file:
        label_map = [x.strip() for x in label_file]
    num_class = len(label_map)

    skeleton_config = mmengine.Config.fromfile(skeleton_config)
    # Size the classification head to the number of labels in the file.
    skeleton_config.model.cls_head.num_classes = num_class

    skeleton_model = init_recognizer(
        skeleton_config, skeleton_checkpoint, device=device)
    result = inference_skeleton(skeleton_model, pose_results, (h, w))
    action_idx = result.pred_score.argmax().item()
    return label_map[action_idx]


def rgb_based_action_recognition(rgb_config, rgb_checkpoint, device, video, label_map):
    """Classify the action of a whole video from its RGB frames.

    Args:
        rgb_config: Config file path, loaded with
            ``mmengine.Config.fromfile``.
        rgb_checkpoint: Checkpoint passed to ``init_recognizer``.
        device: Device the recognizer is initialized on.
        video: Video passed to ``inference_recognizer``.
        label_map (str): Path of a text file with one label name per line;
            the line index is used as the class index.

    Returns:
        str: The label name with the highest predicted score.
    """
    # Read the labels first (and close the file) so a bad path fails before
    # the expensive model initialization and inference.
    with open(label_map) as label_file:
        label_map = [x.strip() for x in label_file]

    rgb_config = mmengine.Config.fromfile(rgb_config)
    rgb_config.model.backbone.pretrained = None
    rgb_model = init_recognizer(rgb_config, rgb_checkpoint, device)
    action_results = inference_recognizer(rgb_model, video)
    rgb_action_result = action_results.pred_score.argmax().item()
    return label_map[rgb_action_result]

def skeleton_based_stdet(predict_stepsize, skeleton_config, skeleton_stdet_checkpoint, device, action_score_thr, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    """Run skeleton-based spatio-temporal action detection on sampled clips.

    For every sampled timestamp (1-based frame number) and every human
    proposal on that frame, a single-person pose clip is assembled by
    matching the proposal to the per-frame pose boxes via IoU, and the clip
    is classified with the skeleton recognizer.

    Args:
        predict_stepsize (int): Step between two sampled timestamps.
        skeleton_config: Config file path, loaded with
            ``mmengine.Config.fromfile``.
        skeleton_stdet_checkpoint: Checkpoint passed to ``init_recognizer``.
        device: Device the recognizer is initialized on.
        action_score_thr (float): Only labels scoring above this are kept.
        label_map (dict): Class index (int) -> label name. The head size is
            set to ``max(label_map) + 1``.
        human_detections: Per-frame detections, indexed by frame; the first
            four values of a row are used as x1, y1, x2, y2.
        pose_results: Per-frame dicts with the keys ``'keypoints'``,
            ``'bboxes'`` and ``'keypoint_scores'``.
        num_frame (int): Total number of frames in the video.
        clip_len (int): Number of frames per clip; must be even.
        frame_interval (int): Stride between the frames of a clip.
        h (int): Image height.
        w (int): Image width.

    Returns:
        tuple: ``(timestamps, skeleton_predictions)``. For each timestamp the
        prediction is ``None`` when the frame has no proposal, otherwise a
        list with, per proposal, a list of ``(label name, score)`` tuples.
    """
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    skeleton_config = mmengine.Config.fromfile(skeleton_config)
    num_class = max(label_map.keys()) + 1  # for AVA dataset (80 + 1), for CIIS dataset (9 + 1) == len(label_map)
    skeleton_config.model.cls_head.num_classes = num_class
    skeleton_stdet_model = init_recognizer(skeleton_config,
                                           skeleton_stdet_checkpoint,
                                           device)

    skeleton_predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:  # no people detected
            skeleton_predictions.append(None)
            # Still advance the bar, otherwise it never reaches 100%.
            prog_bar.update()
            continue

        # Frame indices (0-based) of the clip centred on this timestamp.
        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        # Kept separate from the `num_frame` parameter (total video frames).
        clip_num_frame = len(frame_inds)

        pose_result = [pose_results[ind] for ind in frame_inds]

        skeleton_prediction = []
        for i in range(proposal.shape[0]):  # num_person
            skeleton_prediction.append([])

            fake_anno = dict(
                frame_dict='',
                label=-1,
                img_shape=(h, w),
                origin_shape=(h, w),
                start_index=0,
                modality='Pose',
                num_clips=1,
                clip_len=clip_len,
                total_frames=clip_num_frame)
            num_person = 1

            num_keypoint = 17
            keypoint = np.zeros(
                (num_person, clip_num_frame, num_keypoint, 2))  # M T V 2
            keypoint_score = np.zeros(
                (num_person, clip_num_frame, num_keypoint))  # M T V

            # pose matching
            person_bbox = proposal[i][:4]  # x1, y1, x2, y2
            area = expand_bbox(person_bbox, h, w)

            for j, poses in enumerate(pose_result):  # frames of the clip
                if len(poses['keypoints']) == 0:
                    # No pose on this frame: leave the zero-filled slot.
                    continue
                # Pick the pose whose box overlaps the expanded proposal most.
                max_iou = float('-inf')
                index = -1
                for k, bbox in enumerate(poses['bboxes']):  # num_person
                    iou = cal_iou(bbox, area)
                    if max_iou < iou:
                        index = k
                        max_iou = iou
                keypoint[0, j] = poses['keypoints'][index]
                keypoint_score[0, j] = poses['keypoint_scores'][index]

            fake_anno['keypoint'] = keypoint
            fake_anno['keypoint_score'] = keypoint_score

            output = inference_recognizer(skeleton_stdet_model, fake_anno)
            # for multi-label recognition
            score = output.pred_score.tolist()
            for k in range(len(score)):
                if k not in label_map:
                    continue
                if score[k] > action_score_thr:
                    skeleton_prediction[i].append((label_map[k], score[k]))

        skeleton_predictions.append(skeleton_prediction)
        prog_bar.update()

    return timestamps, skeleton_predictions


def rgb_based_stdet(rgb_stdet_config, rgb_stdet_checkpoint, device, action_score_thr, predict_stepsize, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):
    """Run RGB-based spatio-temporal action detection on sampled clips.

    Args:
        rgb_stdet_config: Either a config file path or an already loaded
            ``mmengine.Config``. A loaded config is modified in place
            (``test_cfg.rcnn`` and ``backbone.pretrained``).
        rgb_stdet_checkpoint: Checkpoint passed to ``init_detector``.
        device: Device the model and the input tensors are placed on.
        action_score_thr (float): Only labels scoring above this are kept.
        predict_stepsize (int): Step between two sampled timestamps.
        frames (list): Video frames; each is cast to float32 and normalized.
        label_map (dict): Class index (int) -> label name.
        human_detections: Per-frame proposals, indexed by frame and passed
            to the model as ``bboxes``.
        w, h: Unused; kept for interface compatibility.
        new_w, new_h: Frame size reported to the model as ``img_shape``.
        w_ratio, h_ratio: Unused; kept for interface compatibility.

    Returns:
        tuple: ``(timestamps, predictions)``. For each timestamp (1-based
        frame number) the prediction is ``None`` when the frame has no
        proposal, otherwise a list with, per proposal, a list of
        ``(label name, score)`` tuples.
    """
    # Callers may hand over a parsed Config instead of a path; calling
    # Config.fromfile on a Config object would fail.
    if not isinstance(rgb_stdet_config, mmengine.Config):
        rgb_stdet_config = mmengine.Config.fromfile(rgb_stdet_config)

    val_pipeline = rgb_stdet_config.val_pipeline
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'

    window_size = clip_len * frame_interval
    num_frame = len(frames)
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    # Get img_norm_cfg
    img_norm_cfg = dict(
        mean=np.array(rgb_stdet_config.model.data_preprocessor.mean),
        std=np.array(rgb_stdet_config.model.data_preprocessor.std),
        to_rgb=False)

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        rgb_stdet_config['model']['test_cfg']['rcnn'] = dict(action_thr=0)
    except KeyError:
        pass

    rgb_stdet_config.model.backbone.pretrained = None
    rgb_stdet_model = init_detector(
        rgb_stdet_config, rgb_stdet_checkpoint, device=device)

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:
            predictions.append(None)
            # Still advance the bar, otherwise it never reaches 100%.
            prog_bar.update()
            continue

        # Frame indices (0-based) of the clip centred on this timestamp.
        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)

        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        # imnormalize_ works in place on the float copies made above.
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(device)

        datasample = ActionDataSample()
        datasample.proposals = InstanceData(bboxes=proposal)
        datasample.set_metainfo(dict(img_shape=(new_h, new_w)))
        with torch.no_grad():
            result = rgb_stdet_model(
                input_tensor, [datasample], mode='predict')
            scores = result[0].pred_instances.scores
            # One (initially empty) label list per proposal.
            prediction = []
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(scores.shape[1]):
                if i not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if scores[j, i] > action_score_thr:
                        prediction[j].append((label_map[i], scores[j,
                                                                   i].item()))
            predictions.append(prediction)
        prog_bar.update()

    return timestamps, predictions

In [8]:
# Decode the input video into frames stored in a temporary directory.
# NOTE(review): 720 is presumably the short-side size frame_extract resizes
# to -- confirm against the mmaction.utils.frame_extract documentation.
tmp_dir = tempfile.TemporaryDirectory()
frame_paths, original_frames = frame_extract(
    video, 720, out_dir=tmp_dir.name)
num_frame = len(frame_paths)
# Frame height and width, taken from the first extracted frame.
h, w, _ = original_frames[0].shape

In [9]:
# Get Human detection results and pose results
human_detections, _ = detection_inference(
    det_config,
    det_checkpoint,
    frame_paths,
    det_score_thr,
    device=device)
# Release cached GPU memory before the next model is loaded.
torch.cuda.empty_cache()
# Pose estimation is only needed for the skeleton-based pipelines.
# Note: `pose_results` stays undefined when both flags are off; only
# `pose_datasample` gets a None default.
pose_datasample = None
if use_skeleton_recog or use_skeleton_stdet:
    pose_results, pose_datasample = pose_inference(
        pose_config,
        pose_checkpoint,
        frame_paths,
        human_detections,
        device=device)
torch.cuda.empty_cache()

Loads checkpoint by http backend from path: http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth
Performing Human Detection for each frame
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 122/122, 4.0 task/s, elapsed: 31s, ETA:     0s
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth
Performing Human Pose Estimation for each frame
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 122/122, 11.0 task/s, elapsed: 11s, ETA:     0s


In [None]:
# with open('human_detections.txt','w') as data:  
#       data.write(str(human_detections))

In [None]:
# with open('pose_results.txt','w') as data:  
#       data.write(str(pose_results[0]))

In [None]:
# with open('pose_datasample.txt','w') as data:  
#       data.write(str(pose_datasample))

In [None]:
# Resizing to a short side of 256 is disabled here: the frames are used at
# their extracted size, so both ratios below evaluate to 1.
# new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
new_w, new_h = w, h
# frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
frames = original_frames
w_ratio, h_ratio = new_w / w, new_h / h

In [None]:
# Load spatio-temporal detection label_map
stdet_label_map = load_label_map(label_map_stdet)

# Parse the config into its own name. `rgb_stdet_config` must stay a file
# path: rebinding it to the parsed Config makes this cell fail when it is
# re-run and breaks rgb_based_stdet(), which loads the config from that path.
rgb_stdet_cfg = mmengine.Config.fromfile(rgb_stdet_config)
#rgb_stdet_cfg.merge_from_dict(args.cfg_options)
try:
    custom_classes = rgb_stdet_cfg['data']['train']['custom_classes']
    if custom_classes is not None:
        # Re-index the label map so that ids follow the custom class order
        # (1-based).
        stdet_label_map = {
            idx + 1: stdet_label_map[cls]
            for idx, cls in enumerate(custom_classes)
        }
except KeyError:
    # Best effort: keep the full label map when the config has no
    # custom_classes entry (or a class is missing from the label map).
    pass

In [None]:
# action_result = None
# if use_skeleton_recog:
#     print('Use skeleton-based recognition')
#     action_result = skeleton_based_action_recognition(
#         skeleton_config, skeleton_checkpoint, device, label_map, pose_results, h, w)
# else:
#     print('Use rgb-based recognition')
#     action_result = rgb_based_action_recognition(rgb_config, rgb_checkpoint, device, video, label_map)

In [None]:
def _detections_to_device_tensors(detections, x_scale, y_scale, target_device):
    """Rescale each frame's boxes in place and replace them by x1..y2 tensors."""
    for idx, det in enumerate(detections):
        det[:, 0:4:2] *= x_scale
        det[:, 1:4:2] *= y_scale
        detections[idx] = torch.from_numpy(det[:, :4]).to(target_device)


stdet_preds = None
if use_skeleton_stdet:
    print('Use skeleton-based SpatioTemporal Action Detection')
    # One clip per prediction step, sampled without frame skipping.
    clip_len, frame_interval = predict_stepsize, 1
    # The skeleton pipeline consumes the detections before they are
    # converted to tensors, so the conversion happens afterwards.
    timestamps, stdet_preds = skeleton_based_stdet(
        predict_stepsize, skeleton_config, skeleton_stdet_checkpoint, device,
        action_score_thr, stdet_label_map, human_detections, pose_results,
        num_frame, clip_len, frame_interval, h, w)
    _detections_to_device_tensors(human_detections, w_ratio, h_ratio, device)
else:
    print('Use rgb-based SpatioTemporal Action Detection')
    # The RGB pipeline receives the detections already converted to tensors.
    _detections_to_device_tensors(human_detections, w_ratio, h_ratio, device)
    timestamps, stdet_preds = rgb_based_stdet(
        rgb_stdet_config, rgb_stdet_checkpoint, device, action_score_thr,
        predict_stepsize, frames, stdet_label_map, human_detections, w, h,
        new_w, new_h, w_ratio, h_ratio)

In [None]:
# Inspect the detections of the first frame as a plain Python list.
human_detections[0].cpu().numpy().tolist()

In [None]:
# Pair every predicted timestamp (1-based frame number) with the detections
# of its frame and pack them for visualization.
stdet_results = [
    pack_result(human_detections[frame_no - 1], clip_pred, new_h, new_w)
    for frame_no, clip_pred in zip(timestamps, stdet_preds)
]

def dense_timestamps(timestamps, n):
    """Densify evenly spaced timestamps to ``n`` times as many entries.

    The spacing is taken from the first two timestamps, so at least two
    are required. The result is truncated to int64.
    """
    step = timestamps[1] - timestamps[0]
    # Shift the first dense timestamp so each group of n is centred on its
    # original timestamp.
    first = timestamps[0] - step / n * (n - 1) / 2
    offsets = np.arange(len(timestamps) * n) * step / n
    return (offsets + first).astype(np.int64)

# Densification factor passed to dense_timestamps (n times as many
# timestamps as prediction steps).
dense_n = int(predict_stepsize / output_stepsize)
# output_timestamps = dense_timestamps(timestamps, dense_n)
# The dense timestamps are shifted by one before being used as 1-based
# frame numbers below.
output_timestamps = dense_timestamps(timestamps, dense_n) + 1
# Reload the selected frames from disk; this rebinds `frames`.
frames = [
    cv2.imread(frame_paths[timestamp - 1])
    # cv2.imread("../../../Downloads/1280x720-white-solid-color-background.jpg")
    for timestamp in output_timestamps
]

# Keep only the pose samples of the selected frames. This overwrites
# `pose_datasample`, so re-running the cell would subsample the already
# subsampled list.
if use_skeleton_recog or use_skeleton_stdet:
    pose_datasample = [
        pose_datasample[timestamp - 1] for timestamp in output_timestamps
    ]

In [None]:
# Mark the whole clip as 'berbahaya' (dangerous) if any person at any
# prediction step carries one of the dangerous labels.
action_result = 'tidak berbahaya'
bahaya = ['melempar', 'membidik senapan', 'membidik pistol', 'memukul', 'menendang', 'menusuk']
for prediction_step in stdet_results:
    if prediction_step is None:
        # pack_result() yields None for timestamps without detections;
        # iterating over it would raise a TypeError.
        continue
    for person_prop in prediction_step:
        # person_prop is (box, labels, scores); index 1 holds the labels.
        if any(label in bahaya for label in person_prop[1]):
            action_result = 'berbahaya'
            break
    if action_result == 'berbahaya':
        break  # one hit is enough; no need to scan the remaining steps

In [None]:
# Number of frames that will be rendered into the output video.
len(output_timestamps)

In [None]:
# Draw the results on the selected frames and write them as a video at
# `output_fps`.
# NOTE(review): this call passes (pose_config, frames, stdet_results,
# pose_datasample, action_result); confirm it matches the visualize()
# definition in effect -- an earlier definition in this notebook takes
# `out_filename` as its second parameter.
vis_frames = visualize(pose_config, frames, stdet_results, pose_datasample,
                       action_result)
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile(out_filename)
# Remove the temporary directory holding the extracted frames.
tmp_dir.cleanup()

In [None]:
# Display the packed per-timestamp results (boxes, labels, scores).
stdet_results

In [None]:
# Second render with no annotations ([None]) and no action label, written
# to a hard-coded output file name.
vis_frames = visualize(pose_config, frames, [None], pose_datasample,
                       None)
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile("dji_fly_20240216_153920_12_1708073394745_video_720p_30r_10s_2_skeleton_no-anno.mp4")
# Cleanup is repeated here; the frames used above are already in memory.
tmp_dir.cleanup()

In [None]:
# Embed a video from a hard-coded path.
# NOTE(review): presumably the input clip -- confirm it matches `video`.
HTML('<video width=50% controls autoplay loop><source src="data/DJI_0011_720p_30r_10s_1.MP4"></video>')

In [None]:
# Embed a video from a hard-coded path.
# NOTE(review): this file name is not written by any visible cell above --
# confirm it matches `out_filename`.
HTML('<video width=50% controls autoplay loop><source src="dji_fly_20240216_153920_12_1708073394745_video_720p_30r_10s_2_out_bold.mp4"></video>')

In [None]:
# Dump the string form of the packed results for offline inspection.
with open('stdet_results.txt', 'w') as data:
    data.write(str(stdet_results))

In [None]:
# Extract the frames of another clip into a directory outside this
# project; both paths are hard-coded and relative to the working directory.
frame_extract(
    "data/anno/other/DJI_0012_12r_10s_4_gt0.mp4", out_dir="../../skripsi/extracted/")

In [None]:
# Final cleanup of the temporary frame directory (earlier cells already
# call this as well).
tmp_dir.cleanup()