In [None]:
from IPython.display import HTML

## Testing recognition + detection from python - THERMAL

In [1]:
import sys
# Set paths for CenterNet files
CENTERNET_LIB_PATH = "../Thermal/CenterNet-ThermalPose-master/CenterNet/src/lib/"
CENTERNET_SRC_PATH = "../Thermal/CenterNet-ThermalPose-master/CenterNet/src/"
sys.path.insert(0, CENTERNET_LIB_PATH)
sys.path.insert(1, CENTERNET_SRC_PATH)

# Import libraries
import os
import cv2
import numpy as np
import pandas as pd
from opts import opts
from detectors.detector_factory import detector_factory

import moviepy.editor as mpy

# Supported image extensions
image_ext = ['jpg', 'jpeg', 'png', 'webp']
# Supported video extensions
video_ext = ['mp4', 'mov', 'avi', 'mkv']
# Name of time stats prints
time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']

# Colors for located keypoints
colors_hp = [(255, 0, 255), (255, 0, 0), (0, 0, 255), 
			(255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
			(255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
			(255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
			(255, 0, 0), (0, 0, 255)]

# Colors for skeleton
ec = [(255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255), 
		(255, 0, 0), (0, 0, 255), (255, 0, 255),
		(255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255),
		(255, 0, 0), (0, 0, 255), (255, 0, 255),
		(255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255)]

# Links between keypoints, for skeleton construction
edges = [[0, 1], [0, 2], [1, 3], [2, 4], 
		[3, 5], [4, 6], [5, 6], 
		[5, 7], [7, 9], [6, 8], [8, 10], 
		[5, 11], [6, 12], [11, 12], 
		[11, 13], [13, 15], [12, 14], [14, 16]]

In [2]:
def add_coco_bbox(img,bbox, conf=1, show_txt=True):
	"""
	draws bounding box over img
	-----
	Params
	-----
	img: np.array
		input image
	bbox: list
		bounding box coordinates
	conf: float
		confidence in detection
	show_txt: bool
		show text with confidence score and category over bbox
	------
	Returns
		Image with bounding box drawn over
	"""
	bbox = np.array(bbox, dtype=np.int32)
	c = [0,255,0] # bbox color
	txt = '{}{:.1f}'.format("person", conf) # text to display
	font = cv2.FONT_HERSHEY_SIMPLEX
	cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
	# Creates rectangle over image
	cv2.rectangle(
		img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), c, 5)
	# Draws text over image, if requested
	if show_txt:
	  cv2.rectangle(img,
	                (bbox[0], bbox[1] - cat_size[1] - 2),
	                (bbox[0] + cat_size[0], bbox[1] - 2), c, -1)
	  cv2.putText(img, txt, (bbox[0], bbox[1] - 2), 
	              font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)
	return img

def add_coco_hp(img,points): 
	"""
	draws detected keypoints and skeleton over input image
	-----
	Params
	-----
	img: np.array
		input image
	points: list
		keypoint coordinates
	------
	Returns
		Image with keypoints and skeletons drawn over
	"""
	points = np.array(points, dtype=np.int32).reshape(17, 2)
	# Draws each keypoint over image
	for j in range(17):
		cv2.circle(img,
				(points[j, 0], points[j, 1]), 8, colors_hp[j], -1)
	# Draws lines joining keypoints, to form skeletons
	for j, e in enumerate(edges):
		if points[e].min() > 0:
			cv2.line(img, (points[e[0], 0], points[e[0], 1]),
			(points[e[1], 0], points[e[1], 1]), ec[j], 5,
			lineType=cv2.LINE_AA)
	return img

def draw_detection(img,results,min_confidence):
	"""
	draws detected bounding box and keypoints over image
	-----
	Params
	-----
	img: np.array
		input image
	results: dict
		dictionary with keypoint and bounding box detections
	min_confidence: float
		minimum confidence in detection for displaying
	------
	Returns
		Image with keypoints and bounding boxes drawn over
	"""
	for bbox in results[1]:
		# Verifies if detection is over threshold
		if bbox[4] > min_confidence:
			ret_img = add_coco_bbox(img,bbox[:4], bbox[4]) # draws bbox
			ret_img = add_coco_hp(ret_img,bbox[5:39])         # draws kpts
			# ret_img = add_coco_hp(img,bbox[5:39])         # draws kpts
		# If detection is not over confidence threshold, returns original image
		else:
			ret_img = img
	return ret_img

def org_detections(results,min_confidence):
	"""
	returns detection results as an structured dataframe
	-----
	Params
	-----
	results: dict
		dictionary with keypoint and bounding box detections
	min_confidence: float
		minimum confidence in detection for displaying
	------
	Returns
		Dataframe with bounding box coordinates, score and keypoint locations
	"""
	columns = ["topleft_bbox","botright_bbox","score","nose","left_eye","right_eye","left_ear","right_ear","left_shoulder",
			   "right_shoulder","left_elbow","right_elbow","left_wrist","right_wrist","left_hip","right_hip","left_knee",
			   "right_knee","left_ankle","right_ankle"]
	df = pd.DataFrame(columns=columns)
	det_idx = 0
	for bbox in results[1]:
		# Only saves detections over threshold confidence
		if bbox[4] > min_confidence:
			# Boundix box coordinates
			topleft_bbox = [(bbox[0],bbox[1])]
			botright_bbox = [(bbox[2],bbox[3])]
			# Detection score
			score = [bbox[4]]
			# Keypoints coordinates
			x_kpts = bbox[5:39:2]
			y_kpts = bbox[6:39:2]
			xy_kpts = list(zip(x_kpts,y_kpts))
			det = topleft_bbox + botright_bbox + score + xy_kpts
			# Appends detection info to dataframe
			df.loc[det_idx] = det
			det_idx += 1
	return df

In [3]:
demo = '../0516_10s_12r.mp4'  # path to image/ image folders/ video.
pause = False  # whether to pause between detections
arch = 'dla'  # model architecture. Currently tested || dla | hourglass | hrnet ||
min_confidence = 0.3  # minimum confidence for visualization
show_fps = False  # show fps of detection in visualization
output_dir = '../Thermal/CenterNet-ThermalPose-master/osd/'  # output directory for detections
save_img = True  # store images with detections
save_csv = False  # save csv files with detected joints and bboxes
visualize = 0  # wheter to visualize outputs
input_fps = 30

In [4]:
# Selects appropiate paths according to backbone selection
if arch == 'dla':
    # MODEL_PATH = "../CenterNet/models/multi_pose_dla_3x_gray_384_0frz.pth"
    MODEL_PATH = "../Thermal/CenterNet-ThermalPose-master/CenterNet/models/multi_pose_dla_3x_gray_384_0frz.pth"
    arch_name = 'dla_34'
elif arch == 'hourglass':
    # MODEL_PATH = "../CenterNet/models/multi_pose_hg_3x_gray_0frz.pth"
    MODEL_PATH = "../Thermal/CenterNet-ThermalPose-master/CenterNet/models/multi_pose_hg_3x_gray_0frz.pth"
    arch_name = arch
elif arch =='hrnet':
    # MODEL_PATH = "../CenterNet/models/multi_pose_hrnet_3x_gray_finetune"
    MODEL_PATH = "../Thermal/CenterNet-ThermalPose-master/CenterNet/models/multi_pose_hrnet_3x_gray_finetune"
    arch_name = 'hrnet32'
# Initializes centernet options
opt = opts().init('{} --load_model {} --arch {}'.format('multi_pose', MODEL_PATH,arch_name).split(' '))
# Creates detector
detector = detector_factory[opt.task](opt)
detector.pause = False


if demo == 'webcam' or \
    demo[demo.rfind('.') + 1:].lower() in video_ext:
    # Initializes video capture for frame retrieval
    cam = cv2.VideoCapture(0 if demo == 'webcam' else demo)
    # In case output directory is specified, creates video writer for saving each frame into output video
    if output_dir != '' and save_img:
        output_video_path = '{}{}_{}.mp4'.format(output_dir,demo.split("/")[-1].split(".")[0],arch)
        # _, sample_img = cam.read()
        # out = cv2.VideoWriter(output_video_path,cv2.VideoWriter_fourcc('M','J','P','G'), 16, 
        #       (sample_img.shape[0],sample_img.shape[1]))
        # out = cv2.VideoWriter(output_video_path,cv2.VideoWriter_fourcc(*'X264'), 16, 
        #       (sample_img.shape[0],sample_img.shape[1]))
    frame_idx = 0
    imgs = []
    annos = []
    while True:
        _, img = cam.read()        # reads frame from webcam or video
        frame_idx += 1
        if type(img) == type(None):
          break
        # print(type(img))
        # print(img.shape)
        ret = detector.run(img)    # runs detection over input image
        annos.append([anno for anno in ret["results"][1] if anno[4] > min_confidence])  # for ciis action classification
        ret_img = draw_detection(img.copy(),ret["results"],min_confidence) # draws detection over input image
        # If user wants, fps can be shown over detection image
        if show_fps:
            cv2.putText(ret_img,'fps: {:.2f}'.format((1/ret['tot'])),(0,30), cv2.FONT_HERSHEY_SIMPLEX , 1, (255, 255, 255), 2, cv2.LINE_AA)
        if visualize:
            cv2.imshow('entrada', img) # shows input image
            cv2.imshow('deteccion',ret_img) # shows image with detections
        # Writes frame with detection over output video
        if output_dir != '':
            if save_img:
                # out.write(ret_img)
                imgs.append(ret_img)
            if save_csv:
                df_det = org_detections(ret["results"],min_confidence)
                df_det.to_csv("{}{}-{}_{}.csv".format(output_dir,demo.split("/")[-1].split(".")[0],frame_idx,arch))
        # Prints time stats
        time_str = ''
        for stat in time_stats:
            time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
        print(time_str)
        # Option for exiting program
        if cv2.waitKey(0 if pause else 1) == 27:
            cam.release()
            # out.release()
            import sys
            sys.exit(0)
    vid = mpy.ImageSequenceClip(imgs, fps=input_fps)
    vid.write_videofile(output_video_path, remove_temp=True)
    cam.release()
    # out.release()

else:
    # If demo is image or directory with images, retrieves path for each one of them
    if os.path.isdir(demo):
        image_names = []
        ls = os.listdir(demo)
        print(demo)
        for file_name in sorted(ls):
            ext = file_name[file_name.rfind('.') + 1:].lower()
            if ext in image_ext:
                image_names.append(os.path.join(demo, file_name))
    else:
        image_names = [demo]

    for (image_name) in image_names:
        # Reads image
        img = cv2.imread(image_name)
        ret = detector.run(img)    # runs detection over image
        annos.append([anno for anno in ret["results"][1] if anno[4] > min_confidence])  # for ciis action classification
        ret_img = draw_detection(img.copy(),ret["results"],min_confidence) # draws detections over image
        if visualize:
            cv2.imshow('entrada', img) # shows input image
            cv2.imshow('deteccion',ret_img) # shows image with detections

        # saves output image with detections, if requested
        if output_dir != '':
            if save_img:
                output_img_path = '{}{}_{}.png'.format(output_dir,image_name.split("/")[-1].split(".")[0],arch)
                cv2.imwrite(output_img_path,ret_img)
            if save_csv:
                df_det = org_detections(ret["results"],min_confidence)
                df_det.to_csv("{}{}_{}.csv".format(output_dir,image_name.split("/")[-1].split(".")[0],arch))
        # Option for exiting program
        if cv2.waitKey(0 if pause else 1) == 27:
            import sys
            sys.exit(0)
        # Prints time stats
        time_str = ''
        for stat in time_stats:
            time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
        print(time_str)

Fix size testing.
training chunk_sizes: [32]
The output will be saved to  /home/aldy/Documents/skripsi/posec3d-video-structuralize/../Thermal/CenterNet-ThermalPose-master/CenterNet/src/lib/../../exp/multi_pose/default
heads {'hm': 1, 'wh': 2, 'hps': 34, 'reg': 2, 'hm_hp': 17, 'hp_offset': 2}
Creating model...
loaded ../Thermal/CenterNet-ThermalPose-master/CenterNet/models/multi_pose_dla_3x_gray_384_0frz.pth, epoch 50
tot 0.288s |load 0.000s |pre 0.012s |net 0.181s |dec 0.089s |post 0.007s |merge 0.000s |
tot 0.073s |load 0.000s |pre 0.010s |net 0.055s |dec 0.002s |post 0.006s |merge 0.000s |
tot 0.072s |load 0.000s |pre 0.009s |net 0.055s |dec 0.002s |post 0.006s |merge 0.000s |
tot 0.071s |load 0.000s |pre 0.009s |net 0.053s |dec 0.002s |post 0.007s |merge 0.000s |
tot 0.065s |load 0.000s |pre 0.010s |net 0.046s |dec 0.002s |post 0.006s |merge 0.000s |
tot 0.063s |load 0.000s |pre 0.009s |net 0.045s |dec 0.002s |post 0.007s |merge 0.000s |
tot 0.063s |load 0.000s |pre 0.010s |net 0.04

                                                                                                     

Moviepy - Done !
Moviepy - video ready ../Thermal/CenterNet-ThermalPose-master/osd/0516_10s_12r_dla.mp4


In [5]:
human_detections = [np.array([det[:4] for det in anno], dtype=np.float32) for anno in annos]

In [6]:
pose_results = []
for anno in annos:
    pose = dict(
        keypoints=[[list(xy_kpt) for xy_kpt in zip(det[5:39:2], det[6:39:2])] for det in anno],
        keypoint_scores=[[0.8808185 , 0.8933128 , 0.8556601 , 0.9248522 , 0.9113513 ,
                          0.83777976, 0.80287457, 0.89377964, 0.66614443, 0.8510387 ,
                          0.70986986, 0.70710015, 0.64390606, 0.6184495 , 0.519892  ,
                          0.55833685, 0.46859613] for det in anno],
        bboxes=[det[:4] for det in anno]
    )
    pose_results.append(pose)

In [7]:
# Copyright (c) OpenMMLab. All rights reserved.
# import argparse
import copy as cp
import tempfile
import warnings

import cv2
import mmcv
import mmengine
import numpy as np
import torch
from mmengine import DictAction
from mmengine.structures import InstanceData

from mmaction.apis import (detection_inference, inference_recognizer,
                           inference_skeleton, init_recognizer, pose_inference)
from mmaction.registry import VISUALIZERS
from mmaction.structures import ActionDataSample
from mmaction.utils import frame_extract

from mmdet.apis import init_detector
# try:
#     from mmdet.apis import init_detector
# except (ImportError, ModuleNotFoundError):
#     warnings.warn('Failed to import `init_detector` form `mmdet.apis`. '
#                   'These apis are required in skeleton-based applications! ')

import moviepy.editor as mpy
# try:
#     import moviepy.editor as mpy
# except ImportError:
#     raise ImportError('Please install moviepy to enable output file')

In [8]:
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 1.25
FONTCOLOR = (255, 255, 255)  # BGR, white
MSGCOLOR = (128, 128, 128)  # BGR, gray
THICKNESS = 2  # int
LINETYPE = 1

In [9]:
def hex2color(h):
    """Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
    return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))


PLATEBLUE = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
PLATEBLUE = PLATEBLUE.split('-')
PLATEBLUE = [hex2color(h) for h in PLATEBLUE]
PLATEGREEN = '004b23-006400-007200-008000-38b000-70e000'
PLATEGREEN = PLATEGREEN.split('-')
PLATEGREEN = [hex2color(h) for h in PLATEGREEN]

def visualize(pose_config,
              frames,
              annotations,
              pose_results,
              action_result,
              plate=PLATEBLUE,
              max_num=5):
    """Visualize frames with predicted annotations.

    Args:
        frames (list[np.ndarray]): Frames for visualization, note that
            len(frames) % len(annotations) should be 0.
        annotations (list[list[tuple]]): The predicted spatio-temporal
            detection results.
        pose_results: The pose results.
        action_result (str): The predicted action recognition results.
        pose_model (nn.Module): The constructed pose model.
        plate (str): The plate used for visualization. Default: PLATEBLUE.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5.

    Returns:
        list[np.ndarray]: Visualized frames.
    """

    act_res = 'tidak berbahaya'
    
    assert max_num + 1 <= len(plate)
    frames_ = cp.deepcopy(frames)
    frames_ = [mmcv.imconvert(f, 'bgr', 'rgb') for f in frames_]
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = len(frames) // len(annotations)
    anno = None
    h, w, _ = frames[0].shape
    scale_ratio = np.array([w, h, w, h])

    # add pose results
    for (i, d, f) in zip(range(na), pose_results, frames_):
        anno = annotations[i]
        if anno is None:
            continue
        for p, ann in enumerate(anno):
            # for j in range(17):
            #     if len(d['keypoints']) != 0:
            #         print(d['keypoints'][p][j][0])
                    # cv2.circle(f, (d['keypoints'][p][j][0] * scale_ratio, d['keypoints'][p][j][1] * scale_ratio), 3, colors_hp[j], -1)
            frames_[i] = f

    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_[ind]

            # add action result for whole video
            # textsize = cv2.getTextSize(action_result, FONTFACE, FONTSCALE,
            #                            THICKNESS)[0]
            
            # textwidth = textsize[0]
            # location = (10, 14)
            # diag0 = (location[0] + textwidth, location[1] - 14)
            # diag1 = (location[0], location[1] + 22)
            # cv2.rectangle(frame, diag0, diag1, (0, 119, 182), -1)
            # bahaya = ['kekerasan bersenjata', 'kekerasan fisik', 'berbahaya']
            # if action_result in bahaya:
            #     cv2.putText(frame, action_result, (10, 30), FONTFACE, FONTSCALE,
            #                 (255, 0, 0), THICKNESS, LINETYPE)
            # else:
            #     cv2.putText(frame, action_result, (10, 30), FONTFACE, FONTSCALE,
            #                 FONTCOLOR, THICKNESS, LINETYPE)

            # add spatio-temporal action detection results
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                # cv2.rectangle(frame, st, ed, plate[0], 2)

                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    text = ': '.join([text, f'{(score[k]*100):.1f}%'])
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    bahaya = ['melempar', 'membidik senapan', 'membidik pistol', 'memukul', 'menendang', 'menusuk']
                    if lb in bahaya:
                        cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                    (255, 0, 0), THICKNESS, LINETYPE)
                        # act_res = 'berbahaya'
                    else:
                        cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                    FONTCOLOR, THICKNESS, LINETYPE)

                    # textsize = cv2.getTextSize(act_res, FONTFACE, FONTSCALE,
                    #                            THICKNESS)[0]
                    # textwidth = textsize[0]
                    # location = (10, 14)
                    # diag0 = (location[0] + textwidth, location[1] - 14)
                    # diag1 = (location[0], location[1] + 22)
                    # cv2.rectangle(frame, diag0, diag1, (0, 119, 182), -1)
                    # if act_res == 'berbahaya':
                    #     cv2.putText(frame, act_res, (10, 30), FONTFACE, FONTSCALE,
                    #                 (255, 0, 0), THICKNESS, LINETYPE)
                    # else:
                    #     cv2.putText(frame, act_res, (10, 30), FONTFACE, FONTSCALE,
                    #                 FONTCOLOR, THICKNESS, LINETYPE)

    return frames_

In [10]:
video = demo
out_filename = output_dir + '0516_20s_out.mp4'

# rgb-based spatio temporal detection config
rgb_stdet_config =  "mmaction2/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py"
rgb_stdet_checkpoint = "https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth"

# human detection config
det_config = 'mmaction2/demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py'
det_checkpoint = 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'
det_score_thr = 0.9
#det_cat_id = 0

# pose estimation config
pose_config = 'mmaction2/demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'

# action classification config
skeleton_config = "mmaction2/configs/skeleton/posec3d/ciis_not-multi.py"
action_score_thr = 0.6
# skeleton-based action recognition checkpoint
skeleton_checkpoint = "https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth"
# skeleton-based spatio temporal detection checkpoint
skeleton_stdet_checkpoint = "work_dirs/ciis_not-multi_10_best-550/best_acc_top1_epoch_550.pth"

# use skeleton-based method
use_skeleton_stdet = True
use_skeleton_recog = True

# label_map_stdet = "mmaction2/tools/data/ciis/label_map_no-berdiri.txt"
label_map_stdet = "mmaction2/tools/data/ciis/label_map.txt"
label_map = "mmaction2/tools/data/kinetics/label_map_k400.txt"


# rgb-based action recognition config
rgb_config = "configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py"
rgb_checkpoint = "https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth"

predict_stepsize = 4  # must even int, give out a spatio-temporal detection prediction per n frames
output_stepsize = 1  # show one frame per n frames in the demo, we should have: predict_stepsize % output_stepsize == 0, speedUp/slowDown video output
output_fps = 12  # the fps of demo video output, will speedUp/slowDown video output, must equal to (video_input_fps/output_stepsize) to get normal speed

In [11]:
device = 'cuda:0'

In [12]:
# def parse_args():
#     parser = argparse.ArgumentParser(description='MMAction2 demo')
#     parser.add_argument(
#         '--cfg-options',
#         nargs='+',
#         action=DictAction,
#         default={},
#         help='override some settings in the used config, the key-value pair '
#         'in xxx=yyy format will be merged into config file. For example, '
#         "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
#     args = parser.parse_args()
#     return args

In [13]:
def load_label_map(file_path):
    """Load Label Map.

    Args:
        file_path (str): The file path of label map.

    Returns:
        dict: The label map (int -> label name).
    """
    lines = open(file_path).readlines()
    lines = [x.strip().split(': ') for x in lines]
    return {int(x[0]): x[1] for x in lines}


def abbrev(name):
    """Get the abbreviation of label name:

    'take (an object) from (a person)' -> 'take ... from ...'
    """
    while name.find('(') != -1:
        st, ed = name.find('('), name.find(')')
        name = name[:st] + '...' + name[ed + 1:]
    return name


def pack_result(human_detection, result, img_h, img_w):
    """Short summary.

    Args:
        human_detection (np.ndarray): Human detection result.
        result (type): The predicted label of each human proposal.
        img_h (int): The image height.
        img_w (int): The image width.

    Returns:
        tuple: Tuple of human proposal, label name and label score.
    """
    try:
        human_detection[:, 0::2] /= img_w
        human_detection[:, 1::2] /= img_h
    except:
        pass
    results = []
    if result is None:
        return None
    for prop, res in zip(human_detection, result):
        res.sort(key=lambda x: -x[1])
        results.append(
            (prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
                                                            for x in res]))
    return results


def expand_bbox(bbox, h, w, ratio=1.25):
    x1, y1, x2, y2 = bbox
    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2
    width = x2 - x1
    height = y2 - y1

    square_l = max(width, height)
    new_width = new_height = square_l * ratio

    new_x1 = max(0, int(center_x - new_width / 2))
    new_x2 = min(int(center_x + new_width / 2), w)
    new_y1 = max(0, int(center_y - new_height / 2))
    new_y2 = min(int(center_y + new_height / 2), h)
    return (new_x1, new_y1, new_x2, new_y2)


def cal_iou(box1, box2):
    xmin1, ymin1, xmax1, ymax1 = box1
    xmin2, ymin2, xmax2, ymax2 = box2

    s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
    s2 = (xmax2 - xmin2) * (ymax2 - ymin2)

    xmin = max(xmin1, xmin2)
    ymin = max(ymin1, ymin2)
    xmax = min(xmax1, xmax2)
    ymax = min(ymax1, ymax2)

    w = max(0, xmax - xmin)
    h = max(0, ymax - ymin)
    intersect = w * h
    union = s1 + s2 - intersect
    iou = intersect / union

    return iou


def skeleton_based_action_recognition(skeleton_config, skeleton_checkpoint, device, label_map, pose_results, h, w):
    label_map = [x.strip() for x in open(label_map).readlines()]
    num_class = len(label_map)

    skeleton_config = mmengine.Config.fromfile(skeleton_config)
    skeleton_config.model.cls_head.num_classes = num_class  # for K400 dataset

    skeleton_model = init_recognizer(
        skeleton_config, skeleton_checkpoint, device=device)
    result = inference_skeleton(skeleton_model, pose_results, (h, w))
    action_idx = result.pred_score.argmax().item()
    return label_map[action_idx]


def rgb_based_action_recognition(rgb_config, rgb_checkpoint, device, video, label_map):
    rgb_config = mmengine.Config.fromfile(rgb_config)
    rgb_config.model.backbone.pretrained = None
    rgb_model = init_recognizer(rgb_config, rgb_checkpoint, device)
    action_results = inference_recognizer(rgb_model, video)
    rgb_action_result = action_results.pred_score.argmax().item()
    label_map = [x.strip() for x in open(label_map).readlines()]
    return label_map[rgb_action_result]

def skeleton_based_stdet(predict_stepsize, skeleton_config, skeleton_stdet_checkpoint, device, action_score_thr, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    skeleton_config = mmengine.Config.fromfile(skeleton_config)
    num_class = max(label_map.keys()) + 1  # for AVA dataset (80 + 1), for CIIS dataset (9 + 1) == len(label_map)
    skeleton_config.model.cls_head.num_classes = num_class
    skeleton_stdet_model = init_recognizer(skeleton_config,
                                           skeleton_stdet_checkpoint,
                                           device)

    # skeleton_stdet_model.eval()
    skeleton_predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:  # no people detected
            skeleton_predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        num_frame = len(frame_inds)  # 30

        pose_result = [pose_results[ind] for ind in frame_inds]

        skeleton_prediction = []
        for i in range(proposal.shape[0]):  # num_person
            skeleton_prediction.append([])

            fake_anno = dict(
                frame_dict='',
                label=-1,
                img_shape=(h, w),
                origin_shape=(h, w),
                start_index=0,
                modality='Pose',
                num_clips=1,
                clip_len=clip_len,
                total_frames=num_frame)
            num_person = 1

            num_keypoint = 17
            keypoint = np.zeros(
                (num_person, num_frame, num_keypoint, 2))  # M T V 2
            keypoint_score = np.zeros(
                (num_person, num_frame, num_keypoint))  # M T V

            # pose matching
            person_bbox = proposal[i][:4]  #x1, y1, x2, y2
            area = expand_bbox(person_bbox, h, w)

            for j, poses in enumerate(pose_result):  # num_frame
                max_iou = float('-inf')
                index = -1
                if len(poses['keypoints']) == 0:
                    continue
                for k, bbox in enumerate(poses['bboxes']):  # num_person
                    iou = cal_iou(bbox, area)
                    if max_iou < iou:  # if isBelong
                        index = k
                        max_iou = iou
                keypoint[0, j] = poses['keypoints'][index]
                keypoint_score[0, j] = poses['keypoint_scores'][index]

            fake_anno['keypoint'] = keypoint
            fake_anno['keypoint_score'] = keypoint_score

            output = inference_recognizer(skeleton_stdet_model, fake_anno)
            # for multi-label recognition
            score = output.pred_score.tolist()
            for k in range(len(score)):  # 81
                if k not in label_map:
                    continue
                if score[k] > action_score_thr:
                    skeleton_prediction[i].append((label_map[k], score[k]))
                    
            # crop the image -> resize -> extract pose -> as input for poseC3D

        skeleton_predictions.append(skeleton_prediction)
        prog_bar.update()

    return timestamps, skeleton_predictions


def rgb_based_stdet(rgb_stdet_config, rgb_stdet_checkpoint, device, action_score_thr, predict_stepsize, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):

    rgb_stdet_config = mmengine.Config.fromfile(rgb_stdet_config)
    # rgb_stdet_config.merge_from_dict(args.cfg_options)

    val_pipeline = rgb_stdet_config.val_pipeline
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'

    window_size = clip_len * frame_interval
    num_frame = len(frames)
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)

    # Get img_norm_cfg
    img_norm_cfg = dict(
        mean=np.array(rgb_stdet_config.model.data_preprocessor.mean),
        std=np.array(rgb_stdet_config.model.data_preprocessor.std),
        to_rgb=False)

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        rgb_stdet_config['model']['test_cfg']['rcnn'] = dict(action_thr=0)
    except KeyError:
        pass

    rgb_stdet_config.model.backbone.pretrained = None
    rgb_stdet_model = init_detector(
        rgb_stdet_config, rgb_stdet_checkpoint, device=device)

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmengine.ProgressBar(len(timestamps))
    # for timestamp, proposal in zip(timestamps, human_detections):
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)

        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(device)

        datasample = ActionDataSample()
        datasample.proposals = InstanceData(bboxes=proposal)
        datasample.set_metainfo(dict(img_shape=(new_h, new_w)))
        with torch.no_grad():
            result = rgb_stdet_model(
                input_tensor, [datasample], mode='predict')
            scores = result[0].pred_instances.scores
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(scores.shape[1]):
                if i not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if scores[j, i] > action_score_thr:
                        prediction[j].append((label_map[i], scores[j,
                                                                   i].item()))
            predictions.append(prediction)
        prog_bar.update()

    return timestamps, predictions

In [14]:
#args = parse_args()
tmp_dir = tempfile.TemporaryDirectory()
frame_paths, original_frames = frame_extract(
    video, 810, out_dir=tmp_dir.name)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape

In [19]:
np.asarray(original_frames).shape

(122, 1080, 810, 3)

In [20]:
# resize frames to shortside 256
# new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
new_w, new_h = w, h
# frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
frames = original_frames
w_ratio, h_ratio = new_w / w, new_h / h

In [21]:
# Load spatio-temporal detection label_map
stdet_label_map = load_label_map(label_map_stdet)
rgb_stdet_config = mmengine.Config.fromfile(rgb_stdet_config)
#rgb_stdet_config.merge_from_dict(args.cfg_options)
try:
    if rgb_stdet_config['data']['train']['custom_classes'] is not None:
        stdet_label_map = {
            id + 1: stdet_label_map[cls]
            for id, cls in enumerate(rgb_stdet_config['data']['train']
                                     ['custom_classes'])
        }
except KeyError:
    pass

In [22]:
# action_result = None
# if use_skeleton_recog:
#     print('Use skeleton-based recognition')
#     action_result = skeleton_based_action_recognition(
#         skeleton_config, skeleton_checkpoint, device, label_map, pose_results, h, w)
# else:
#     print('Use rgb-based recognition')
#     action_result = rgb_based_action_recognition(rgb_config, rgb_checkpoint, device, video, label_map)

In [23]:
stdet_preds = None
if use_skeleton_stdet:
    print('Use skeleton-based SpatioTemporal Action Detection')
    # clip_len, frame_interval = 30, 1
    clip_len, frame_interval = predict_stepsize, 1
    timestamps, stdet_preds = skeleton_based_stdet(predict_stepsize,
                                                   skeleton_config,
                                                   skeleton_stdet_checkpoint,
                                                   device,
                                                   action_score_thr,
                                                   stdet_label_map,
                                                   human_detections,
                                                   pose_results, num_frame,
                                                   clip_len,
                                                   frame_interval, h, w)
    for i in range(len(human_detections)):
        det = human_detections[i]
        try:
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(device)
        except:
            pass

else:
    print('Use rgb-based SpatioTemporal Action Detection')
    for i in range(len(human_detections)):
        det = human_detections[i]
        # if type(det[:, 0:4:2]) == type((0, 0)):
        #     det[:, 0:4:2] *= w_ratio
        #     det[:, 1:4:2] *= h_ratio
        # else:


        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(device)
    timestamps, stdet_preds = rgb_based_stdet(rgb_stdet_config,
                                              rgb_stdet_checkpoint,
                                              device,
                                              action_score_thr,
                                              predict_stepsize,
                                              frames,
                                              stdet_label_map,
                                              human_detections, w, h,
                                              new_w, new_h, w_ratio,
                                              h_ratio)

Use skeleton-based SpatioTemporal Action Detection
Loads checkpoint by local backend from path: work_dirs/ciis_not-multi_10_best-550/best_acc_top1_epoch_550.pth
Performing SpatioTemporal Action Detection for each clip
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>   ] 28/30, 48.5 task/s, elapsed: 1s, ETA:     0s

In [24]:
stdet_results = []
for timestamp, prediction in zip(timestamps, stdet_preds):
    human_detection = human_detections[timestamp - 1]
    stdet_results.append(
        pack_result(human_detection, prediction, new_h, new_w))

def dense_timestamps(timestamps, n):
    """Make it nx frames."""
    old_frame_interval = (timestamps[1] - timestamps[0])
    start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
    new_frame_inds = np.arange(
        len(timestamps) * n) * old_frame_interval / n + start
    return new_frame_inds.astype(np.int64)

dense_n = int(predict_stepsize / output_stepsize)
# output_timestamps = dense_timestamps(timestamps, dense_n)
output_timestamps = dense_timestamps(timestamps, dense_n) + 1
imgs = [imgs[timestamp] for timestamp in output_timestamps]
frames = [
    cv2.imread(frame_paths[timestamp - 1])
    # cv2.imread("../../../Downloads/1280x720-white-solid-color-background.jpg")
    for timestamp in output_timestamps
]

In [25]:
# #loop until found berbahaya
# action_result = 'tidak berbahaya'
# bahaya = ['melempar', 'membidik senapan', 'membidik pistol', 'memukul', 'menendang', 'menusuk']
# for prediction_step in stdet_results:
#     for person_prop in prediction_step:
#         for label in person_prop[1]:
#             if label in bahaya:
#                 action_result = 'berbahaya'

In [26]:
len(output_timestamps)

120

In [27]:
vis_frames = visualize(pose_config, np.array(imgs), stdet_results, pose_results,
                       "tidak berbahaya")
vid = mpy.ImageSequenceClip(vis_frames, fps=output_fps)
vid.write_videofile(out_filename)
tmp_dir.cleanup()

Moviepy - Building video ../Thermal/CenterNet-ThermalPose-master/osd/0516_20s_out.mp4.
Moviepy - Writing video ../Thermal/CenterNet-ThermalPose-master/osd/0516_20s_out.mp4



                                                                                                     

Moviepy - Done !
Moviepy - video ready ../Thermal/CenterNet-ThermalPose-master/osd/0516_20s_out.mp4
