In [1]:
import os
import cv2
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import collections

In [2]:
from object_detection.utils import ops as utils_ops
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util

# Rebind the `tf` name inside `object_detection.utils.ops` to the TF1
# compatibility API (`tf.compat.v1`).
# NOTE(review): presumably needed because those utilities were written against
# the TF1 API -- confirm against the installed object_detection version.
utils_ops.tf = tf.compat.v1

# Expose `tf.io.gfile` under the legacy attribute name `tf.gfile`.
# NOTE(review): presumably for object_detection helpers that still reference
# `tf.gfile` -- confirm.
tf.gfile = tf.io.gfile

In [3]:
def load_model(model_path):
    """Load a TensorFlow SavedModel and return its default serving signature.

    Args:
        model_path: Path passed to `tf.saved_model.load`.

    Returns:
        The entry stored under the 'serving_default' key of the loaded
        model's `signatures` mapping.
    """
    saved_model = tf.saved_model.load(model_path)
    # NOTE(review): only the signature is returned; the loaded object itself is
    # not kept by this function -- confirm that is safe for the TF version in use.
    return saved_model.signatures['serving_default']

# Face detector: path of the SavedModel handed to `load_model` below.
detection_model_path = './models/face_tf_trt_FP16'

# Hand sign recognizer: class labels indexed by the classifier's output index.
# Index 5 ("5_front") is the one the detection code treats as the command sign.
hand_sign_classes = ["0_front", "1_back", "1_front", "2_back", "2_front", "5_front", "ILU"]
# Alternative SavedModel-format classifiers (disabled; they pair with the
# commented-out `load_model` call below):
# classification_model_path = './models/hand_sign_tf_trt_FP16'
# classification_model_path = './models/hand_sign_saved_model'
classification_model_path = './models/hand_sign.h5'

# Load both models once at cell execution time.
face_detector = load_model(detection_model_path)
# hand_sign_recongizer = load_model(classification_model_path)
# The (misspelled) name `hand_sign_recongizer` is read by `predict_hand_sign`,
# so it must stay in sync with that function.
hand_sign_recongizer = tf.keras.models.load_model(classification_model_path)

In [4]:
def run_inference_for_single_image(model, image):
    """Run `model` on one image and return its outputs as NumPy arrays.

    Args:
        model: Callable that takes a batched tensor and returns a dict of
            batched outputs including 'num_detections' and
            'detection_classes'.
        image: Single image; converted with `np.asarray` and given a leading
            batch axis before being passed to `model`.

    Returns:
        dict mapping each output name to a NumPy array with the batch
        dimension removed and truncated to the first `num_detections`
        entries. 'num_detections' is stored as an int and
        'detection_classes' is cast to `np.int64`.
    """
    # The model expects a batch, so prepend an axis to the converted tensor.
    batched_input = tf.convert_to_tensor(np.asarray(image))[tf.newaxis, ...]

    raw_outputs = model(batched_input)

    # Every output is batched: keep batch item 0 and only the valid detections.
    num_detections = int(raw_outputs.pop('num_detections'))
    detections = {}
    for name, tensor in raw_outputs.items():
        detections[name] = tensor[0, :num_detections].numpy()
    detections['num_detections'] = num_detections

    # Class ids should be integers.
    detections['detection_classes'] = detections['detection_classes'].astype(np.int64)

    return detections

In [5]:
def convert_to_absolute(im_height, im_width, box):
    """Scale a box by the image size.

    Args:
        im_height: Factor applied to elements 0 and 2 of `box`.
        im_width: Factor applied to elements 1 and 3 of `box`.
        box: Indexable with at least four elements, read in the order
            (ymin, xmin, ymax, xmax) by the callers in this notebook.

    Returns:
        New four-element list with the scaled values in the same order.
    """
    # Rows scale with the height, columns with the width.
    scales = (im_height, im_width, im_height, im_width)
    return [box[axis] * scales[axis] for axis in range(4)]


def convert_to_predict_box(face_box_to_track_ids_map):
    """Derive, for every face box, the box in which a hand sign is searched.

    The search box starts one face-width to the side of the face (its right
    edge is the face's `x_min`), is then widened further in the same direction
    and extended vertically, and finally every coordinate is clamped to
    [0.0, 1.0].

    Args:
        face_box_to_track_ids_map: dict mapping (y_min, x_min, y_max, x_max)
            face boxes to track ids.

    Returns:
        dict mapping each derived (y_min, x_min, y_max, x_max) tuple to the
        track id of the face it came from. Faces that yield the same derived
        box overwrite each other (the last one wins).
    """
    def _clamp_to_unit(value):
        # Values at or beyond a bound are replaced by that bound as a float.
        if value <= 0.0:
            return 0.0
        if value >= 1.0:
            return 1.0
        return value

    predicted = {}
    for face_box, track_id in face_box_to_track_ids_map.items():
        y_min, x_min, y_max, x_max = face_box[0], face_box[1], face_box[2], face_box[3]

        # Box of the same size as the face, shifted one face-width towards x=0.
        left = x_min - (x_max - x_min)
        half_width = (x_min - left) * 0.5
        half_height = (y_max - y_min) * 0.5

        # Grow the box: a little upwards, a lot sideways and downwards.
        raw_box = (y_min - 0.5 * half_height,
                   left - 2.5 * half_width,
                   y_max + 2 * half_height,
                   x_min)

        predicted[tuple(_clamp_to_unit(coord) for coord in raw_box)] = track_id

    return predicted

In [6]:
# Tensorflow Saved Model style
# def predict_hand_sign(image):
#     image = cv2.resize(image, (150, 150))
#     image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
#     input_img = np.expand_dims(image, axis=0)    
#     input_img = input_img.astype(np.float32) / 255.
#     input_tensor = tf.constant(input_img)
    
#     result = hand_sign_recongizer(input_tensor)
#     preds = result['dense_1'].numpy()
#     return preds


# Keras model(*.h5) style
def predict_hand_sign(image):
    """Classify the hand sign shown in an image crop.

    The crop is resized to 150x150, passed through
    `cv2.cvtColor(..., cv2.COLOR_RGB2BGR)`, turned into a float32 batch of one
    scaled by 1/255 and fed to the module-level `hand_sign_recongizer`.

    Args:
        image: Image array accepted by `cv2.resize`.
            NOTE(review): callers in this notebook pass crops of frames read
            through `cv2.VideoCapture` -- confirm the resulting channel order
            matches what the classifier was trained on.

    Returns:
        Whatever `hand_sign_recongizer` returns for the one-image batch;
        callers index it with `[0]` to get the per-class scores.
    """
    resized = cv2.resize(image, (150, 150))
    reordered = cv2.cvtColor(resized, cv2.COLOR_RGB2BGR)

    # Add the batch axis and scale pixel values by 1/255.
    batch = reordered[np.newaxis, ...].astype(np.float32) / 255.

    return hand_sign_recongizer(batch)


def visualize_box(bg_image, box, display_str, color):
    """Draw a labelled bounding box on `bg_image` in place.

    Args:
        bg_image: Image array forwarded to
            `vis_util.draw_bounding_box_on_image_array`.
        box: (ymin, xmin, ymax, xmax); forwarded with
            `use_normalized_coordinates=True`.
        display_str: Label text. Either a single string or a sequence of
            strings (one per label line). `None` draws the box without a label.
        color: Colour name forwarded to the drawing helper.

    Returns:
        None.
    """
    ymin, xmin, ymax, xmax = box

    # The drawing helper iterates over `display_str_list`; a bare string would
    # be split into one label line per character, so wrap it in a list.
    if display_str is None:
        display_str_list = ()
    elif isinstance(display_str, str):
        display_str_list = [display_str]
    else:
        display_str_list = display_str

    vis_util.draw_bounding_box_on_image_array(bg_image,
                                              ymin,
                                              xmin,
                                              ymax,
                                              xmax,
                                              color=color,
                                              thickness=4,
                                              display_str_list=display_str_list,
                                              use_normalized_coordinates=True)

In [7]:
def detect_face_and_get_command_box(image,
                                    boxes,
                                    classes,
                                    scores,
                                    track_ids,
                                    use_normalized_coordinates=False,
                                    max_boxes_to_draw=20,
                                    min_score_thresh=.5,
                                    line_thickness=4):
    """Find faces, then the boxes next to them that show the command sign.

    Args:
        image: Frame as an array with three dimensions (height, width,
            channels). Command boxes are drawn onto it in place.
        boxes: Array of detected boxes; row `i` belongs to `track_ids[i]`.
        classes: Accepted but not used by this function.
        scores: Per-box scores, or None to accept every box.
        track_ids: Sequence of ids indexed by detection position.
        use_normalized_coordinates: Accepted but not used by this function.
        max_boxes_to_draw: Maximum number of boxes considered; a falsy value
            means "all rows of `boxes`".
        min_score_thresh: A box is kept only if its score is strictly greater.
        line_thickness: Accepted but not used by this function.

    Returns:
        dict mapping each command box (tuple of normalized coordinates) to the
        track id of the face it was derived from. Empty if nothing was found.
    """
    # 1. Collect the face boxes that pass the score threshold.
    limit = max_boxes_to_draw if max_boxes_to_draw else boxes.shape[0]
    faces = {}
    for idx in range(min(limit, boxes.shape[0])):
        if scores is not None and not (scores[idx] > min_score_thresh):
            continue
        face_box = tuple(boxes[idx].tolist())
        faces[face_box] = track_ids[idx]

    # 2. Look for the command sign in the search box derived from each face.
    command_boxes = {}
    if not faces:
        return command_boxes

    frame_h, frame_w, _ = image.shape
    labels_by_box = collections.defaultdict(list)

    for candidate, track_id in convert_to_predict_box(faces).items():
        top, left, bottom, right = (
            int(v) for v in convert_to_absolute(frame_h, frame_w, list(candidate)))
        crop = image[top:bottom, left:right]

        # Degenerate box: nothing to classify.
        if crop.size == 0:
            continue

        preds = predict_hand_sign(np.array(crop))
        # Class index 5 is the command sign; require a confident prediction.
        if np.amax(preds[0]) > 0.85 and np.argmax(preds[0]) == 5:
            labels_by_box[candidate].append('{}: ID {}'.format('command_box', track_id))
            command_boxes[tuple(candidate)] = track_id

    # 3. Draw the command boxes only after every crop has been classified.
    for box, labels in labels_by_box.items():
        visualize_box(image, box, labels, 'LightGrey')

    return command_boxes


def predict_hand_sign_and_get_command(image, command_box_to_track_ids_map):
    """Classify the hand sign inside each command box and return the first hit.

    Args:
        image: Frame as an array with three dimensions (height, width,
            channels). The box of a recognised sign is drawn onto it in place.
        command_box_to_track_ids_map: dict mapping normalized
            (ymin, xmin, ymax, xmax) boxes to track ids.

    Returns:
        The class index (into `hand_sign_classes`) of the first box whose
        prediction has a top score of at least 0.97 and is not class 5 (the
        command sign itself), or None if no box qualifies.
    """
    im_height, im_width, _ = image.shape

    for command_box, track_id in command_box_to_track_ids_map.items():
        command_box_abs = convert_to_absolute(im_height, im_width, list(command_box))
        command_box_img = image[int(command_box_abs[0]):int(command_box_abs[2]),
                                int(command_box_abs[1]):int(command_box_abs[3])]

        # Degenerate box: nothing to classify.
        if command_box_img.size == 0:
            continue

        preds = predict_hand_sign(np.array(command_box_img))

        if np.amax(preds[0]) >= 0.97:
            class_idx = np.argmax(preds[0])
            # Class 5 only marks the command box; every other class is a command.
            if class_idx != 5:
                display_str = '{}: ID {}'.format(hand_sign_classes[class_idx], track_id)
                # The label must be a list of lines: a bare string would be
                # iterated character by character by the drawing helper.
                visualize_box(image, command_box, [display_str], 'Green')

                return class_idx

    return None

In [10]:
def most_frequent(l):
    """Return the element that occurs most often in `l`.

    Args:
        l: Sized collection of hashable elements.

    Returns:
        The most frequent element, or None when `l` is empty. When several
        elements share the highest count, the one that appears first in `l`
        is returned.
    """
    if len(l) == 0:
        return None
    # Counter counts in a single pass and orders equal counts by first
    # occurrence, which keeps the result deterministic.
    return collections.Counter(l).most_common(1)[0][0]
    

# Frame's Width, Height
FRAME_WIDTH = 640
FRAME_HEIGHT = 480

# Seconds during which hand signs are sampled once a command box disappears.
COMMAND_WINDOW_SEC = 2.0
# Key code reported by cv2.waitKey for the 'esc' key.
ESC_KEY = 27

# Initialize webcam feed
video = cv2.VideoCapture(0)
# video = cv2.VideoCapture('/home/young/Desktop/test/5.mp4')
if not video.isOpened():
    # Raise instead of calling exit(): inside a notebook exit() acts on the
    # kernel session rather than just stopping this cell.
    raise RuntimeError("Cannot open video")

try:
    video.set(cv2.CAP_PROP_FRAME_WIDTH, FRAME_WIDTH)
    video.set(cv2.CAP_PROP_FRAME_HEIGHT, FRAME_HEIGHT)
    print('Current FPS : ', video.get(cv2.CAP_PROP_FPS))

    max_detection = 10
    # Track ids are simply the detection indices 0..max_detection-1.
    person_ids = list(range(max_detection))
    # Command boxes found in the previous frame.
    prev = {}
    quit_requested = False

    while not quit_requested:
        ret, frame = video.read()
        if ret is False:
            print("Can't receive frame")
            break

        output_dict = run_inference_for_single_image(face_detector, cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        command_box_to_track_ids_map = detect_face_and_get_command_box(image=frame,
                                                                       boxes=output_dict['detection_boxes'],
                                                                       classes=output_dict['detection_classes'],
                                                                       scores=output_dict['detection_scores'],
                                                                       track_ids=person_ids,
                                                                       use_normalized_coordinates=True,
                                                                       max_boxes_to_draw=max_detection,
                                                                       line_thickness=8)

        cv2.imshow('video', frame)
        # Press 'esc' to quit
        if cv2.waitKey(1) == ESC_KEY:
            break

        # Fewer command boxes than in the previous frame: sample the previous
        # boxes for COMMAND_WINDOW_SEC and report the most frequent sign.
        if len(command_box_to_track_ids_map) < len(prev):
            start_time = time.time()
            command_list = []
            while True:
                ret, frame = video.read()
                if ret is False:
                    print("Can't receive frame")
                    break

                command = predict_hand_sign_and_get_command(frame, prev)

                if command is not None:
                    command_list.append(command)

                cv2.imshow('video', frame)
                # Press 'esc' to quit (leaves the outer loop as well)
                if cv2.waitKey(1) == ESC_KEY:
                    quit_requested = True
                    break

                if time.time() - start_time >= COMMAND_WINDOW_SEC:
                    # No sign may have been recognised in the window; in that
                    # case there is nothing to report.
                    winner = most_frequent(command_list)
                    if winner is not None:
                        print(hand_sign_classes[int(winner)])
                    break

        prev = command_box_to_track_ids_map
finally:
    # Clean up, also when the cell is interrupted or an error occurs.
    video.release()
    cv2.destroyAllWindows()

Current FPS :  30.0
0_front
1_front
0_front
0_front
