In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import collections

In [2]:
from object_detection.utils import ops as utils_ops
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util

# patch tf1 into `utils.ops`
utils_ops.tf = tf.compat.v1

# Patch the location of gfile
tf.gfile = tf.io.gfile

In [3]:
def load_model(model_path):
    model = tf.saved_model.load(model_path)
    model = model.signatures['serving_default']

    return model

# Load face detector
PATH_TO_LABELS = './models/label_map.pbtxt'
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)
detection_model_path = './models/face_tf_trt_FP16'

# Load hand sign recognizer
hand_sign_classes = ["0_front", "1_back", "1_front", "2_back", "2_front", "5_front", "ILU"]
classification_model_path = './models/hand_sign_tf_trt_FP16'

face_detector = load_model(detection_model_path)
hand_sign_recongizer = load_model(classification_model_path)

In [4]:
def run_inference_for_single_image(model, image):
    image = np.asarray(image)
    # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
    input_tensor = tf.convert_to_tensor(image)
    # The model expects a batch of images, so add an axis with `tf.newaxis`.
    input_tensor = input_tensor[tf.newaxis,...]

    # Run inference
    output_dict = model(input_tensor)

    # All outputs are batches tensors.
    # Convert to numpy arrays, and take index [0] to remove the batch dimension.
    # We're only interested in the first num_detections.
    num_detections = int(output_dict.pop('num_detections'))
    output_dict = {key:value[0, :num_detections].numpy() 
                    for key,value in output_dict.items()}
    output_dict['num_detections'] = num_detections

    # detection_classes should be ints.
    output_dict['detection_classes'] = output_dict['detection_classes'].astype(np.int64)

    return output_dict

In [5]:
def convert_to_absolute(im_height, im_width, box):
    box_abs = []
    box_abs = [box[0] * im_height,
               box[1] * im_width,
               box[2] * im_height,
               box[3] * im_width]
    
    return box_abs

In [6]:
def convert_to_hand_id_dict(face_id_dict):
    hand_id_dict = {}
    for face_id in face_id_dict.items():
        face_box = face_id[0]
        
        y_min = face_box[0]
        x_min = face_box[1]
        y_max = face_box[2]
        x_max = face_box[3]

        hand_box = [y_min, x_min-(x_max-x_min), y_max, x_min]

        x_offset = (hand_box[3]-hand_box[1])*0.5
        y_offset = (hand_box[2]-hand_box[0])*0.5

        hand_box[1] -= 2*x_offset
        hand_box[2] += y_offset

        for i in range(len(hand_box)):
            if(hand_box[i] <= 0.0):
                hand_box[i] = 0.0
            elif(hand_box[i] >= 1.0):
                hand_box[i] = 1.0
        
        hand_id_dict[tuple(hand_box)] = face_id[1]

    return hand_id_dict

In [7]:
def predict_hand_sign(image):
    image = cv2.resize(image, (150, 150))
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    input_img = np.expand_dims(image, axis=0)    
    input_img = input_img.astype(np.float32) / 255.
    input_tensor = tf.constant(input_img)
    
    result = hand_sign_recongizer(input_tensor)
    preds = result['dense_1'].numpy()
    return preds

    
def predict_hand_sign_and_visualize(image, hand_id_dict):
    im_height, im_width, _ = image.shape
    
    box_to_display_str_map = collections.defaultdict(list)
    box_to_track_ids_map = {}

    for hand_id in hand_id_dict.items():
        hand_box = convert_to_absolute(im_height, im_width, list(hand_id[0]))
        print(hand_box)
        hand_img = image[int(hand_box[0]):int(hand_box[2]),
                         int(hand_box[1]):int(hand_box[3])]
        
        if hand_img.size == 0:
            continue
            
        preds = predict_hand_sign(np.array(hand_img))
        
        if np.amax(preds[0]) > 0.85:
            class_idx = np.argmax(preds[0])
            class_name = hand_sign_classes[class_idx]
            class_score = int(np.amax(preds[0])*100)
            display_str = str(class_name)
            display_str = '{}: {}%'.format(display_str, class_score)
        else:
            display_str = 'Try Again'
        
        box_to_display_str_map[hand_id[0]].append(display_str)
            
    for box, display_str in box_to_display_str_map.items():
        ymin, xmin, ymax, xmax = box
        vis_util.draw_bounding_box_on_image_array(image,
                                                  ymin,
                                                  xmin,
                                                  ymax,
                                                  xmax,
                                                  color='green',
                                                  thickness=4,
                                                  display_str_list=display_str,
                                                  use_normalized_coordinates=True)
    
#     return box_to_track_ids_map

In [8]:
# Frame's Width, Height
FRAME_WIDTH = 640
FRAME_HEIGHT = 480

# Initialize webcam feed
video = cv2.VideoCapture(0)

ret = video.set(3, FRAME_WIDTH)
ret = video.set(4, FRAME_HEIGHT)

# Maximum objects
person_ids = list(range(10))

while(True):
    ret, frame = video.read()
    output_dict = run_inference_for_single_image(face_detector, cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    face_id_dict = vis_util.visualize_boxes_and_labels_on_image_array(image=frame,
                                                                      boxes=output_dict['detection_boxes'],
                                                                      classes=output_dict['detection_classes'],
                                                                      scores=output_dict['detection_scores'],
                                                                      category_index=category_index,
                                                                      track_ids = person_ids,
                                                                      use_normalized_coordinates=True,
                                                                      line_thickness=8)
    
    hand_id_dict = convert_to_hand_id_dict(face_id_dict)
    predict_hand_sign_and_visualize(frame, hand_id_dict)
    
    cv2.imshow('Object detector', frame)
    
    # Press 'esc' to quit
    if cv2.waitKey(1) == 27:
        break

# Clean up
video.release()
cv2.destroyAllWindows()

[53.19243907928467, 53.154335021972656, 307.6220655441284, 336.3675308227539]
[52.00632333755493, 56.41529083251953, 308.92715334892273, 337.1500015258789]
[50.93723773956299, 53.16112518310547, 310.21153450012207, 336.45519256591797]
[44.54976797103882, 53.96453857421875, 311.8705093860626, 336.5299987792969]
[45.145812034606934, 55.553321838378906, 310.6009912490845, 336.90357208251953]
[43.528404235839844, 38.83811950683594, 316.4129590988159, 328.6255645751953]
[42.43319034576416, 34.24598693847656, 317.7179145812988, 327.69065856933594]
[42.27416753768921, 39.12261962890625, 317.40949273109436, 329.09698486328125]
[42.35780954360962, 38.373565673828125, 317.2157084941864, 328.690185546875]
[43.35826635360718, 40.43689727783203, 317.1479380130768, 329.1104507446289]
[42.89018154144287, 39.08924102783203, 316.95992946624756, 329.11243438720703]
[43.906567096710205, 37.34039306640625, 316.62668108940125, 328.4942626953125]
[45.323216915130615, 35.92926025390625, 315.0542986392975, 32

[35.28618335723877, 48.747406005859375, 326.0166120529175, 343.3744812011719]
[37.68887758255005, 49.11857604980469, 322.4871289730072, 343.47251892089844]
[36.557815074920654, 48.916358947753906, 324.83630776405334, 343.4262466430664]
[36.95350170135498, 50.107765197753906, 323.8670825958252, 343.8167190551758]
[41.5455436706543, 51.18263244628906, 319.15533542633057, 343.2445526123047]
[39.37002897262573, 49.91222381591797, 321.88735127449036, 343.3164596557617]
[37.826220989227295, 51.573829650878906, 326.85410141944885, 343.82999420166016]
[38.18743944168091, 52.039031982421875, 324.63891863822937, 343.7403869628906]
[39.43936586380005, 51.74415588378906, 321.18715167045593, 343.8109588623047]
[41.8630313873291, 48.63903045654297, 321.767635345459, 343.03539276123047]
[36.93979740142822, 48.69850158691406, 323.0616760253906, 342.84385681152344]
[40.557496547698975, 52.275428771972656, 321.9250738620758, 343.81290435791016]
[37.428460121154785, 49.74609375, 321.2510633468628, 342.99

[18.769032955169678, 33.425254821777344, 361.6345989704132, 373.1423568725586]
[20.381669998168945, 33.289794921875, 359.3125534057617, 373.26904296875]
[19.224514961242676, 37.01637268066406, 359.89134550094604, 374.5305633544922]
[20.314664840698242, 35.732688903808594, 361.64228439331055, 374.14867401123047]
[22.48467206954956, 35.615997314453125, 357.7918589115143, 374.3684387207031]
[24.347012042999268, 37.4920654296875, 358.6745488643646, 375.55023193359375]
[24.236905574798584, 41.597442626953125, 358.6597788333893, 376.3087463378906]
[25.947418212890625, 39.59739685058594, 359.4029903411865, 376.44996643066406]
[24.72313642501831, 41.167030334472656, 359.3365967273712, 376.55506134033203]
[25.732076168060303, 43.2049560546875, 357.9377281665802, 376.5763854980469]
[23.26251983642578, 44.94163513183594, 358.59829902648926, 377.04750061035156]
[24.582996368408203, 42.3687744140625, 357.9293918609619, 376.1883544921875]
[25.361194610595703, 44.895973205566406, 354.32589054107666, 