In [1]:
import os
import cv2
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import collections

In [2]:
from object_detection.utils import ops as utils_ops
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util

# Patch tf1 into `utils.ops`: rebind the `tf` attribute of the imported
# `object_detection.utils.ops` module to the TF1 compatibility API.
utils_ops.tf = tf.compat.v1

# Patch the location of gfile: expose `tf.io.gfile` under the name `tf.gfile`.
# NOTE(review): presumably the object_detection utilities still look up
# `tf.gfile` -- confirm against the installed version.
tf.gfile = tf.io.gfile

In [3]:
def load_model(model_path):
    """Load a SavedModel and hand back its default serving signature.

    Args:
        model_path: Location of the SavedModel, forwarded unchanged to
            `tf.saved_model.load`.

    Returns:
        The object stored under the 'serving_default' key of the loaded
        model's `signatures` mapping.
    """
    return tf.saved_model.load(model_path).signatures['serving_default']

# Load face detector: location handed to `load_model` below.
detection_model_path = './models/face_tf_trt_FP16'

# Load hand sign recognizer.
# Class names for the recognizer; a predicted class index is used as an index
# into this list to obtain the display name.
hand_sign_classes = ["0_front", "1_back", "1_front", "2_back", "2_front", "5_front", "ILU"]
# Alternative recognizer locations (currently disabled):
# classification_model_path = './models/hand_sign_tf_trt_FP16'
# classification_model_path = './models/hand_sign_saved_model'
classification_model_path = './models/hand_sign.h5'

face_detector = load_model(detection_model_path)
# The recognizer is loaded through Keras; the `load_model` variant below is
# kept for reference but disabled.
# hand_sign_recongizer = load_model(classification_model_path)
hand_sign_recongizer = tf.keras.models.load_model(classification_model_path)

In [4]:
def run_inference_for_single_image(model, image):
    """Run `model` on one image and return its outputs as numpy arrays.

    Args:
        model: Callable that accepts a batched input tensor and returns a
            dict-like object of output tensors containing the keys
            'num_detections' and 'detection_classes'.
        image: A single image; converted with `np.asarray` before use.

    Returns:
        dict: one numpy array per model output, with the batch axis removed
        and truncated to the first `num_detections` entries, plus
        'num_detections' as a Python int. 'detection_classes' is cast to
        np.int64.
    """
    # Convert to a tensor and prepend the batch axis the model expects.
    batched_input = tf.convert_to_tensor(np.asarray(image))[tf.newaxis, ...]

    raw_outputs = model(batched_input)

    # Only the first `num_detections` rows of every output are of interest.
    count = int(raw_outputs.pop('num_detections'))

    detections = {}
    for name, batched_value in raw_outputs.items():
        # Index 0 strips the batch axis before converting to numpy.
        detections[name] = batched_value[0, :count].numpy()
    detections['num_detections'] = count

    # Class ids should be integers.
    detections['detection_classes'] = detections['detection_classes'].astype(np.int64)

    return detections

In [5]:
def convert_to_absolute(im_height, im_width, box):
    """Scale a box to an image of the given size.

    Args:
        im_height: Factor applied to elements 0 and 2 of `box`.
        im_width: Factor applied to elements 1 and 3 of `box`.
        box: Indexable with at least four elements; callers in this file
            pass [ymin, xmin, ymax, xmax].

    Returns:
        list: the four scaled values, in the same order as `box`.
    """
    # Elements alternate between a vertical and a horizontal coordinate.
    scales = (im_height, im_width, im_height, im_width)
    return [box[idx] * scale for idx, scale in enumerate(scales)]


def convert_to_hand_dict(face_dict):
    """Derive one hand search region per face box.

    The region starts as a box of the face's size placed directly to the
    left of the face (smaller x), is then extended upwards, downwards and
    further to the left, and is finally clamped to the range [0.0, 1.0].

    Args:
        face_dict: Mapping of face box (ymin, xmin, ymax, xmax) to an id.

    Returns:
        dict: hand box tuple (ymin, xmin, ymax, xmax) mapped to the id of
        the face it was derived from.
    """
    def _clamp_unit(value):
        # Values inside (0, 1) are returned untouched.
        if value <= 0.0:
            return 0.0
        if value >= 1.0:
            return 1.0
        return value

    hands = {}
    for face_box, face_id in face_dict.items():
        top, left, bottom, right = face_box[0], face_box[1], face_box[2], face_box[3]

        # Mirror the face box to its left: same height, same width.
        region_left = left - (right - left)

        # Half the size of the mirrored box drives all of the padding.
        half_width = (left - region_left) * 0.5
        half_height = (bottom - top) * 0.5

        region = (
            top - 0.5 * half_height,
            region_left - 2.5 * half_width,
            bottom + 1.5 * half_height,
            left,
        )

        hands[tuple(_clamp_unit(edge) for edge in region)] = face_id

    return hands




In [6]:
# Tensorflow Saved Model style
# def predict_hand_sign(image):
#     image = cv2.resize(image, (150, 150))
#     image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
#     input_img = np.expand_dims(image, axis=0)    
#     input_img = input_img.astype(np.float32) / 255.
#     input_tensor = tf.constant(input_img)
    
#     result = hand_sign_recongizer(input_tensor)
#     preds = result['dense_1'].numpy()
#     return preds

# Keras model(*.h5) style
def predict_hand_sign(image):
    """Classify a hand crop with the module-level `hand_sign_recongizer`.

    Args:
        image: Image array accepted by `cv2.resize`.

    Returns:
        Whatever the recognizer returns for a batch of one image; callers in
        this file read the scores for the single image at index 0.
    """
    # Resize to 150x150, then swap the first and third colour channels.
    resized = cv2.resize(image, (150, 150))
    swapped = cv2.cvtColor(resized, cv2.COLOR_RGB2BGR)

    # Add the batch axis and scale pixel values by 1/255 as float32.
    batch = swapped[np.newaxis, ...].astype(np.float32) / 255.

    return hand_sign_recongizer(batch)

In [7]:
def visualize_box(bg_image, box, display_str, color):
    """Draw a labelled bounding box via the object_detection visualization utils.

    Args:
        bg_image: Image array handed to `draw_bounding_box_on_image_array`.
        box: Four values unpacked as (ymin, xmin, ymax, xmax); they are
            passed on as normalized coordinates.
        display_str: Passed on as `display_str_list`.
        color: Colour passed on to the drawing helper.

    Returns:
        None. The drawing is presumably applied to `bg_image` in place by
        the helper -- confirm against the object_detection utilities.
    """
    top, left, bottom, right = box
    vis_util.draw_bounding_box_on_image_array(
        bg_image,
        top, left, bottom, right,
        color=color,
        thickness=4,
        display_str_list=display_str,
        use_normalized_coordinates=True,
    )
    

def detect_face_and_get_face_dict(image,
                                  boxes,
                                  classes,
                                  scores,
                                  track_ids,
                                  use_normalized_coordinates=False,
                                  max_boxes_to_draw=20,
                                  min_score_thresh=.5,
                                  line_thickness=4):
    """Map each sufficiently confident box to a track id.

    Nothing is drawn here: `image`, `classes`, `use_normalized_coordinates`
    and `line_thickness` are accepted but not used by this function.

    Args:
        image: Unused.
        boxes: Array with a `.shape`; row i is one box and supports `.tolist()`.
        classes: Unused.
        scores: Per-box scores, or None to accept every box.
        track_ids: Indexable; element i is the id assigned to box i.
        use_normalized_coordinates: Unused.
        max_boxes_to_draw: Upper bound on the number of leading boxes that
            are considered; a falsy value means all of them.
        min_score_thresh: A box is kept only if its score is strictly
            greater than this value.
        line_thickness: Unused.

    Returns:
        dict: box as a tuple of floats mapped to `track_ids[i]`.
    """
    limit = max_boxes_to_draw if max_boxes_to_draw else boxes.shape[0]
    candidate_count = min(limit, boxes.shape[0])

    return {
        tuple(boxes[idx].tolist()): track_ids[idx]
        for idx in range(candidate_count)
        if scores is None or scores[idx] > min_score_thresh
    }


def predict_hand_sign_and_visualize(image, hand_dict):
    """Classify every hand region of `image` and draw a labelled box for each.

    Args:
        image: Frame with shape (height, width, channels); it is both the
            source of the crops and the image passed to `visualize_box`.
        hand_dict: Mapping whose keys are normalized hand boxes
            (ymin, xmin, ymax, xmax); the values are not used here.

    Returns:
        None.
    """
    frame_height, frame_width, _ = image.shape

    labels_by_box = collections.defaultdict(list)

    for norm_box, _face_id in hand_dict.items():
        pixel_box = convert_to_absolute(frame_height, frame_width, list(norm_box))
        crop = image[int(pixel_box[0]):int(pixel_box[2]),
                     int(pixel_box[1]):int(pixel_box[3])]

        # A region that collapses to zero pixels cannot be classified.
        if crop.size == 0:
            continue

        scores = predict_hand_sign(np.array(crop))[0]

        if np.amax(scores) > 0.85:
            sign_name = str(hand_sign_classes[np.argmax(scores)])
            label = '{}: {}%'.format(sign_name, int(np.amax(scores)*100))
        else:
            # Top score too low to report a class.
            label = 'Try Again'

        labels_by_box[norm_box].append(label)

    for norm_box, labels in labels_by_box.items():
        visualize_box(image, norm_box, labels, 'LightGrey')

In [8]:
def most_frequent(l):
    """Return the element that occurs most often in `l`.

    Counting is done in a single pass instead of calling `l.count` once per
    distinct element. When several elements share the highest count, the one
    that appears first in `l` wins, so the result does not depend on set
    iteration order.

    Args:
        l: Sized iterable of hashable elements.

    Returns:
        The most frequent element, or None when `l` is empty.
    """
    if len(l) == 0:
        return None

    counts = collections.Counter(l)
    # Counter keeps keys in first-seen order and max() returns the first
    # maximal key, which gives the deterministic tie-break described above.
    return max(counts, key=counts.get)


def get_command_box(image, video, hand_dict, duration):
    """Find the hand box that most often shows class index 5 within `duration` seconds.

    Each frame, every hand region is classified. A region whose top score is
    above 0.85 and whose top class index is 5 (the "5_front" entry of
    `hand_sign_classes`) casts a vote for its id and is drawn with the label
    'command_box'. Frames are shown in the 'Frame' window.

    Args:
        image: First frame to process, shape (height, width, channels).
        video: Object with a `read()` method returning `(ret, frame)`; used
            to fetch the following frames.
        hand_dict: Mapping of normalized hand box to id.
        duration: Seconds to keep collecting votes.

    Returns:
        -1 if ESC was pressed, the box (a key of `hand_dict`) whose id got
        the most votes, or None if there were no votes.
    """
    frame_height, frame_width, _ = image.shape
    voted_ids = []
    began_at = time.time()

    while True:
        labels_by_box = collections.defaultdict(list)

        for norm_box, box_id in hand_dict.items():
            pixel_box = convert_to_absolute(frame_height, frame_width, list(norm_box))
            crop = image[int(pixel_box[0]):int(pixel_box[2]),
                         int(pixel_box[1]):int(pixel_box[3])]

            if crop.size == 0:
                continue

            scores = predict_hand_sign(np.array(crop))[0]

            # Written as `not (x > t)` so the comparison itself is unchanged.
            if not np.amax(scores) > 0.85:
                continue

            if np.argmax(scores) == 5:
                voted_ids.append(box_id)
                labels_by_box[norm_box].append('command_box')

        for norm_box, labels in labels_by_box.items():
            visualize_box(image, norm_box, labels, 'LightGrey')

        cv2.imshow('Frame', image)

        # Press 'esc' to quit
        if cv2.waitKey(100) == 27:
            return -1

        if time.time() - began_at >= duration:
            break

        ret, image = video.read()

        if ret is False:
            print("Can't receive frame")
            break

    winning_id = most_frequent(voted_ids)
    if winning_id is None:
        return None

    # Translate the winning id back into its box.
    for norm_box, box_id in hand_dict.items():
        if box_id == winning_id:
            return norm_box
    return None

In [9]:
def get_command(image, video, command_box, duration):
    """Read the hand sign shown inside `command_box` for `duration` seconds.

    Each frame, the command box region is classified. A top score above 0.85
    records the class index as a vote and labels the box with the class
    name; otherwise the box is labelled 'Try Again'. Frames are shown in the
    'Frame' window.

    Args:
        image: First frame to process, shape (height, width, channels).
        video: Object with a `read()` method returning `(ret, frame)`; used
            to fetch the following frames.
        command_box: Normalized box (ymin, xmin, ymax, xmax) to watch.
        duration: Seconds to keep collecting votes.

    Returns:
        -1 if ESC was pressed, the most frequent class index among the
        confident predictions, or None if there were none.
    """
    predictions = []

    im_height, im_width, _ = image.shape
    command_box_abs = convert_to_absolute(im_height, im_width, list(command_box))
    top, left = int(command_box_abs[0]), int(command_box_abs[1])
    bottom, right = int(command_box_abs[2]), int(command_box_abs[3])

    start_time = time.time()

    while True:
        command_image = image[top:bottom, left:right]

        # An empty crop must only skip classification. Skipping the rest of
        # the iteration would bypass the timeout, the ESC check and the next
        # frame read, leaving the loop spinning forever on the same frame.
        if command_image.size != 0:
            preds = predict_hand_sign(np.array(command_image))

            if np.amax(preds[0]) > 0.85:
                class_idx = np.argmax(preds[0])
                predictions.append(class_idx)
                display_str = str(hand_sign_classes[class_idx])
            else:
                display_str = 'Try Again'

            visualize_box(image, command_box, [display_str], 'LightGreen')

        cv2.imshow('Frame', image)

        # Press 'esc' to quit
        if cv2.waitKey(100) == 27:
            return -1

        if time.time() - start_time >= duration:
            break

        ret, image = video.read()
        if ret is False:
            print("Can't receive frame")
            break

    if len(predictions) == 0:
        return None

    return most_frequent(predictions)

In [10]:
# Frame's Width, Height
FRAME_WIDTH = 640
FRAME_HEIGHT = 480

# Initialize webcam feed
video = cv2.VideoCapture(0)
# video = cv2.VideoCapture('/home/young/Desktop/test/5.mp4')
if not video.isOpened():
    # Raise instead of calling exit(): inside a notebook kernel exit() asks
    # the kernel to shut down and does not stop the cell at this point, so
    # the code below would still run against an unopened capture.
    video.release()
    raise RuntimeError("Cannot open video")

# Everything that uses the capture runs under try/finally so the camera and
# the display windows are released even if a frame fails to process.
try:
    video.set(cv2.CAP_PROP_FRAME_WIDTH, FRAME_WIDTH)
    video.set(cv2.CAP_PROP_FRAME_HEIGHT, FRAME_HEIGHT)
    print('Current FPS : ', video.get(cv2.CAP_PROP_FPS))

    # At most `max_detection` faces are considered; face i gets id i.
    max_detection = 10
    person_ids = list(range(max_detection))

    while True:
        ret, frame = video.read()

        if ret is False:
            print("Can't receive frame")
            break

        # Frames are converted from BGR to RGB before face detection.
        output_dict = run_inference_for_single_image(face_detector, cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        face_dict = detect_face_and_get_face_dict(image=frame,
                                                  boxes=output_dict['detection_boxes'],
                                                  classes=output_dict['detection_classes'],
                                                  scores=output_dict['detection_scores'],
                                                  track_ids=person_ids,
                                                  use_normalized_coordinates=True,
                                                  max_boxes_to_draw=max_detection,
                                                  line_thickness=8)

        hand_dict = convert_to_hand_dict(face_dict)
        # NOTE: an empty dict also passes this check, in which case
        # get_command_box just displays frames for its duration.
        if hand_dict is not None:
            # Step 1: find which hand region is showing the command sign.
            command_box = get_command_box(frame, video, hand_dict, 2)
            if command_box == -1:
                break
            # send signal to arduino : command box detected

            # Step 2: read the command shown inside that region.
            if command_box is not None:
                command = get_command(frame, video, command_box, 2)
                if command == -1:
                    break

    #             if command is not None:
                    # send signal to arduino : command

        # Press 'esc' to quit
        if cv2.waitKey(1) == 27:
            break
finally:
    # Clean up
    video.release()
    cv2.destroyAllWindows()

Current FPS :  30.0
