### 1. main.py

In [36]:
%%writefile main.py

import time
import numpy as np
import cv2
#import os

from argparse import ArgumentParser

from input_feeder import InputFeeder
#from mouse_controller import MouseController

from face_detection import ModelFaceDetection
from head_pose_estimation import ModelHeadPoseEstimation
from facial_landmarks_detection import ModelFacialLandmarksDetection
from gaze_estimation import ModelGazeEstimation

def main(args):
    '''
    Run the gaze estimation pipeline on a video and save the annotated frames.

    The four models are loaded first. Every frame yielded by the input feeder
    then goes through face detection, head pose estimation, facial landmarks
    detection and gaze estimation; the gaze direction is drawn from each eye
    centre and the frame is written to ``output_video.mp4``.

    When the run is over, three values are printed, one per line: the total
    inference time in seconds (rounded to one decimal), the processed frames
    per second, and the total model load time in seconds.

    args: parsed command line namespace providing model_face_detection,
          model_head_pose_estimation, model_facial_landmarks_detection,
          model_gaze_estimation, device, threshold and input_path.
    '''
    start_model_load_time = time.time()

    # load models
    class_face_detection = ModelFaceDetection(args.model_face_detection, args.device, args.threshold)
    class_face_detection.load_model()

    class_head_pose_estimation = ModelHeadPoseEstimation(args.model_head_pose_estimation, args.device)
    class_head_pose_estimation.load_model()

    class_facial_landmarks_detection = ModelFacialLandmarksDetection(args.model_facial_landmarks_detection, args.device)
    class_facial_landmarks_detection.load_model()

    class_gaze_estimation = ModelGazeEstimation(args.model_gaze_estimation, args.device)
    class_gaze_estimation.load_model()

    total_model_load_time = time.time() - start_model_load_time

    # input video
    feed = InputFeeder(input_type='video', input_file=args.input_path)
    feed.load_data()

    # The source frame rate is reported by the feeder but the writer below
    # uses a fixed 10 fps, so it is deliberately left unused here.
    initial_w, initial_h, _initial_fps = feed.get_info()

    counter = 0
    start_inference_time = time.time()

    out_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 10, (initial_w, initial_h), True)

    class_face_detection.initial_size(initial_w, initial_h)

    try:
        for flag, batch in feed.next_batch():
            if not flag:
                break

            counter += 1

            # face_detection
            cropped_face = class_face_detection.predict(batch)

            # Skip the remaining stages when there is no usable face crop;
            # the frame is still written so the output keeps every frame.
            if cropped_face is not None and cropped_face.size > 0:
                # head_pose_estimation
                head_pose_angles = class_head_pose_estimation.predict(cropped_face)

                # facial_landmarks_detection
                left_eye_image, right_eye_image, left_eye_center, right_eye_center = \
                    class_facial_landmarks_detection.predict(cropped_face)

                # An eye crop can be empty when the landmark falls outside
                # the face crop; the gaze model cannot resize an empty image.
                if left_eye_image.size > 0 and right_eye_image.size > 0:
                    # gaze_estimation
                    # NOTE: x and y are the values a MouseController.move(x, y)
                    # call would consume; they are not used for drawing.
                    x, y, gaze_vector = class_gaze_estimation.predict(left_eye_image, right_eye_image, head_pose_angles)

                    # Image y grows downwards, hence the minus on the y term.
                    for eye_center in (left_eye_center, right_eye_center):
                        gaze_end = (int(eye_center[0] + gaze_vector[0] * 100),
                                    int(eye_center[1] - gaze_vector[1] * 100))
                        cv2.line(cropped_face, eye_center, gaze_end, (255, 255, 255), 2)

            out_video.write(batch)
    finally:
        # Release the writer so the container is finalised even on errors.
        out_video.release()
        feed.close()
        cv2.destroyAllWindows()

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    # A very short (or empty) run rounds to 0.0 seconds; avoid dividing by it.
    fps = counter / total_inference_time if total_inference_time > 0 else 0.0

    print(total_inference_time)
    print(fps)
    print(total_model_load_time)

if __name__ == '__main__':
    # Command line entry point: collect the model paths and run options,
    # then hand them to main().
    parser = ArgumentParser(description="Gaze estimation pipeline on a video file.")

    # Each model argument is a path prefix: the model classes append
    # '.xml' and '.bin' to it themselves.
    parser.add_argument("-mfd", "--model_face_detection", required=True,
                        help="Path to the face detection model, without file extension.")
    parser.add_argument("-mhpe", "--model_head_pose_estimation", required=True,
                        help="Path to the head pose estimation model, without file extension.")
    parser.add_argument("-mfld", "--model_facial_landmarks_detection", required=True,
                        help="Path to the facial landmarks detection model, without file extension.")
    parser.add_argument("-mge", "--model_gaze_estimation", required=True,
                        help="Path to the gaze estimation model, without file extension.")

    parser.add_argument("-d", "--device", default="CPU",
                        help="Device name passed to the inference engine (default: CPU).")
    # type=float: without it a threshold given on the command line would be
    # passed on as a string while the default stays a float.
    parser.add_argument("-t", "--threshold", type=float, default=0.9,
                        help="Minimum face detection confidence (default: 0.9).")
    parser.add_argument("-i", "--input_path", required=True,
                        help="Path to the input video file.")

    args = parser.parse_args()
    main(args)

Overwriting main.py


### 2. input_feeder.py

In [3]:
%%writefile input_feeder.py

'''
This class can be used to feed input from an image, webcam, or video to your model.
Sample usage:
    feed=InputFeeder(input_type='video', input_file='video.mp4')
    feed.load_data()
    for flag, batch in feed.next_batch():
        if not flag:
            break
        do_something(batch)
    feed.close()
'''
import cv2
from numpy import ndarray

class InputFeeder:
    '''
    Feeds frames from a video file, a webcam or a single image file.
    '''
    def __init__(self, input_type, input_file=None):
        '''
        input_type: str, The type of input. Can be 'video' for video file, 'image' for image file,
                    or 'cam' to use webcam feed.
        input_file: str, The file that contains the input image or video file. Leave empty for cam input_type.
        '''
        self.input_type = input_type
        self.input_file = input_file
        # Holds a cv2.VideoCapture for 'video'/'cam', or the decoded image
        # array for 'image'. Set by load_data().
        self.cap = None

    def load_data(self):
        '''
        Opens the input source selected by input_type.

        Raises ValueError for an unknown input_type, FileNotFoundError when
        the image cannot be read and IOError when the video file or webcam
        cannot be opened.
        '''
        if self.input_type == 'video':
            self.cap = cv2.VideoCapture(self.input_file)
        elif self.input_type == 'cam':
            self.cap = cv2.VideoCapture(0)
        elif self.input_type == 'image':
            self.cap = cv2.imread(self.input_file)
            # cv2.imread signals failure by returning None instead of raising.
            if self.cap is None:
                raise FileNotFoundError("Cannot read image file: {}".format(self.input_file))
            return
        else:
            raise ValueError("Unknown input_type: {!r} (expected 'video', 'image' or 'cam')".format(self.input_type))

        if not self.cap.isOpened():
            raise IOError("Cannot open input source: {}".format(
                self.input_file if self.input_type == 'video' else 'webcam 0'))

    def next_batch(self, frame_stride=10):
        '''
        Generator yielding (flag, frame) tuples.

        For a video file or webcam, frame_stride frames are read per step and
        only the last one is yielded, i.e. every frame_stride-th frame is
        delivered. When a read fails (end of stream), (False, frame) is
        yielded once and the generator stops.
        If input_type is 'image', the loaded image is yielded once as
        (True, image).

        frame_stride: int, number of frames read per yielded frame (>= 1).
        '''
        if frame_stride < 1:
            raise ValueError("frame_stride must be at least 1")

        if self.input_type == 'image':
            yield True, self.cap
            return

        while True:
            for _ in range(frame_stride):
                flag, frame = self.cap.read()
                if not flag:
                    # Nothing more to read; do not keep polling the source.
                    break
            yield flag, frame
            if not flag:
                return

    def get_info(self):
        '''
        Returns (width, height, fps) of the input as integers.
        For an image the size comes from the array shape and fps is 0.
        '''
        if self.input_type == 'image':
            image_h, image_w = self.cap.shape[:2]
            return int(image_w), int(image_h), 0

        initial_w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        initial_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
        return initial_w, initial_h, initial_fps

    def close(self):
        '''
        Closes the VideoCapture. Does nothing for image input or when no
        source has been opened.
        '''
        if self.input_type != 'image' and self.cap is not None:
            self.cap.release()

Overwriting input_feeder.py


### 3. face_detection.py

In [30]:
%%writefile face_detection.py

'''
This is a sample class for a model. You may choose to use it as-is or make any changes to it.
This has been provided just to give you an idea of how to structure your model class.
'''
import cv2

from openvino.inference_engine import IENetwork, IECore

class ModelFaceDetection:
    '''
    Class for the Face Detection Model.

    Runs the detector on a frame, draws every accepted detection on the frame
    and returns the crop of the most confident face.
    '''
    def __init__(self, model_name, device='CPU', threshold=0.9):
        '''
        model_name: str, path of the model files without the '.xml'/'.bin' extension.
        device: str, device name the network is loaded on.
        threshold: minimum confidence for a detection to be accepted.
        '''
        self.model_weights = model_name+'.bin'
        self.model_structure = model_name+'.xml'
        self.device = device
        # float(): the value may arrive as a string from the command line.
        self.threshold = float(threshold)

        # Frame size used to scale the normalised box coordinates. Optional:
        # when initial_size() is never called the size of the image passed to
        # preprocess_output() is used instead.
        self.width = None
        self.height = None

        self.core = IECore()
        self.model = self.core.read_network(model=self.model_structure, weights=self.model_weights)

        self.input_name = next(iter(self.model.input_info))
        self.input_shape = self.model.inputs[self.input_name].shape
        self.output_name = next(iter(self.model.outputs))
        self.output_shape = self.model.outputs[self.output_name].shape

    def load_model(self):
        '''
        Loads the network read in __init__ onto the configured device.
        '''
        self.net = self.core.load_network(network=self.model, device_name=self.device, num_requests=1)

    def predict(self, image):
        '''
        Runs face detection on the image.

        Returns the cropped face as a view into image (so drawing on the crop
        also draws on image), or None when no detection passes the threshold.
        The image is annotated in place with the accepted boxes.
        '''
        frame = self.preprocess_input(image)
        outputs = self.net.infer({self.input_name:frame})
        cropped_face = self.preprocess_output(image, outputs[self.output_name])

        return cropped_face

    def check_model(self):
        pass

    def preprocess_input(self, image):
        '''
        Resizes the image to the network input size and reorders it from
        HxWxC to 1xCxHxW.
        '''
        # input shape: BxCxHxW
        height = self.input_shape[2]
        width = self.input_shape[3]

        image = cv2.resize(image, (width, height))
        image = image.transpose((2,0,1))
        image = image.reshape(1, 3, height, width)

        return image

    def _select_boxes(self, detections, width, height):
        '''
        Keeps the detections whose confidence reaches the threshold.

        detections: iterable of rows indexed as row[2] = confidence and
                    row[3:7] = xmin, ymin, xmax, ymax normalised to [0, 1].
        Returns a list of (confidence, (xmin, ymin, xmax, ymax)) in pixels,
        clamped to the width x height frame. Boxes that are empty after
        clamping are dropped.
        '''
        boxes = []
        for detection in detections:
            confidence = float(detection[2])
            if confidence < self.threshold:
                continue

            # Clamp: a slightly negative coordinate would otherwise be taken
            # as a "from the end" index when slicing the image.
            xmin = min(max(int(detection[3] * width), 0), width)
            ymin = min(max(int(detection[4] * height), 0), height)
            xmax = min(max(int(detection[5] * width), 0), width)
            ymax = min(max(int(detection[6] * height), 0), height)

            if xmax <= xmin or ymax <= ymin:
                continue

            boxes.append((confidence, (xmin, ymin, xmax, ymax)))

        return boxes

    def preprocess_output(self, image, outputs):
        '''
        Converts the raw detections into a face crop.

        image: the frame the detections refer to; annotated in place.
        outputs: raw network output, indexed as outputs[0][0] -> detection rows.
        Returns the crop of the highest-confidence accepted detection, or None
        when there is none.
        '''
        frame_height, frame_width = image.shape[:2]
        width = self.width if self.width else frame_width
        height = self.height if self.height else frame_height

        # output shape: [1, 1, N, 7]
        boxes = self._select_boxes(outputs[0][0], width, height)
        if not boxes:
            return None

        _, (xmin, ymin, xmax, ymax) = max(boxes, key=lambda item: item[0])
        # Deliberately a view, not a copy: callers draw on the crop to
        # annotate the full frame.
        cropped_face = image[ymin:ymax, xmin:xmax]

        for _, (box_xmin, box_ymin, box_xmax, box_ymax) in boxes:
            cv2.rectangle(image, (box_xmin, box_ymin), (box_xmax, box_ymax), (0, 255, 0), 1)

        return cropped_face

    def initial_size(self, width, height):
        '''
        Stores the frame size used to scale the normalised box coordinates.
        '''
        self.width = width
        self.height = height

Overwriting face_detection.py


In [7]:
### 4. head_pose_estimation.py 

In [31]:
%%writefile head_pose_estimation.py

'''
This is a sample class for a model. You may choose to use it as-is or make any changes to it.
This has been provided just to give you an idea of how to structure your model class.
'''
import cv2

from openvino.inference_engine import IENetwork, IECore

class ModelHeadPoseEstimation:
    '''
    Class for the Head Pose Estimation Model.
    '''
    def __init__(self, model_name, device='CPU'):
        '''
        model_name: str, path of the model files without the '.xml'/'.bin' extension.
        device: str, device name the network is loaded on.
        '''
        self.model_weights = model_name + '.bin'
        self.model_structure = model_name + '.xml'
        self.device = device

        self.core = IECore()
        self.model = self.core.read_network(model=self.model_structure, weights=self.model_weights)

        # Only the first input / first output are recorded here.
        self.input_name = next(iter(self.model.inputs))
        self.input_shape = self.model.inputs[self.input_name].shape
        self.output_name = next(iter(self.model.outputs))
        self.output_shape = self.model.outputs[self.output_name].shape

    def load_model(self):
        '''
        Loads the network read in __init__ onto the configured device.
        '''
        self.net = self.core.load_network(network=self.model, device_name=self.device, num_requests=1)

    def predict(self, image):
        '''
        Runs head pose estimation on the image and returns the list
        [yaw, pitch, roll] produced by preprocess_output.
        '''
        blob = self.preprocess_input(image)
        raw_outputs = self.net.infer({self.input_name: blob})
        return self.preprocess_output(raw_outputs)

    def check_model(self):
        pass

    def preprocess_input(self, image):
        '''
        Resizes the image to the network input size and reorders it from
        HxWxC to 1xCxHxW.
        '''
        # input shape: 1xCxHxW
        height, width = self.input_shape[2], self.input_shape[3]

        resized = cv2.resize(image, (width, height))
        return resized.transpose((2, 0, 1)).reshape(1, 3, height, width)

    def preprocess_output(self, outputs):
        '''
        Picks the three angle values out of the raw network outputs.

        outputs: mapping with the keys 'angle_y_fc', 'angle_p_fc' and
                 'angle_r_fc', each indexed as [0][0].
        Returns a list in the order yaw, pitch, roll.
        '''
        # angle_y_fc: [1, 1] - Estimated yaw
        # angle_p_fc: [1, 1] - Estimated pitch
        # angle_r_fc: [1, 1] - Estimated roll
        angle_keys = ('angle_y_fc', 'angle_p_fc', 'angle_r_fc')
        return [outputs[key][0][0] for key in angle_keys]

Overwriting head_pose_estimation.py


In [9]:
### 5. facial_landmarks_detection.py 

In [32]:
%%writefile facial_landmarks_detection.py

'''
This is a sample class for a model. You may choose to use it as-is or make any changes to it.
This has been provided just to give you an idea of how to structure your model class.
'''
import cv2

from openvino.inference_engine import IENetwork, IECore

class ModelFacialLandmarksDetection:
    '''
    Class for the Facial Landmarks Detection Model.

    Locates the two eyes on a face crop and returns a square crop around
    each of them together with the eye centres.
    '''
    def __init__(self, model_name, device='CPU'):
        '''
        model_name: str, path of the model files without the '.xml'/'.bin' extension.
        device: str, device name the network is loaded on.
        '''
        self.model_weights=model_name+'.bin'
        self.model_structure=model_name+'.xml'
        self.device=device

        self.core = IECore()
        self.model = self.core.read_network(model=self.model_structure, weights=self.model_weights)

        self.input_name = next(iter(self.model.inputs))
        self.input_shape = self.model.inputs[self.input_name].shape
        self.output_name = next(iter(self.model.outputs))
        self.output_shape = self.model.outputs[self.output_name].shape

    def load_model(self):
        '''
        Loads the network read in __init__ onto the configured device.
        '''
        self.net = self.core.load_network(network=self.model, device_name=self.device, num_requests=1)

    def predict(self, image):
        '''
        Runs landmark detection on the face image.

        Returns (left_eye_image, right_eye_image, left_eye_center,
        right_eye_center); see preprocess_output.
        '''
        frame = self.preprocess_input(image)
        outputs = self.net.infer({self.input_name:frame})

        return self.preprocess_output(image, outputs[self.output_name][0])

    def check_model(self):
        pass

    def preprocess_input(self, image):
        '''
        Resizes the image to the network input size and reorders it from
        HxWxC to 1xCxHxW.
        '''
        # input shape: BxCxHxW
        height = self.input_shape[2]
        width = self.input_shape[3]

        image = cv2.resize(image, (width, height))
        image = image.transpose((2,0,1))
        image = image.reshape(1, 3, height, width)

        return image

    @staticmethod
    def _eye_box(norm_x, norm_y, width, height, half_size=20):
        '''
        Computes the pixel box around one eye landmark.

        norm_x, norm_y: landmark position as fractions of width and height.
        half_size: half of the box side in pixels.
        Returns (center, box): center is (x, y) of the unclamped box and
        box is (xmin, ymin, xmax, ymax) clamped to the image bounds.
        '''
        raw_xmin = int(norm_x * width - half_size)
        raw_xmax = int(norm_x * width + half_size)
        raw_ymin = int(norm_y * height - half_size)
        raw_ymax = int(norm_y * height + half_size)

        # The centre is taken from the unclamped box so it stays on the
        # landmark even when the box is cut off at the image border.
        center = int((raw_xmin + raw_xmax) / 2), int((raw_ymin + raw_ymax) / 2)

        # Clamp: a negative start index would otherwise count from the end
        # of the axis when slicing.
        box = (min(max(raw_xmin, 0), width),
               min(max(raw_ymin, 0), height),
               min(max(raw_xmax, 0), width),
               min(max(raw_ymax, 0), height))

        return center, box

    def preprocess_output(self, image, outputs):
        '''
        Turns the landmark coordinates into eye crops and eye centres.

        image: the face image; annotated in place with one rectangle per eye.
        outputs: landmark values indexed as outputs[i][0][0], with
                 i = 0, 1 the left eye x, y and i = 2, 3 the right eye x, y,
                 normalised to the image size.
        Returns (left_eye_image, right_eye_image, left_eye_center,
        right_eye_center). The eye images are copies, so the rectangles drawn
        on image do not show up in them; an eye image may be empty when the
        landmark lies outside the image.
        '''
        # output shape: [1, 10], containing a row-vector of 10 floating point values for five landmarks coordinates
        height, width = image.shape[:2]

        left_eye_center, left_box = self._eye_box(outputs[0][0][0], outputs[1][0][0], width, height)
        right_eye_center, right_box = self._eye_box(outputs[2][0][0], outputs[3][0][0], width, height)

        # Crop (and copy) both eyes before drawing anything on the image.
        left_eye_image = image[left_box[1]:left_box[3], left_box[0]:left_box[2]].copy()
        right_eye_image = image[right_box[1]:right_box[3], right_box[0]:right_box[2]].copy()

        for xmin, ymin, xmax, ymax in (left_box, right_box):
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 0, 255), 1)

        return left_eye_image, right_eye_image, left_eye_center, right_eye_center

Overwriting facial_landmarks_detection.py


In [11]:
### 6. gaze_estimation.py 

In [33]:
%%writefile gaze_estimation.py

'''
This is a sample class for a model. You may choose to use it as-is or make any changes to it.
This has been provided just to give you an idea of how to structure your model class.
'''
import cv2
import math

from openvino.inference_engine import IENetwork, IECore

class ModelGazeEstimation:
    '''
    Class for the Gaze Estimation Model.

    Takes the two eye crops and the head pose angles and returns the gaze
    vector together with its x/y components compensated for head roll.
    '''
    def __init__(self, model_name, device='CPU'):
        '''
        model_name: str, path of the model files without the '.xml'/'.bin' extension.
        device: str, device name the network is loaded on.
        '''
        self.model_weights=model_name+'.bin'
        self.model_structure=model_name+'.xml'
        self.device=device

        self.core = IECore()
        self.model = self.core.read_network(model=self.model_structure, weights=self.model_weights)

        self.input_name = [i for i in self.model.inputs.keys()]

        # The eye image shape is looked up by name rather than by position in
        # input_name, so it does not depend on the order of the inputs.
        # predict() feeds the network through the same input names.
        self.input_shape=self.model.inputs['left_eye_image'].shape
        self.output_name=next(iter(self.model.outputs))
        self.output_shape=self.model.outputs[self.output_name].shape

    def load_model(self):
        '''
        Loads the network read in __init__ onto the configured device.
        '''
        self.net = self.core.load_network(network=self.model, device_name=self.device, num_requests=1)

    def predict(self, left_eye_image, right_eye_image, head_pose_angles):
        '''
        Runs gaze estimation.

        left_eye_image, right_eye_image: eye crops (HxWxC images).
        head_pose_angles: sequence whose index 2 is the roll angle in
                          degrees; passed to the network unchanged.
        Returns (x, y, gaze_vector); see preprocess_output.
        '''
        left_eye_frame = self.preprocess_input(left_eye_image)
        right_eye_frame = self.preprocess_input(right_eye_image)

        net_input = {'left_eye_image': left_eye_frame, 'right_eye_image': right_eye_frame, 'head_pose_angles': head_pose_angles}
        outputs = self.net.infer(inputs=net_input)

        return self.preprocess_output(outputs, head_pose_angles)

    def check_model(self):
        pass

    def preprocess_input(self, image):
        '''
        Resizes one eye image to the network input size and reorders it from
        HxWxC to 1xCxHxW.
        '''
        # head_pose_angles: BxC
        # left_eye_image: BxCxHxW
        # right_eye_image: BxCxHxW

        height = self.input_shape[2]
        width = self.input_shape[3]

        image = cv2.resize(image, (width, height))
        image = image.transpose((2,0,1))
        image = image.reshape(1, 3, height, width)

        return image

    def preprocess_output(self, outputs, head_pose_angles):
        '''
        Extracts the gaze vector and rotates its x/y part by the head roll.

        outputs: mapping from output name to the raw network output; the
                 first row of outputs[self.output_name] is the gaze vector.
        head_pose_angles: sequence whose index 2 is the roll angle in degrees.
        Returns (x, y, gaze_vector) where x and y are the first two gaze
        components rotated by the roll angle and gaze_vector is unmodified.
        '''
        gaze_vector = outputs[self.output_name][0]

        roll = math.radians(head_pose_angles[2])
        r_cos = math.cos(roll)
        r_sin = math.sin(roll)

        x = gaze_vector[0] * r_cos + gaze_vector[1] * r_sin
        y = -gaze_vector[0] * r_sin + gaze_vector[1] * r_cos

        return x, y, gaze_vector

Overwriting gaze_estimation.py


In [13]:
### 7. mouse_controller.py 

In [14]:
%%writefile mouse_controller.py

'''
This is a sample class that you can use to control the mouse pointer.
It uses the pyautogui library. You can set the precision for mouse movement
(how much the mouse moves) and the speed (how fast it moves) by changing 
precision_dict and speed_dict.
Calling the move function with the x and y output of the gaze estimation model
will move the pointer.
This class is provided to help get you started; you can choose whether you want to use it or create your own from scratch.
'''
import pyautogui

class MouseController:
    '''
    Moves the mouse pointer relative to its current position.
    '''
    def __init__(self, precision, speed):
        '''
        precision: str, one of 'high', 'low' or 'medium'; selects the factor
                   the x/y inputs are multiplied by.
        speed: str, one of 'fast', 'slow' or 'medium'; selects the duration
               passed to pyautogui for each move.
        '''
        movement_scale = dict(high=100, low=1000, medium=500)
        move_duration = dict(fast=1, slow=10, medium=5)

        self.precision = movement_scale[precision]
        self.speed = move_duration[speed]

        # Turn off pyautogui's fail-safe.
        pyautogui.FAILSAFE = 0

    def move(self, x, y):
        '''
        Moves the pointer by x scaled by the precision factor horizontally
        and by y scaled and negated vertically.
        '''
        delta_x = x*self.precision
        delta_y = -1*y*self.precision
        pyautogui.moveRel(delta_x, delta_y, duration=self.speed)

Writing mouse_controller.py
