# Practice

Now try to use the model to detect faces in a video. Fill in the gaps in the code blocks below. For more information about the OpenVINO Inference Engine Python API, see the [official documentation](https://docs.openvinotoolkit.org/latest/ie_python_api/annotated.html).

In [1]:
!pip install -r requirements.txt



In [1]:
import os
from pathlib import Path

# Root directory that contains all data for the workshop
DATA_PATH = Path('./data')

# Directory with the models used in the workshop
WORKSHOP_MODEL_PATH = DATA_PATH / 'model'

# Inference Engine model files (.xml and .bin) of the face detector.
# You can point these to the INT8 model instead.
MODEL_PATH_XML = WORKSHOP_MODEL_PATH / 'retinaface-resnet50-pytorch.xml'
MODEL_PATH_BIN = WORKSHOP_MODEL_PATH / 'retinaface-resnet50-pytorch.bin'

# Name of the device the networks are loaded to
DEVICE = 'CPU'

# Source video and annotated result, kept as plain strings
INPUT_VIDEO, OUTPUT_VIDEO = (
    str(DATA_PATH / file_name) for file_name in ('input.mp4', 'output.MP4')
)

In [2]:
from IPython.display import HTML


# Show the source video (INPUT_VIDEO) in an HTML5 player embedded in the notebook
HTML(f"""<video width="600" height="400" controls><source src="{INPUT_VIDEO}" type="video/mp4"></video>""")

In [5]:
def prapare_out_video_stream(input_video_stream, output_video_file_path: str, fps=None):
    """Create a video writer whose frame size matches the input stream.

    Args:
        input_video_stream: Opened `cv2.VideoCapture` the frame size is read from.
        output_video_file_path: Path of the video file to write.
        fps: Frame rate of the output video. When omitted, the frame rate
            reported by the input stream is used, so the result plays at the
            same speed as the source; 20 is used if the stream reports none.

    Returns:
        A `cv2.VideoWriter` that encodes frames with the 'avc1' codec.
    """
    width = int(input_video_stream.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_video_stream.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if fps is None:
        fps = input_video_stream.get(cv2.CAP_PROP_FPS)
    # Some sources report 0 (or NaN) as the frame rate; fall back to the previous default
    if not fps > 0:
        fps = 20

    fourcc = cv2.VideoWriter_fourcc(*'avc1')
    return cv2.VideoWriter(output_video_file_path, fourcc, fps, (width, height))

In [17]:
# Import OpenCV for work with a video and images
import cv2

# Import the Inference Engine
from openvino.inference_engine import IECore, IENetwork

# Import module for process inference results
from RetinaFacePostProcessing.retinaface_post_processing import RetinaFacePostPostprocessor

import numpy as np

### Step 1: Create an instance of the OpenVINO Inference Engine `IECore` class
This class represents an Inference Engine entity 
and allows you to manipulate plugins using unified interfaces. 

In [7]:
# Inference Engine core object; it is used below to read and to load the networks
ie = IECore()

### Step 2: Read the prepared model

You need to create an instance of the IENetwork class by calling the `read_network` method of `IECore`.
This method has two parameters: 
 1. path to the .xml file of the model 
 2. path to the .bin file of the model

In [8]:
# Read the face-detection model from its .xml and .bin files.
# The model paths are `pathlib.Path` objects; pass them as plain strings,
# the same form this notebook uses for the video paths.
net = ie.read_network(str(MODEL_PATH_XML), str(MODEL_PATH_BIN))

### Step 3: Get the name of the input layer of the model

To infer a model, you need to know the input layers of the model.
The object `net` contains information about the inputs of the network in the `input_info` property,
which is a dictionary: key - name of the input layer, value - description of that input.
In this case, you need the name of the input (`input_name`, a string) and its data descriptor (`input_blob`).

In [9]:
# `input_info` is keyed by input layer name; take the first name
input_name = next(iter(net.input_info))
# Data descriptor of that input; its shape is read in the next step
input_blob = net.input_info[input_name].input_data

print(f'Input layer of the network is {input_name}')

Input layer of the network is data


### Step 4: Get shape (dimensions) of the input layer of the network

* n - number of batches
* c - number of input image channels (usually 3 - R, G and B) 
* h - height
* w - width

In [10]:
# Unpack the input dimensions: batch size, channels, height and width
n, c, input_layer_h, input_layer_w = input_blob.shape

print(f'Input shape of the network: [{n}, {c}, {input_layer_h}, {input_layer_w}]')

Input shape of the network: [1, 3, 640, 640]


In [11]:
# Name of the first output layer of the network.
# NOTE(review): the face-detection cells below pass the whole inference result to the
# post-processor and never read `out_blob` — confirm whether it is still needed.
out_blob = next(iter(net.outputs))

### Step 6: Load the network to a device

Use the instance of `IECore`.
The class `IECore` has a special function called `load_network`, which loads a network to a device.
This function prepares the network for the first inference on the device 
and returns an instance of the network prepared for an inference (execution). 
This function has many parameters, but in this case, you need to know only about two of them:
* `network` - instance of `IENetwork`
* `device_name` - string, contains a device name to infer a model on: CPU, GPU and so on.

In [12]:
# Load the face-detection network to DEVICE; `infer` is later called on the returned object
network_loaded_to_device = ie.load_network(net, DEVICE)

### Step 7: Open the input video

In [13]:
# Open the source video; frames are read from this stream in the loop below
input_video_stream = cv2.VideoCapture(INPUT_VIDEO)

### Step 8: PreProcessing 

In [18]:
def pre_process(frame: np.ndarray, batch, channels, input_layer_h, input_layer_w) -> np.ndarray:
    """Convert a video frame into the input tensor of the network.

    Args:
        frame: Frame read from the video, in HWC layout.
        batch: Batch dimension of the network input.
        channels: Channel dimension of the network input.
        input_layer_h: Height of the network input.
        input_layer_w: Width of the network input.

    Returns:
        The resized frame in CHW layout, reshaped to
        (batch, channels, input_layer_h, input_layer_w).
    """
    # OpenCV takes the target size as (width, height)
    target_size = (input_layer_w, input_layer_h)
    resized = cv2.resize(frame, target_size)

    # HWC -> CHW
    chw = np.transpose(resized, (2, 0, 1))

    return chw.reshape((batch, channels, input_layer_h, input_layer_w))

### Step 8: Create an output video stream

In [19]:
# Writer for the annotated video; its frame size is taken from the input stream
output_video_stream = prapare_out_video_stream(input_video_stream, OUTPUT_VIDEO)

### Step 9: Function for processing inference results

In [20]:
def draw_boxes_in_frame(frame, obj):
    """Draw the bounding box and the confidence label of one detection.

    The frame is modified in place. No confidence threshold is applied here:
    every detection passed in is drawn.

    Args:
        frame: Image to draw on.
        obj: Detection indexed as [xmin, ymin, xmax, ymax, confidence].
    """
    box_color = (0, 255, 0)

    # Corner coordinates of the discovered object, as integer pixels
    left, top, right, bottom = (int(obj[index]) for index in range(4))

    # Confidence is shown as a percentage with one decimal digit
    label = f'{round(obj[4] * 100, 1)}%'

    cv2.rectangle(frame, (left, top), (right, bottom), box_color, 2)

    # The label goes slightly above the top-left corner of the box
    cv2.putText(frame, label, (left, top - 7), cv2.FONT_HERSHEY_COMPLEX, 2, box_color, 2)

### Step 10: inference face detection function

In [21]:
def face_detection(frame: np.ndarray) -> np.ndarray:
    """Run the face-detection network on one pre-processed frame.

    Args:
        frame: Network input produced by `pre_process` (the callers pass its result).

    Returns:
        The detections produced by `postprocessor.process_output` from the raw
        inference result.
    """
    # Feed the frame passed by the caller, not a same-named global variable
    feed_dict = {
        input_name: frame
    }

    # Step 11:
    # To start an inference, call the `infer` function of the `network_loaded_to_device` variable
    # with the input data (a dictionary).
    inference_result = network_loaded_to_device.infer(feed_dict)

    # `inference_result` is a dictionary,
    #  where key is the name of the output layer,
    #        value is data from the blob.

    # Step 12: Convert the raw outputs to the list of discovered objects
    detected_faces = postprocessor.process_output(inference_result)
    return detected_faces

### Step 10: Loop over frames in the input video

In [23]:
frame_w = int(input_video_stream.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_h = int(input_video_stream.get(cv2.CAP_PROP_FRAME_HEIGHT))

# NOTE(review): `origin_image_size` is passed as [width, height] while `input_image_size`
# is passed as [height, width] — confirm the order RetinaFacePostPostprocessor expects.
postprocessor = RetinaFacePostPostprocessor(origin_image_size=[frame_w, frame_h], input_image_size=[input_layer_h, input_layer_w])

try:
    while input_video_stream.isOpened():
        # Read the next frame from the input video
        ret, frame = input_video_stream.read()
        # Exit from the loop if the video is over
        if not ret:
            break

        # Prepare the frame for inference; `pre_process` takes the height before the width
        in_frame = pre_process(frame, n, c, input_layer_h, input_layer_w)

        detected_faces = face_detection(in_frame)

        for detected_face in detected_faces:
            draw_boxes_in_frame(frame, detected_face)

        # Write the resulting frame to the output stream
        output_video_stream.write(frame)
finally:
    # Release both streams even if processing fails, so the output file is finalized
    input_video_stream.release()
    # Save the resulting video
    output_video_stream.release()

In [24]:
from IPython.display import HTML

# Show the resulting video (OUTPUT_VIDEO) written by the loop above
HTML(f"""<video width="600" height="400" controls><source src="{OUTPUT_VIDEO}" type="video/mp4"></video>""")

Do you see boxes in the video? 
If yes, you did everything right!
**Good Work!** 

## Section 16: Practice (Part 2)

What is the next step? Neural networks are often combined into pipelines, where the results of the first neural network are used as the input for the next neural network. 
Let's try to build a pipeline from two networks: the first finds a person in the video and the second recognizes the emotions of this person.

We have already run the first network and found the person in the video.
The next step is to find a network for emotion recognition.
There is a good neural network in the [OpenModelZOO](https://docs.openvinotoolkit.org/2019_R1/_docs_Pre_Trained_Models.html) - [emotions-recognition-retail-0003 network](https://docs.openvinotoolkit.org/2019_R1/_emotions_recognition_retail_0003_description_emotions_recognition_retail_0003.html)

### Step 1: Download emotions-recognition-retail-0003 network
Run the Model Downloader with the needed arguments to download the emotions-recognition-retail-0003 network:

In [23]:
!python3 ${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader/downloader.py --name emotions-recognition-retail-0003 --precision FP16 --output_dir data/model

################|| Downloading emotions-recognition-retail-0003 ||################

... 100%, 39 KB, 442 KB/s, 0 seconds passed

... 100%, 4848 KB, 3095 KB/s, 1 seconds passed



This model is already in the OpenVINO format, so you do not need to convert it.

After downloading the model you can use it:

### Step 2: Read the prepared model
The IENetwork class is designed to work with a model in the Inference Engine. This class contains information about the network model read from the Intermediate Representation and allows you to manipulate some model parameters such as layers affinity and output layers.

You need to create an instance of the IENetwork class by calling the `read_network` method of `IECore`. This method has two parameters:

path to the .xml file of the model
path to the .bin file of the model

In [26]:
# The .xml and .bin files of the downloaded model share one path and differ only by extension
emotion_model_stem = 'data/model/intel/emotions-recognition-retail-0003/FP16/emotions-recognition-retail-0003'
emotion_recognition_network = ie.read_network(
    emotion_model_stem + '.xml',
    emotion_model_stem + '.bin',
)

### Step 3: Load the network to a device

Use the instance of `IECore`.
The class `IECore` has a special function called `load_network`, which loads a network to a device.
This function prepares the network for the first inference on the device 
and returns an instance of the network prepared for an inference (execution). 
This function has many parameters, but in this case, you need to know only about two of them:
* `network` - instance of `IENetwork`
* `device_name` - string, contains a device name to infer a model on: CPU, GPU and so on.

In [27]:
# Load the emotion-recognition network to the same device as the face detector
# (DEVICE instead of a second hard-coded device name)
emotion_recognition_network_loaded_on_device = ie.load_network(emotion_recognition_network, DEVICE)

### Step 4: Open the input video

In [64]:
# Open the source video again: the capture from the first part was released after its loop
input_video_stream = cv2.VideoCapture(INPUT_VIDEO)

### Step 5: Create an output video stream

In [65]:
# Writer for the second pass; it writes to the same OUTPUT_VIDEO path as the first part
out = prapare_out_video_stream(input_video_stream, OUTPUT_VIDEO)

### Step 6: Prepare a frame and run inference

In [62]:
def emotion_infer(face):
    """Run the emotion-recognition network on a cropped face image.

    Args:
        face: Face region cut out of a video frame, in HWC layout.

    Returns:
        Data of the first output layer of the emotion-recognition network.
        For an empty crop no inference is run and an all-zero array with the
        shape of that output is returned instead.
    """
    # Find the input and the output of the model
    em_input_layer = next(iter(emotion_recognition_network.input_info))
    em_input_blob = emotion_recognition_network.input_info[em_input_layer].input_data
    em_output_layer = next(iter(emotion_recognition_network.outputs))

    # An empty crop cannot be resized; report "no scores" rather than fail inside OpenCV
    if face is None or face.size == 0:
        output_shape = emotion_recognition_network.outputs[em_output_layer].shape
        return np.zeros(output_shape, dtype=np.float32)

    # Get input shape of the network
    n, c, h, w = em_input_blob.shape

    # Resize the face (not the whole frame) to the network input;
    # OpenCV takes the target size as (width, height)
    em_in_frame = cv2.resize(face, (w, h))
    # Change the data layout from HWC to CHW before reshaping, as `pre_process` does
    em_in_frame = em_in_frame.transpose((2, 0, 1))
    # Reshape the face to the network input
    em_in_frame = em_in_frame.reshape((n, c, h, w))

    # Run the inference the same way as for the face-detection network
    em_results = emotion_recognition_network_loaded_on_device.infer({
        em_input_layer: em_in_frame
    })
    # To understand the inference result of this model, check the documentation
    # https://docs.openvinotoolkit.org/latest/_models_intel_emotions_recognition_retail_0003_description_emotions_recognition_retail_0003.html
    return em_results[em_output_layer]

### Step 16: Draw boxes and emotions in a frame

In [51]:
def get_emotion(emotion_inference_result: np.ndarray) -> str:
    """Return the label of the emotion with the highest score.

    Args:
        emotion_inference_result: Scores of the emotion classes in any shape;
            they are flattened before the maximum is searched.

    Returns:
        One of 'neutral', 'happy', 'sad', 'surprise', 'anger'.
    """
    labels = ('neutral', 'happy', 'sad', 'surprise', 'anger')
    scores = emotion_inference_result.flatten()
    return labels[scores.argmax()]

In [54]:
def postpprocess(frame, detected_face, emotion_result):
    """Draw the bounding box, the confidence and the emotion of one face.

    The frame is modified in place.

    Args:
        frame: Image to draw on.
        detected_face: Detection indexed as [xmin, ymin, xmax, ymax, confidence].
        emotion_result: Output of the emotion-recognition network for this face.
    """
    draw_boxes_in_frame(frame, detected_face)
    emotion = get_emotion(emotion_result)

    # Take the label position from the detection itself instead of relying on
    # `xmin`/`ymin` globals left behind by the calling loop
    xmin = int(detected_face[0])
    ymin = int(detected_face[1])

    # Put the title to a frame
    # NOTE(review): this label starts only 14 px below the confidence label drawn by
    # `draw_boxes_in_frame` at the same font scale — check that the two do not overlap.
    cv2.putText(frame, emotion, (xmin, ymin + 7), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)

### Step 17: Loop over frames in the input video

In [66]:
import numpy as np

try:
    while input_video_stream.isOpened():
        # Read the next frame from the input video
        ret, frame = input_video_stream.read()
        # Exit from the loop if the video is over
        if not ret:
            break

        frame_height, frame_width = frame.shape[:2]

        # `pre_process` takes the height before the width
        in_frame = pre_process(frame, n, c, input_layer_h, input_layer_w)

        detected_faces = face_detection(in_frame)
        for detected_face in detected_faces:
            xmin = int(detected_face[0])
            ymin = int(detected_face[1])

            xmax = int(detected_face[2])
            ymax = int(detected_face[3])

            # Clamp the box to the frame: a negative index would wrap around in a slice
            left, right = max(xmin, 0), min(xmax, frame_width)
            top, bottom = max(ymin, 0), min(ymax, frame_height)
            if right <= left or bottom <= top:
                # Nothing of this box lies inside the frame, so there is no face to classify
                continue

            # Images are indexed as [row, column], i.e. [y, x], and a slice ends at the
            # max coordinate itself, not at the box size
            face = frame[top:bottom, left:right]

            emotion_recognition_result = emotion_infer(face)
            postpprocess(frame, detected_face, emotion_recognition_result)

        # Write the resulting frame to the output stream
        out.write(frame)
finally:
    # Release both streams even if processing fails, so the output file is finalized
    input_video_stream.release()
    # Save the resulting video
    out.release()

Now the person (Artyom) on the resulting video will be detected with emotion:

In [67]:
# Show the resulting video (OUTPUT_VIDEO) with the detected faces and emotions
HTML(f"""<video width="600" height="400" controls><source src="{OUTPUT_VIDEO}" type="video/mp4"></video>""")

![](pictures/thankyou.PNG)