# Testing on a General Video
The trained model is used to test the mean heart rate in every 10-second-window of a video file in its first 15 seconds.
### Pre-setting
- Import relevant modules
- Define preconfigured global variables

In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.dataset import random_split

file = './general_video.avi'
FPS = 61
WINDOW_LEN = FPS * 10 # frame number for 10-second video clips
time_len = 15 * FPS

### Extract the ROI sequence from frames of the general video
- detect frontal face bounding box
- Align the bounding box using the predicted facial landmarks
- Crop the bounding box and obtain test data: (time_len, 3, 256, 256)

In [2]:
import cv2
import dlib
from imutils import face_utils

def extract_bd_box_from_video(video):
    vc = cv2.VideoCapture(video)
    rois = []
    idx = 0
    rval, frame = vc.read()

    while rval and idx < time_len:
        detector = dlib.get_frontal_face_detector()
        faces = detector(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), 0)

        if len(faces) == 0 or len(faces) > 1:
            print("{0} faces are detected".format(len(faces)))
            rect = last_rect
        else:
            '''Narrow down the bounding box'''
            rect = faces[0]
        last_rect = rect

        '''Align the bounding box using the predicted facial landmarks'''
        predictor = dlib.shape_predictor("dataset/shape_predictor_68_face_landmarks.dat")
        bd_box = face_utils.FaceAligner(predictor, desiredFaceWidth=256).align(frame, cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), rect) # bd_box.shape = (256, 256, 3)

        '''Crop the bounding box'''
        h, w, c = bd_box.shape
        h_percent = 0.7
        w_percent = 0.7
        cut_x = int(w * (1 - w_percent) * 0.5)
        cut_y = int(h * (1 - h_percent) * 0.5)
        bd_box = bd_box[cut_y: h - cut_y, cut_x: w - cut_x]
        if idx % 100 == 0:
            cv2.imwrite("../results/boxes/" + str(idx) + '.jpg', bd_box) # save the image locally

        rois.append(bd_box.tolist())
        rval, frame = vc.read()
        idx += 1
    return rois

print("Start to extract roi sequence for file:", file)
sequence = extract_bd_box_from_video(file)
print("The roi sequence for file ", file, " has been extracted")

Start to extract roi sequence for file: ./general_video.avi
The roi sequence for file  ./general_video.avi  has been extracted


### Construct the testing dataset 
- Pre-process the data of extracted ROI sequence and obtain samples:   
resize -> bandpath filter -> apply sliding window strategy to construct samples in shape (sample_num, window_len, c=3, w=36, h=36)
- Construct the testing dataset:   
    **Frame input (x_appearance)**: the original content of the resized ROI sequence, which is used in the attention mechanism.  
    **difference input (x_motion)**: the time-domain discrete derivative of the resized ROI sequence, which is the mainsource of pulse information in the 3D-CNN network. 

In [3]:
from scipy.signal import filtfilt
from pre_processing.utils import build_bandpass_filter

order = 128
b = build_bandpass_filter(FPS, order, False)

sequence = np.transpose(sequence, (0, 3, 1, 2))
avg_pool = nn.AdaptiveAvgPool2d((36, 36))
sequence = avg_pool(torch.from_numpy(sequence).float())
_, _, w, h = sequence.size()
for i in range(w):
    for j in range(h):
        for c in range(3):
            sequence[:, c, i, j] = torch.from_numpy(filtfilt(b, np.array([1]), sequence[:, c, i, j], axis=0).copy())

sequence = sequence.tolist()
total_length = len(sequence)
x_appearance = np.array([sequence[i: i + WINDOW_LEN] 
                         for i in range(0, total_length - WINDOW_LEN, FPS)])
x_motion = []
for window in x_appearance:
    t = window[1] - window[0]
    diff = [window[i] - window[i-1] for i in range(1, len(window))]
    x_motion.append(diff)
x_appearance = np.transpose(x_appearance[:, :-1, :, :, :], (0, 2, 1, 3, 4))
x_motion = np.transpose(np.array(x_motion), (0, 2, 1, 3, 4))
# print(x_appearance.shape, x_motion.shape)
x_tensor = torch.from_numpy(np.stack((x_motion, x_appearance), axis=1)).float()
print(x_tensor.size())

torch.Size([5, 2, 3, 609, 36, 36])


### Model Testing
- Predict the mean heart rate values in every 10-second-window using the trained model on the testing dataset

In [4]:
print(x_tensor.size())

def test_accuracy(model, x_tensor):
    model.eval()
    with torch.no_grad():
        predictions_list = []
#         targets_list = []
        for sequence in x_tensor:
            sequence = sequence.unsqueeze(0)
#             print(sequence.size())
            predictions = model(sequence)
            predictions = predictions.numpy().reshape(-1)
            predictions_list.extend(predictions.tolist())
#         draw_picture(predictions_list, targets_list)
        print(predictions_list)
    return predictions_list
#         print(targets_list)

from models.cnn3d import CNN
model = CNN()
model.load_state_dict(torch.load('./models/best_model.pt')[0])
model.eval()

predictions_list = test_accuracy(model, x_tensor)
with open('./result_of_detection.json', 'w') as outfile:
    print("*********** Save the result of detection for file {0}".format(file))
    json.dump(predictions_list, outfile)

torch.Size([5, 2, 3, 609, 36, 36])
[0.49523571133613586, 0.4952322542667389, 0.495260626077652, 0.49523743987083435, 0.4952356517314911]
*********** Save the result of detection for file ./general_video.avi
