In [2]:
import dlib
import yaml
import cv2
import numpy as np
import torch
import torchvision
from torchvision import transforms
from PIL import Image
import time

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import math
from backbone.repvgg import get_RepVGG_func_by_name
import utils

class SixDRepNet(nn.Module):
    """Head-pose regressor: a RepVGG backbone followed by a 6-value linear head.

    The five backbone stages run in order, the result is global-average
    pooled, flattened and mapped to six values, which are handed to
    ``utils.compute_rotation_matrix_from_ortho6d``.

    Args:
        backbone_name: Name passed to ``get_RepVGG_func_by_name`` to select
            the backbone constructor.
        backbone_file: Path of the backbone checkpoint; only read when
            ``pretrained`` is true.
        deploy: Forwarded unchanged to the backbone constructor.
        bins: Not used by this class; kept so existing callers keep working.
        droBatchNorm: Not used by this class; kept so existing callers keep
            working.
        pretrained: When true, load ``backbone_file`` into the backbone
            before its stages are taken over.

    Raises:
        ValueError: If the last stage contains no ``rbr_dense`` /
            ``rbr_reparam`` convolution, so the head cannot be sized.
    """

    def __init__(self,
                 backbone_name, backbone_file, deploy,
                 bins=(1, 2, 3, 6),
                 droBatchNorm=nn.BatchNorm2d,
                 pretrained=True):
        super().__init__()

        repvgg_fn = get_RepVGG_func_by_name(backbone_name)
        backbone = repvgg_fn(deploy)
        if pretrained:
            # Map storages to CPU so a checkpoint written on a GPU machine
            # also loads on a CPU-only one; load_state_dict copies the values
            # into the backbone's own tensors afterwards.
            checkpoint = torch.load(backbone_file, map_location='cpu')
            if 'state_dict' in checkpoint:
                checkpoint = checkpoint['state_dict']
            # Drop every 'module.' fragment from the keys (presumably a
            # DataParallel prefix -- confirm against the checkpoint).
            ckpt = {k.replace('module.', ''): v
                    for k, v in checkpoint.items()}
            backbone.load_state_dict(ckpt)

        # Assigned in the same order as before so the registered sub-module
        # order (and therefore state_dict key order) is unchanged.
        self.layer0 = backbone.stage0
        self.layer1 = backbone.stage1
        self.layer2 = backbone.stage2
        self.layer3 = backbone.stage3
        self.layer4 = backbone.stage4
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)

        # The head width is the out_channels of the last matching convolution
        # found while walking the final stage.
        last_channel = 0
        for n, m in self.layer4.named_modules():
            if ('rbr_dense' in n or 'rbr_reparam' in n) and isinstance(m, nn.Conv2d):
                last_channel = m.out_channels

        if last_channel <= 0:
            # Previously this silently produced nn.Linear(0, 6), which only
            # failed later inside forward().
            raise ValueError(
                "Could not determine the backbone output width: no "
                "'rbr_dense'/'rbr_reparam' Conv2d found in the last stage")

        fea_dim = last_channel

        self.linear_reg = nn.Linear(fea_dim, 6)

    def forward(self, x):
        """Run the backbone and head on ``x`` and return the result of
        ``utils.compute_rotation_matrix_from_ortho6d`` on the 6-value output."""
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.gap(x)
        # Keep the batch dimension, merge everything else.
        x = torch.flatten(x, 1)
        x = self.linear_reg(x)
        return utils.compute_rotation_matrix_from_ortho6d(x)


In [6]:
# Reload the complete model object from "HEAD_POSE_MODEL.pt", the same file
# name the `torch.save(model, ...)` cell of this notebook writes.
# NOTE(review): torch.load unpickles the file, so only load files you trust;
# presumably the SixDRepNet class must already be defined for unpickling to
# succeed -- confirm.
model = torch.load("HEAD_POSE_MODEL.pt")
# Switch to evaluation mode; as the cell's last expression its return value
# is what gets displayed below.
model.eval()

SixDRepNet(
  (layer0): RepVGGBlock(
    (nonlinearity): ReLU()
    (se): Identity()
    (rbr_reparam): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (layer1): Sequential(
    (0): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
    (1): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2)
    )
    (2): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (3): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2)
    )
  )
  (layer2): Sequential(
    (0): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      

In [4]:
# Build the network without backbone weights; every weight comes from the
# full checkpoint loaded below.
model = SixDRepNet(backbone_name='RepVGG-B1g2',
                   backbone_file='',
                   deploy=True,
                   pretrained=False)

# Load onto the CPU first so the checkpoint opens regardless of the device it
# was saved from; load_state_dict copies the values into the model.
saved_state_dict = torch.load("6DRepNet_300W_LP_BIWI.pth", map_location="cpu")
# The weights are either stored under 'model_state_dict' or the file is the
# state dict itself.
if 'model_state_dict' in saved_state_dict:
    model.load_state_dict(saved_state_dict['model_state_dict'])
else:
    model.load_state_dict(saved_state_dict)

model.eval()
# Use the GPU when there is one instead of failing on CPU-only machines.
# Kept as the last expression so the cell still displays the model.
model.to("cuda" if torch.cuda.is_available() else "cpu")

SixDRepNet(
  (layer0): RepVGGBlock(
    (nonlinearity): ReLU()
    (se): Identity()
    (rbr_reparam): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (layer1): Sequential(
    (0): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
    (1): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2)
    )
    (2): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (3): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      (rbr_reparam): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2)
    )
  )
  (layer2): Sequential(
    (0): RepVGGBlock(
      (nonlinearity): ReLU()
      (se): Identity()
      

In [7]:
# Weights (.caffemodel) and network description (.prototxt) of the Caffe face
# detector that is run through OpenCV's DNN module.
modelFile = "res10_300x300_ssd_iter_140000.caffemodel"
configFile = "deploy.prototxt"
# readNetFromCaffe takes the prototxt first and the weights second.
net = cv2.dnn.readNetFromCaffe(configFile, modelFile)

# dlib's frontal face detector; in the visible notebook it is only referenced
# from the commented-out dlib cells.
face_detector = dlib.get_frontal_face_detector()
# landmark_detector = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

In [8]:
# NOTE(review): FullLoader accepts more than plain data; if config.yaml only
# holds scalars, lists and mappings, yaml.safe_load would be the safer choice.
with open("config.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.loader.FullLoader)

# Zero-based index of the frame that ends up in `img` (one initial read plus
# 395 further reads in the original cell).
TARGET_FRAME = 395

video = cv2.VideoCapture("images/NTHUDDD_dataset/Training_Evaluation_Dataset/Training Dataset/036/noglasses/nonsleepyCombination.avi")
try:
    # Frames are read sequentially up to the target one.
    for i in range(TARGET_FRAME + 1):
        valid, img = video.read()
        if not valid or img is None:
            # Fail here with a clear message instead of letting a later cell
            # crash on `img.shape` of None.
            raise RuntimeError(
                "Could not read frame %d from the video "
                "(file missing, unreadable or too short)" % i)
finally:
    # Always give the file handle / decoder back, even when reading fails.
    video.release()

#### DNN FACE DETECTION

In [9]:
# Run the Caffe face detector on `img` and crop the selected face.
h, w = img.shape[:2]
blob = cv2.dnn.blobFromImage(cv2.resize(img, (300, 300)), 1.0,
                             (300, 300), (104.0, 117.0, 123.0))
net.setInput(blob)
faces = net.forward()

# Copy of the frame that the crop is taken from (and boxes could be drawn on).
drawn_image_dnn = img.copy()
faces_dnn = []        # every padded box whose confidence passed the threshold
factor = 0.2          # box is grown by this fraction of its size on each side
bestBox = ()
best_confidence = 0.0
for i in range(faces.shape[2]):
    confidence = faces[0, 0, i, 2]
    if confidence > 0.5:
        # Columns 3:7 are scaled by (w, h, w, h) to get pixel corners.
        box = faces[0, 0, i, 3:7] * np.array([w, h, w, h])
        (x, y, x1, y1) = box.astype("int")
        width = abs(x1 - x)
        height = abs(y1 - y)
        minX = max(0, int(x) - int(factor*width))
        minY = max(0, int(y) - int(factor*height))
        maxX = int(x1) + int(factor*width)
        maxY = int(y1) + int(factor*height)
        padded_box = [minX, minY, maxX, maxY]
        print(padded_box)
        faces_dnn.append(padded_box)
        # Keep the most confident detection rather than whichever one
        # happened to pass the threshold last.
        if confidence > best_confidence:
            best_confidence = confidence
            bestBox = padded_box
        # drawn_image_dnn = cv2.rectangle(drawn_image_dnn, (x, y), (x1, y1), (0, 0, 255), 2)

if not bestBox:
    # Unpacking the empty tuple would raise an opaque "not enough values to
    # unpack" ValueError; say what actually went wrong instead.
    raise ValueError("No face detected with confidence above 0.5")

(x, y, x1, y1) = bestBox
face_img = drawn_image_dnn[y:y1, x:x1]

[200, 52, 371, 328]


In [10]:
# Preprocessing applied to a PIL face crop before it is fed to the model:
# resize, centre-crop to 224, convert to a tensor, then normalise each
# channel with the mean/std values commonly used for ImageNet-trained nets.
transformations = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [11]:
# Grab frames from the default camera, detect faces with the DNN detector and
# keep the crop of the most confident face in `face_img`.
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # Camera unavailable or stream ended: stop instead of crashing on
            # `frame.shape` of None.
            break

        h, w = frame.shape[:2]
        blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0,
                                     (300, 300), (104.0, 117.0, 123.0))
        net.setInput(blob)
        faces = net.forward()

        # Copy of the frame that the crop is taken from.
        drawn_image_dnn = frame.copy()
        faces_dnn = []        # every padded box that passed the threshold
        factor = 0.2          # box is grown by this fraction on each side
        bestBox = ()
        best_confidence = 0.0
        for i in range(faces.shape[2]):
            confidence = faces[0, 0, i, 2]
            if confidence > 0.5:
                # Columns 3:7 are scaled by (w, h, w, h) to get pixel corners.
                box = faces[0, 0, i, 3:7] * np.array([w, h, w, h])
                (x, y, x1, y1) = box.astype("int")
                width = abs(x1 - x)
                height = abs(y1 - y)
                minX = max(0, int(x) - int(factor*width))
                minY = max(0, int(y) - int(factor*height))
                maxX = int(x1) + int(factor*width)
                maxY = int(y1) + int(factor*height)
                padded_box = [minX, minY, maxX, maxY]
                print(padded_box)
                faces_dnn.append(padded_box)
                # Keep the most confident detection rather than whichever one
                # happened to pass the threshold last.
                if confidence > best_confidence:
                    best_confidence = confidence
                    bestBox = padded_box
                # drawn_image_dnn = cv2.rectangle(drawn_image_dnn, (x, y), (x1, y1), (0, 0, 255), 2)

        if not bestBox:
            # No face in this frame: skip it instead of failing on the
            # 4-way unpack of an empty tuple.
            continue

        (x, y, x1, y1) = bestBox
        face_img = drawn_image_dnn[y:y1, x:x1]
finally:
    # Release the camera on normal exit, on errors and on KeyboardInterrupt.
    cap.release()


def estimate_head_pose_model(face_img, facebox):
    """Estimate head-pose angles for one cropped face image.

    The crop is converted to a PIL RGB image, preprocessed with the
    notebook-level ``transformations``, run through the notebook-level
    ``model`` and the predicted rotation is converted with
    ``utils.compute_euler_angles_from_rotation_matrices``.

    Args:
        face_img: Array accepted by ``PIL.Image.fromarray`` holding the crop.
            NOTE(review): crops taken from OpenCV frames are usually BGR and
            ``convert('RGB')`` does not reorder channels -- confirm this is
            intended.
        facebox: Not used by this function; kept for caller compatibility.

    Returns:
        dict with keys ``"yaw"``, ``"pitch"`` and ``"roll"``; each value is a
        CPU tensor taken from one column of the converted angles (column 0 is
        pitch, 1 is yaw, 2 is roll), scaled by 180/pi.
    """
    pil_img = Image.fromarray(face_img).convert('RGB')
    # Add the leading batch dimension the model expects.
    batch = transformations(pil_img).unsqueeze(0)

    # Follow the model's device instead of hard-coding GPU 0, so the same
    # code also works for a model kept on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        batch = batch.to(first_param.device)

    # Pure inference: do not build an autograd graph for every frame.
    with torch.no_grad():
        R_pred = model(batch)
        euler = utils.compute_euler_angles_from_rotation_matrices(
            R_pred) * 180 / np.pi

    pitch = euler[:, 0].cpu()
    yaw = euler[:, 1].cpu()
    roll = euler[:, 2].cpu()

    return {"yaw": yaw, "pitch": pitch, "roll": roll}

[291, 126, 524, 445]
Head pose estimation: 666.033745 ms
(tensor([-17.6814], grad_fn=<CopyBackwards>), tensor([2.9073], grad_fn=<CopyBackwards>), tensor([-3.5567], grad_fn=<CopyBackwards>))
[287, 104, 506, 421]
Head pose estimation: 42.999744 ms
(tensor([-8.4753], grad_fn=<CopyBackwards>), tensor([5.1045], grad_fn=<CopyBackwards>), tensor([-4.0752], grad_fn=<CopyBackwards>))
[279, 106, 498, 422]
Head pose estimation: 32.000542 ms
(tensor([-9.0314], grad_fn=<CopyBackwards>), tensor([18.9989], grad_fn=<CopyBackwards>), tensor([-3.3609], grad_fn=<CopyBackwards>))
[233, 116, 460, 407]
Head pose estimation: 32.000065 ms
(tensor([0.8258], grad_fn=<CopyBackwards>), tensor([70.5785], grad_fn=<CopyBackwards>), tensor([11.8135], grad_fn=<CopyBackwards>))
[173, 157, 413, 440]
Head pose estimation: 202.997684 ms
(tensor([-13.5300], grad_fn=<CopyBackwards>), tensor([13.0091], grad_fn=<CopyBackwards>), tensor([-34.9199], grad_fn=<CopyBackwards>))
[245, 123, 469, 438]
Head pose estimation: 195.021629

KeyboardInterrupt: 

In [5]:
# Persist the whole `model` object (not just its state dict) to
# "HEAD_POSE_MODEL.pt", the file the `torch.load("HEAD_POSE_MODEL.pt")` cell
# reads back.
torch.save(model, "HEAD_POSE_MODEL.pt")

#### DLIB FACE DETECTION

In [None]:
# faces_dlib = face_detector(img, 1)
# print(len(faces_dlib))
# print(faces_dlib)
# x = faces_dlib[0].left()
# y = faces_dlib[0].top()
# x1 = faces_dlib[0].right()
# y1 = faces_dlib[0].bottom()
# w = x1 - x
# h = y1 - y

# drawn_image = img.copy()
# drawn_image = cv2.rectangle(drawn_image, (x, y), (x1, y1), (0, 0, 255), 2)
# cv2.imshow("", drawn_image)
# cv2.waitKey()

1
rectangles[[(167, 116) (322, 270)]]


-1

In [None]:
# landmark_tuple = []
# img_copy = img.copy()
# for k, d in enumerate(faces_dlib):
#    # rect = dlib.rectangle(*d)
#    landmarks = landmark_detector(img, d)
#    for n in range(0, 68):
#       x = landmarks.part(n).x
#       y = landmarks.part(n).y
#       landmark_tuple.append((x, y))
#       cv2.circle(img_copy, (x, y), 1, (255, 255, 0), -1)

# cv2.imshow(' ', img_copy)
# cv2.waitKey()

-1