In [41]:
import numpy as np
import cv2
import torch

### YOLOv5 object detection using PyTorch Hub

In [65]:
# Load a pretrained YOLOv5 model through PyTorch Hub. The model name selects the
# architecture size (n, s, m, l, x): 'yolov5n' (nano) is the smallest and fastest
# variant and also the least accurate one, so swap in a larger variant to trade
# speed for accuracy.

yolov5 = torch.hub.load(
    'ultralytics/yolov5',
    'yolov5n',
    pretrained=True,
    _verbose=False,
)
yolov5.cpu() # Move the model to the CPU, since we are most likely running on CPU-only devices.

Using cache found in C:\Users\tommaso.massaglia/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2022-10-3 Python-3.9.0 torch-1.12.1+cpu CPU

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 16, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
  

In [66]:
# Real-time YOLOv5 detection on the default webcam.
# Press 'q' while the "Webcam" window is focused to stop.
cap = cv2.VideoCapture(0)

try:
    while True:
        ok, img = cap.read()
        if not ok: # No frame available (camera busy/unplugged): img is None, so stop cleanly
            print("Could not read a frame from the webcam, stopping.")
            break

        # OpenCV delivers BGR frames, while the YOLOv5 AutoShape wrapper expects RGB numpy images
        preds = yolov5(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        # preds.xyxy[0] holds the detections for the (single) input image, one row each:
        # x1, y1, x2, y2, confidence, class_num. preds.names maps class_num to its label.
        for x1, y1, x2, y2, conf, class_num in preds.xyxy[0].tolist():
            label = preds.names[int(class_num)]

            # The output coordinates are floats, cv2 requires ints to work
            cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (255, 255, 255), 2)
            cv2.putText(img, f'{label} {conf:.2f}', (int(x1 + 5), int(y2 - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255))

        cv2.imshow("Webcam", img) # This will open an independent window

        if cv2.waitKey(1) & 0xFF == ord('q'): # quit when 'q' is pressed
            break # Break the cycle and interrupt the detection
finally:
    # Runs on normal exit, on errors and on a kernel interrupt alike,
    # so the webcam is never left locked and no window is left hanging.
    cap.release() # Free the webcam resource
    cv2.destroyAllWindows() # Makes sure that the open cv2 windows are closed

In [59]:
# Manual cleanup cell: run it when the capture loop had to be interrupted by hand
# and therefore never reached its own cleanup code.

cv2.destroyAllWindows() # Close any OpenCV window that is still open
cap.release() # Give the webcam back to the system

### DNN Face detection using CV2

Reference: https://docs.opencv.org/4.x/d0/dd4/tutorial_dnn_face.html

In [71]:
# Get the size of the webcam frame (used to configure the face detector input size)

cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        # Without this check the properties below would silently come back as 0
        raise RuntimeError("Could not open the webcam (device 0).")
    frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
finally:
    # Release the device right away: the detection loop opens its own capture,
    # and a handle left open here could keep the camera busy.
    cap.release()

In [72]:
# Detector hyperparameters (see the cv2.FaceDetectorYN.create documentation)
score_threshold = 0.4 # boxes scoring below this value are filtered out
nms_treshold = 0.2    # IoU threshold used by non-maximum suppression
top_k = 10            # number of top-scoring boxes kept before NMS

# Initialize the OpenCV face detector from the ONNX model file (no separate config file)
detector = cv2.FaceDetectorYN.create(
    'dnn_cv2.onnx',
    "",
    (320,320),
    score_threshold,
    nms_treshold,
    top_k,
)

# Replace the placeholder input size with the actual webcam frame size
# (a scale parameter could be added here)
detector.setInputSize((frameWidth, frameHeight))

In [83]:
# Real-time face detection on the default webcam.
# Press 'q' while the "Webcam" window is focused to stop.
cap = cv2.VideoCapture(0)

try:
    while True:
        ok, img = cap.read()
        if not ok: # No frame available (camera busy/unplugged): img is None, so stop cleanly
            print("Could not read a frame from the webcam, stopping.")
            break

        # detect() returns a (retval, faces) pair; faces is None when no face is found,
        # so it must be checked before iterating
        _, faces = detector.detect(img)

        if faces is not None:
            for face in faces:
                # The first four values of each row are the box x, y, width, height (floats)
                x, y, w, h = (int(el) for el in face[:4]) # Convert the coordinates to ints
                cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 255), 2) # Draw a rectangle over the face

        cv2.imshow("Webcam", img) # This will open an independent window

        if cv2.waitKey(1) & 0xFF == ord('q'): # quit when 'q' is pressed
            break # Break the cycle and interrupt the detection
finally:
    # Runs on normal exit, on errors and on a kernel interrupt alike,
    # so the webcam is never left locked and no window is left hanging.
    cap.release() # Free the webcam resource
    cv2.destroyAllWindows() # Makes sure that the open cv2 windows are closed