In [1]:
import cv2
import torch
import numpy as np
import torch.nn as nn
from utils.import_data import WiderFaceDataset, TRANSFORM, TRAIN_ROOT, TRAIN_ANN_FILE, TEST_ROOT
from utils.anchors import AnchorMatcher, AnchorGenerator, box_nms, compute_loss_with_anchors

In [2]:
class FaceDetectionNet(nn.Module):
    """Small fully-convolutional face detector.

    The backbone is four conv/BatchNorm/ReLU groups with three 2x2 max-pools,
    so the feature map is 1/8 of the input resolution (224x224 -> 28x28).
    A 1x1 convolution then predicts, for every feature-map cell and every
    anchor, five raw values ``(x, y, w, h, conf)``. No activation is applied
    to the head output inside this module.
    """

    def __init__(self, num_anchors=1):
        """
        num_anchors: number of boxes predicted per spatial cell (simplest: 1)

        Raises:
            ValueError: if num_anchors is smaller than 1.
        """
        if num_anchors < 1:
            # Fail early: a head with zero output channels only breaks later,
            # inside forward(), with a confusing reshape error.
            raise ValueError(f"num_anchors must be >= 1, got {num_anchors}")

        super().__init__()

        # kernel_size is the size of the window slid over the image to extract
        # features; 3 means a 3x3 window applied across all input channels.
        #
        # NOTE: the attribute names (`backbone`, `det_head`) and the order of
        # the layers inside the Sequential define the state_dict keys, so they
        # must stay as they are for saved checkpoints to keep loading.

        # Backbone (feature extractor)
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # RGB input
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # downsample by 2 (224x224 -> 112x112)

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # downsample by 2 (112x112 -> 56x56)

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # downsample by 2 (56x56 -> 28x28)
        )

        # Detection head
        # Predict bounding boxes + confidence
        # Output channels = num_anchors * 5 (x, y, w, h, conf)
        self.det_head = nn.Conv2d(256, num_anchors * 5, kernel_size=1)

    def forward(self, x):
        """
        x: [batch_size, 3, H, W]
        Returns:
            out: [batch_size, num_anchors, 5, H/8, W/8]
                 (spatial sizes are floored by the three max-pools).
                 For each anchor and cell the 5 values are
                 (x, y, w, h, confidence).
        """
        features = self.backbone(x)
        out = self.det_head(features)  # [B, 5*num_anchors, H', W']

        # Split the channel axis into (anchor, value): head channel c belongs
        # to anchor c // 5 and holds value c % 5.
        B, _, H, W = out.shape
        return out.view(B, -1, 5, H, W)  # [B, num_anchors, 5, H', W']

In [3]:
import os

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Read the saved training checkpoint, remapping its tensors onto `device`.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
checkpoint_path = os.path.join("saved_checkpoints", "faceNet_checkpoint100_multi.pth")
state = torch.load(checkpoint_path, map_location=device)

# Build the network with its default arguments and restore the weights stored
# under the checkpoint's "model_state" key.
model = FaceDetectionNet()
model.load_state_dict(state["model_state"])
model.to(device)

# Inference mode: BatchNorm layers use their stored running statistics.
model.eval()

FaceDetectionNet(
  (backbone): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dila

In [4]:
def preprocess(frame, size=(224, 224)):
    """Turn an OpenCV frame into a batched float32 tensor on ``device``.

    frame: image array as delivered by OpenCV; expected to be BGR uint8 with
           shape (H, W, 3).
    size:  target size handed straight to ``cv2.resize``.
    Returns:
        float32 tensor of shape (1, 3, H', W') where (H', W', 3) is the shape
        of the resized image; values are the pixel values divided by 255.
    """
    # Reorder channels BGR -> RGB, then resize to the requested size.
    resized_rgb = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), size)

    # Scale pixel values by 1/255 (0..1 for uint8 input).
    scaled = resized_rgb.astype(np.float32) / 255.0

    # Channels first (HWC -> CHW), then a leading batch axis of length 1.
    batch = np.expand_dims(scaled.transpose(2, 0, 1), axis=0)

    # Hand the array to torch and move it to the configured device.
    return torch.from_numpy(batch).to(device)

In [5]:
def get_boxes(image, conf_thresh=0.5, nms_thresh=0.4):
    """Detect faces in one image and return boxes in original-image pixels.

    Args:
        image: PIL-style image; must expose ``.size`` as ``(width, height)``
            and be accepted by ``TRANSFORM``.
        conf_thresh: a prediction is kept only if its sigmoid confidence is
            strictly greater than this value.
        nms_thresh: IoU threshold forwarded to ``box_nms``.

    Returns:
        (pred_boxes, pred_scores): ``pred_boxes`` is an ``[N, 4]`` tensor with
        columns 0/2 scaled to the original width and columns 1/3 scaled to the
        original height; ``pred_scores`` is the matching ``[N]`` tensor of
        confidences. ``N`` is 0 when nothing clears ``conf_thresh``.
    """
    orig_w, orig_h = image.size  # remember the original size for the final rescale

    # Preprocess and add the batch axis.
    # (presumably the same TRANSFORM that was used for training - verify)
    image_tensor = TRANSFORM(image).unsqueeze(0).to(device)  # [1, 3, H_in, W_in]
    in_h, in_w = image_tensor.shape[-2:]

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(image_tensor)  # [1, num_anchors, 5, H', W']

    # One row of 5 values per (anchor, cell). Moving the value axis last before
    # flattening keeps each prediction's 5 numbers together for any number of
    # anchors; for a single anchor the row order is cell-major (row = h*W' + w).
    pred = outputs[0].permute(0, 2, 3, 1).reshape(-1, 5)  # [A*H'*W', 5]

    # Undo normalization: go from fractions of the network input to input pixels.
    # NOTE(review): assumes the network emits coordinates normalized to 0..1 -
    # confirm against the training code.
    to_input_px = pred.new_tensor([in_w, in_h, in_w, in_h])
    pred_boxes = pred[:, :4] * to_input_px
    pred_scores = torch.sigmoid(pred[:, 4])

    # Confidence threshold, then non-max suppression on the survivors
    keep = pred_scores > conf_thresh
    pred_boxes = pred_boxes[keep]
    pred_scores = pred_scores[keep]

    if pred_boxes.shape[0] > 0:
        keep_idx = box_nms(pred_boxes, pred_scores, iou_threshold=nms_thresh)
        pred_boxes = pred_boxes[keep_idx]
        pred_scores = pred_scores[keep_idx]

    # Scale boxes from network-input pixels back to the original image
    scale_x = orig_w / in_w
    scale_y = orig_h / in_h
    pred_boxes = pred_boxes * pred.new_tensor([scale_x, scale_y, scale_x, scale_y])

    return pred_boxes, pred_scores

In [7]:
def predict_boxes(frame):
    """Run the detector on a raw OpenCV frame and decode the output into boxes.

    frame: image array accepted by ``preprocess``; its ``.shape`` is forwarded
           to the decoder so boxes can be expressed in frame coordinates.
    Returns:
        whatever ``decode_output_to_boxes`` returns; the intended format is
        ``[(x1, y1, x2, y2, score), ...]``.
    """
    # 1) Convert the frame into the batched tensor the model expects
    x = preprocess(frame)

    # 2) Run the model. no_grad keeps inference from building an autograd
    #    graph (saves memory and time; nothing is trained here).
    with torch.no_grad():
        output = model(x)

    # 3) Convert output to boxes in original frame coordinates
    # This depends on your model. Placeholder format:
    # boxes = [(x1, y1, x2, y2, score), ...]
    # NOTE(review): decode_output_to_boxes is not defined in the visible part
    # of this notebook - confirm it exists before calling this function.
    boxes = decode_output_to_boxes(output, frame.shape)

    return boxes

In [27]:
from PIL import Image

def main(frame_skip=5, cache_clear_interval=5, max_no_face_frames=30, camera_index=0):
    """Run live face detection on a webcam feed until the user quits.

    Detection runs on every ``frame_skip``-th frame; the most recent boxes are
    cached and drawn on the frames in between.

    Args:
        frame_skip: run ``get_boxes`` once every this many frames (>= 1).
        cache_clear_interval: number of consecutive detection passes without a
            face after which the cached boxes stop being drawn.
        max_no_face_frames: number of consecutive detection passes without a
            face after which the loop exits. Both counters count detection
            passes, not raw frames.
        camera_index: index of the capture device to open.

    Raises:
        ValueError: if ``frame_skip`` is smaller than 1.
        RuntimeError: if the webcam cannot be opened.

    The loop also ends when a frame cannot be read, when "q" is pressed, or
    when the preview window is closed. The camera and all OpenCV windows are
    released on every exit path, including exceptions.
    """
    if frame_skip < 1:
        raise ValueError(f"frame_skip must be >= 1, got {frame_skip}")

    window_name = "Face Detector"
    box_color = (255, 0, 0)  # OpenCV colors are BGR, so this draws in blue

    # DirectShow is a Windows-only backend; if it cannot open the device,
    # retry with OpenCV's default backend before giving up.
    cap = cv2.VideoCapture(camera_index, cv2.CAP_DSHOW)
    if not cap.isOpened():
        cap.release()
        cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError("Could not open webcam")

    frame_count = 0
    no_face_frames = 0

    # Last successful detection, stored as plain Python lists for drawing
    cached_boxes = None
    cached_scores = None

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            if frame_count % frame_skip == 0:
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pred_boxes, pred_scores = get_boxes(Image.fromarray(rgb))

                # Only a non-empty result counts as a detection; a missing
                # (None) or empty result advances the no-face counter.
                if pred_boxes is not None and len(pred_boxes) > 0:
                    # Convert once here instead of per box on every frame.
                    cached_boxes = pred_boxes.cpu().tolist()
                    cached_scores = pred_scores.cpu().tolist()
                    no_face_frames = 0
                else:
                    no_face_frames += 1

            if no_face_frames >= cache_clear_interval:
                cached_boxes = None
                cached_scores = None

            if no_face_frames >= max_no_face_frames:
                print("No faces detected for a while, Exiting.")
                break

            if cached_boxes is not None:
                # NOTE(review): boxes are drawn as (x1, y1, x2, y2) corners -
                # confirm this matches the format get_boxes returns.
                for (x1, y1, x2, y2), score in zip(cached_boxes, cached_scores):
                    left, top = int(x1), int(y1)
                    right, bottom = int(x2), int(y2)
                    cv2.rectangle(frame, (left, top), (right, bottom), box_color, 2)
                    cv2.putText(frame, f"Confidence: {score:.2f}", (left, top - 5),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, box_color, 2)

            cv2.imshow(window_name, frame)
            if (cv2.waitKey(1) & 0xFF) == ord("q"):
                break

            # Stop when the user closes the preview window
            if cv2.getWindowProperty(window_name, cv2.WND_PROP_VISIBLE) < 1:
                break
    finally:
        # Always free the camera and close windows, even if detection raised.
        cap.release()
        cv2.destroyAllWindows()
    


In [28]:
# Start the webcam demo when this code is executed as the entry point
# (`__name__` equals "__main__"); nothing runs if it is merely imported.
if __name__ == "__main__":
    main()