In [15]:
import cv2
import numpy as np
import dlib
import os
import json

In [16]:
def load_yolo_model(weights_path, config_path):
    """Load a detection network with OpenCV's DNN module and resolve its output layers.

    Args:
        weights_path: Path to the trained weights file, forwarded to
            ``cv2.dnn.readNet``.
        config_path: Path to the matching network configuration file,
            forwarded to ``cv2.dnn.readNet``.

    Returns:
        A ``(net, output_layers)`` tuple: the network object and the list of
        names of its unconnected output layers, ready to be passed to
        ``net.forward(output_layers)``.
    """
    net = cv2.dnn.readNet(weights_path, config_path)
    layer_names = net.getLayerNames()

    # getUnconnectedOutLayers() yields 1-based layer ids. Depending on the
    # OpenCV build they arrive either as a flat array or as an (N, 1) array;
    # flattening first makes the lookup work for both layouts.
    out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
    output_layers = [layer_names[int(layer_id) - 1] for layer_id in out_layer_ids]
    return net, output_layers

In [17]:
def detect_faces(img, net, output_layers, confidence_threshold=0.5, nms_threshold=0.4):
    """Detect faces in an image and draw a green box around each one.

    The image is resized to a 416x416 blob (pixel values scaled by
    0.00392, i.e. roughly 1/255, with the R and B channels swapped) and run
    through ``net``. Each detection row is read as
    ``[center_x, center_y, width, height, <unused>, score...]`` with the
    first four values expressed as fractions of the image size.

    Args:
        img: Image array whose first two dimensions are (height, width).
            It is modified in place: a rectangle is drawn for every face
            that is kept.
        net: Network object providing ``setInput`` and ``forward``.
        output_layers: Layer names passed to ``net.forward``.
        confidence_threshold: Minimum score a detection must exceed. The
            same value is used as the score threshold for non-maximum
            suppression so that lowering it is not silently overridden.
        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Returns:
        ``(img, faces, selected_confidences)`` where ``faces`` is a list of
        ``(x, y, w, h)`` boxes clipped to the image bounds and
        ``selected_confidences`` holds the matching scores in the same
        order. Boxes that lie entirely outside the image are dropped.
    """
    height, width = img.shape[:2]

    # Prepare the image for the model
    blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    boxes = []
    confidences = []

    # Convert every sufficiently confident detection to a pixel-space box.
    for out in outs:
        for detection in out:
            confidence = max(detection[5:])
            if confidence > confidence_threshold:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(confidence))

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, confidence_threshold, nms_threshold)

    faces = []
    selected_confidences = []
    # NMSBoxes returns an empty tuple when nothing survives and, depending on
    # the OpenCV version, a flat or (N, 1) array otherwise; normalise to 1-D.
    for i in np.asarray(indexes, dtype=int).reshape(-1):
        x, y, w, h = boxes[i]

        # Predicted boxes may extend past the frame. Negative coordinates
        # would wrap around when used as slice bounds, so clip to the image.
        left, top = max(x, 0), max(y, 0)
        right, bottom = min(x + w, width), min(y + h, height)
        if right <= left or bottom <= top:
            continue

        faces.append((left, top, right - left, bottom - top))
        selected_confidences.append(confidences[i])
        cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)

    return img, faces, selected_confidences

In [18]:
def extract_face_characteristics(img, faces, shape_predictor, face_rec_model):
    """Compute landmarks and an embedding for each face box and draw the landmarks.

    For every box the corresponding region is cropped, converted from BGR to
    RGB, and passed to ``shape_predictor`` and then to
    ``face_rec_model.compute_face_descriptor``. Each landmark is drawn on
    ``img`` as a red dot surrounded by a small blue square.

    Args:
        img: BGR image the boxes refer to. It is modified in place by the
            landmark drawing.
        faces: Iterable of ``(x, y, w, h)`` boxes in ``img`` pixel
            coordinates. Boxes are clipped to the image before cropping.
        shape_predictor: Callable taking ``(rgb_image, dlib.rectangle)`` and
            returning an object with a ``parts()`` method.
        face_rec_model: Object providing
            ``compute_face_descriptor(rgb_image, shape)``.

    Returns:
        A list with exactly one dict per input box, in input order, with keys:

        * ``'bbox'``: the box as passed in.
        * ``'landmarks'``: list of ``(x, y)`` points relative to the top-left
          corner of the (clipped) crop, not to the full image.
        * ``'embedding'``: ``numpy`` array built from the face descriptor.

        A box that does not overlap the image at all yields an entry with an
        empty ``'landmarks'`` list and an empty ``'embedding'`` array, so the
        result stays index-aligned with ``faces``.
    """
    landmark_bbox_size = 4  # Half-size of the square drawn around each landmark
    img_height, img_width = img.shape[:2]

    # Crops are taken from a snapshot made before anything is drawn, so the
    # overlays painted for earlier faces cannot leak into later, overlapping
    # crops and distort their landmarks or embeddings.
    pristine = img.copy()

    characteristics = []
    for (x, y, w, h) in faces:
        # Negative or oversized coordinates would produce an empty or
        # wrapped-around slice; clip the box to the image first.
        left, top = max(x, 0), max(y, 0)
        right, bottom = min(x + w, img_width), min(y + h, img_height)
        if right <= left or bottom <= top:
            characteristics.append({
                'bbox': (x, y, w, h),
                'landmarks': [],
                'embedding': np.array([])
            })
            continue

        face = img[top:bottom, left:right]  # view into img, used for drawing
        rgb_face = cv2.cvtColor(pristine[top:bottom, left:right], cv2.COLOR_BGR2RGB)

        # Predict face landmarks over the whole crop
        shape = shape_predictor(rgb_face, dlib.rectangle(0, 0, rgb_face.shape[1], rgb_face.shape[0]))
        landmarks = [(p.x, p.y) for p in shape.parts()]

        # Get the face embedding
        face_descriptor = face_rec_model.compute_face_descriptor(rgb_face, shape)

        # Draw landmarks and a small bounding box around each of them
        for (px, py) in landmarks:
            cv2.circle(face, (px, py), 2, (0, 0, 255), -1)  # Red dot
            cv2.rectangle(face, (px - landmark_bbox_size, py - landmark_bbox_size),
                          (px + landmark_bbox_size, py + landmark_bbox_size), (255, 0, 0), 1)  # Blue square

        characteristics.append({
            'bbox': (x, y, w, h),
            'landmarks': landmarks,
            'embedding': np.array(face_descriptor)
        })

    return characteristics

In [19]:
def process_video(video_path, output_video_path, net, output_layers, shape_predictor, face_rec_model,
                  verbose=True):
    """Run face detection and characterisation on roughly one frame per second of a video.

    Every ``frame_rate``-th frame (``frame_rate`` being the integer part of
    the FPS reported by the capture, at least 1) is passed through
    ``detect_faces`` and ``extract_face_characteristics``. The annotated
    frame is appended to the output video and a result record is collected.

    Note that only the sampled frames are written while the writer is opened
    at the source frame rate, so the output video is shorter than the input.

    Args:
        video_path: Path of the video to read.
        output_video_path: Path of the XVID-encoded video to write.
        net: Detection network, forwarded to ``detect_faces``.
        output_layers: Output layer names, forwarded to ``detect_faces``.
        shape_predictor: Landmark predictor, forwarded to
            ``extract_face_characteristics``.
        face_rec_model: Embedding model, forwarded to
            ``extract_face_characteristics``.
        verbose: When true (the default), print each frame's record as
            indented JSON. Pass ``False`` to avoid flooding the output.

    Returns:
        A list of JSON-serialisable dicts, one per processed frame, with keys
        ``'frame_id'``, ``'face_count'`` and ``'faces'``; each face carries
        ``'face_id'``, ``'bbox'``, ``'confidence'``, ``'landmarks'`` and
        ``'embedding'`` (as a plain list).

    Raises:
        IOError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError("Could not open video: {}".format(video_path))

    out = None
    result_data = []
    try:
        # Some containers report an FPS of 0 (or below 1); fall back to 1 so
        # the sampling modulo below can never divide by zero.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_rate = int(fps) if fps and fps >= 1 else 1

        # Prepare video writer
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, frame_size)

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_rate == 0:  # Process 1 frame per second
                img_with_faces, faces, confidences = detect_faces(
                    frame, net, output_layers, confidence_threshold=0.5)
                characteristics = extract_face_characteristics(
                    frame, faces, shape_predictor, face_rec_model)

                # Store the results; characteristics and confidences are
                # both ordered like `faces`, so they can be paired directly.
                frame_data = {
                    'frame_id': frame_count,
                    'face_count': len(faces),
                    'faces': [
                        {
                            'face_id': idx,
                            'bbox': face_data['bbox'],
                            'confidence': confidence,
                            'landmarks': face_data['landmarks'],
                            # Convert to list for JSON serialization
                            'embedding': face_data['embedding'].tolist()
                        }
                        for idx, (face_data, confidence) in enumerate(zip(characteristics, confidences))
                    ]
                }
                result_data.append(frame_data)

                if verbose:
                    print(json.dumps(frame_data, indent=4))

                # Write the frame with detected faces to the output video
                out.write(img_with_faces)

            frame_count += 1
    finally:
        # Release the handles even if processing is interrupted or fails,
        # so the partially written output file is finalised.
        cap.release()
        if out is not None:
            out.release()

    return result_data

In [20]:
def main():
    """Run the full face-analysis pipeline on the hard-coded input video.

    Loads the detection network and the two dlib models from files in the
    current working directory, processes ``crowd.mp4`` into
    ``output_video_with_faces.avi``, and saves the per-frame results to
    ``result_data.json``.
    """
    # Face detector (network weights + configuration).
    net, output_layers = load_yolo_model("yolov3-wider_16000.weights", "yolov3-face.cfg")

    # dlib models: landmark predictor and face-embedding network.
    landmark_model = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
    embedding_model = dlib.face_recognition_model_v1("dlib_face_recognition_resnet_model_v1.dat")

    per_frame_results = process_video(
        "crowd.mp4",
        "output_video_with_faces.avi",
        net,
        output_layers,
        landmark_model,
        embedding_model,
    )

    # Persist everything that was collected as indented JSON.
    with open('result_data.json', 'w') as results_file:
        json.dump(per_frame_results, results_file, indent=4)

# Entry point: run the pipeline only when this code is executed directly
# (where __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

{
    "frame_id": 0,
    "face_count": 32,
    "faces": [
        {
            "face_id": 0,
            "bbox": [
                897,
                843,
                59,
                64
            ],
            "confidence": 0.9976189732551575,
            "landmarks": [
                [
                    1,
                    25
                ],
                [
                    1,
                    32
                ],
                [
                    1,
                    38
                ],
                [
                    1,
                    44
                ],
                [
                    2,
                    51
                ],
                [
                    5,
                    57
                ],
                [
                    10,
                    60
                ],
                [
                    16,
                    63
                ],
                [
                    22,
       

KeyboardInterrupt: 