In [63]:
from ultralytics import YOLO
import cv2
import os
from facenet_pytorch import MTCNN,InceptionResnetV1
from torchvision import transforms
from PIL import Image
import torch
import albumentations

In [23]:
# Face detector: YOLO model built from the "yolov11n-face.pt" checkpoint.
# `model` is the module-level detector/tracker that display_objects() calls on every frame.
model = YOLO("yolov11n-face.pt")

In [30]:
import ssl
import urllib.request

# WARNING (security): this replaces the factory that urllib uses to build its
# default HTTPS context with one that performs NO certificate or hostname
# verification. The override is process-wide, so every later HTTPS request
# made through urllib in this kernel is unauthenticated.
# NOTE(review): presumably a workaround for certificate errors while fetching
# pretrained weights - confirm, and prefer repairing the local CA store (or
# pre-downloading the weights) over leaving verification disabled.
# NOTE(review): `urllib.request` is not referenced in the visible cells.
ssl._create_default_https_context = ssl._create_unverified_context

In [35]:
# FaceNet embedding network (weights pretrained on VGGFace2), switched to
# inference mode and kept on the CPU.
model_facenet = InceptionResnetV1(pretrained='vggface2')
model_facenet = model_facenet.eval().to('cpu')

# MTCNN face detector/cropper producing 160x160 crops with no extra margin.
mtcnn = MTCNN(device='cpu', image_size=160, margin=0)

In [38]:
def get_face_embedding_from_file(image_path):
    """Compute a FaceNet embedding for the face found in an image file.

    Args:
        image_path: Path of the image to read.

    Returns:
        The tensor produced by ``model_facenet`` for a batch holding the single
        face crop returned by ``mtcnn``, or ``None`` (after printing a message)
        when the file does not exist or no face is detected.
    """
    if not os.path.exists(image_path):
        print(f"❌ File not found: {image_path}")
        return None

    # Close the file handle deterministically; convert() returns an independent
    # in-memory image, so it stays usable after the `with` block ends.
    with Image.open(image_path) as image_file:
        img = image_file.convert('RGB')

    face = mtcnn(img)
    if face is None:
        print(f"❗ No face detected in {image_path}")
        return None

    face = face.unsqueeze(0).to('cpu')  # add batch dimension: Bx3x160x160
    with torch.no_grad():  # inference only, no autograd bookkeeping
        embedding = model_facenet(face)  # Bx512
    return embedding

def compare_embeddings(emb1, emb2):
    """Return the cosine similarity of two embeddings as a Python float.

    Both arguments are expected to be batched tensors holding one embedding
    each; ``.item()`` requires the result to contain exactly one value.
    """
    cosine = torch.nn.functional.cosine_similarity(emb1, emb2, dim=1)
    return cosine.item()

In [57]:
# Reference gallery: person name -> embedding of that person's avatar image.
# A value is None when the avatar file is missing or contains no detectable face.
dict_emb_avar = {
    person: get_face_embedding_from_file(f'../resources/{person}_avatar.png')
    for person in ('bob', 'sarah')
}

In [49]:
# Query embedding for the photo to identify against the reference avatars.
# `emb` is None when the file is missing or no face is detected in it.
emb = get_face_embedding_from_file('../resources/sarah_img.png')

In [52]:
# Compare the query embedding with each reference avatar and stop at the first
# one whose cosine similarity exceeds 0.4.
for name, emb_item in dict_emb_avar.items():
    if emb_item is None or emb is None:
        # One of the two images produced no embedding, so there is nothing to compare.
        print("Không thể so sánh: Một trong hai ảnh không có khuôn mặt.")
        continue

    similarity = compare_embeddings(emb, emb_item)
    if similarity > 0.4:
        print(f"✅ Likely same person:{name}")
        break

✅ Likely same person:sarah


In [66]:
# Preprocessing applied to raw face crops (numpy arrays) before they are fed
# to the embedding network.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size=(160, 160)),
    transforms.ToTensor(),
    # (x - 0.5) / 0.5 maps the [0, 1] tensor values onto [-1, 1].
    transforms.Normalize(mean=[0.5], std=[0.5]),
])

In [60]:
def recognize_face(embedding, known_embeddings=None, threshold=0.4):
    """Return the name of the known person that best matches ``embedding``.

    Every non-``None`` reference embedding is scored with
    ``compare_embeddings``; the name with the highest similarity strictly
    above ``threshold`` wins. Previously the first name above the threshold
    was returned, which made the result depend on dictionary order whenever
    more than one person passed the threshold.

    Args:
        embedding: Embedding of the face to identify, or ``None``.
        known_embeddings: Mapping of name -> reference embedding (values may be
            ``None``). Defaults to the module-level ``dict_emb_avar``.
        threshold: Minimum cosine similarity for a match; adjust it to tune
            how sensitive recognition is.

    Returns:
        The best-matching name, or ``''`` when nothing exceeds the threshold
        or ``embedding`` is ``None``.
    """
    if embedding is None:
        return ''
    if known_embeddings is None:
        known_embeddings = dict_emb_avar

    best_name, best_sim = '', threshold
    for name_item, emb_element in known_embeddings.items():
        if emb_element is None:
            # Reference image was missing or had no detectable face.
            continue
        sim = compare_embeddings(embedding, emb_element)
        if sim > best_sim:
            best_name, best_sim = name_item, sim
    return best_name

In [61]:
def display_objects(video_path, output_video_path):
    """Write a copy of a video with every tracked face boxed and labelled.

    Each frame is passed to the module-level YOLO ``model`` with ByteTrack
    tracking. For every detection that carries a track id, the face crop is
    preprocessed with ``transform``, embedded with ``model_facenet`` and named
    via ``recognize_face``; the box, name and detection confidence are drawn
    on the frame, which is then appended to the output video (mp4v codec).

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the annotated video to create.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    assert cap.isOpened(), "Error reading video file"

    out = None
    try:
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the frame rate as reported (float): truncating e.g. 29.97 to 29
        # changes the playback speed of the output.
        fps = cap.get(cv2.CAP_PROP_FPS)
        out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))

        # The crop directory is still created for compatibility, although this
        # function does not write any crops into it.
        crop_dir = "../resources/crops_face"
        os.makedirs(crop_dir, exist_ok=True)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            results = model.track(frame, persist=True, tracker="bytetrack.yaml")[0]
            frame_h, frame_w = frame.shape[:2]

            for box in results.boxes:
                # Only detections that the tracker has assigned an id are annotated.
                if getattr(box, 'id', None) is None:
                    continue

                x1, y1, x2, y2 = map(int, box.xyxy[0])
                conf = float(box.conf[0])

                # Clamp the crop to the frame: a negative coordinate would be
                # read as a from-the-end index, and an empty crop makes
                # cv2.cvtColor raise.
                crop_x1, crop_y1 = max(x1, 0), max(y1, 0)
                crop_x2, crop_y2 = min(x2, frame_w), min(y2, frame_h)
                if crop_x2 <= crop_x1 or crop_y2 <= crop_y1:
                    continue

                obj_crop = frame[crop_y1:crop_y2, crop_x1:crop_x2]
                face_rgb = cv2.cvtColor(obj_crop, cv2.COLOR_BGR2RGB)
                img_tensor = transform(face_rgb).unsqueeze(0).to('cpu')
                with torch.no_grad():
                    embedding = model_facenet(img_tensor)
                    name_person = recognize_face(embedding)

                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f'{name_person} {conf:.2f}', (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)

            out.write(frame)
    finally:
        # Release the capture and the writer even if a frame fails, so the
        # output file is finalised and the handles are not leaked.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

In [62]:
# Annotate the queue video: reads queue_video.mp4 and writes the labelled copy
# to queue_video_face_output.mp4 in the same resources folder.
display_objects('../resources/queue_video.mp4' , '../resources/queue_video_face_output.mp4')


0: 384x640 2 faces, 34.4ms
Speed: 2.0ms preprocess, 34.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 33.3ms
Speed: 1.3ms preprocess, 33.3ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 32.2ms
Speed: 1.1ms preprocess, 32.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 33.5ms
Speed: 1.1ms preprocess, 33.5ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 33.1ms
Speed: 1.2ms preprocess, 33.1ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 34.8ms
Speed: 1.1ms preprocess, 34.8ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 34.4ms
Speed: 1.1ms preprocess, 34.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 faces, 33.6ms
Speed: 1.2ms preprocess, 33.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)


In [68]:
# Augmentation pipeline for face images:
#   - HorizontalFlip is applied with probability 0.5
#   - Rotate is applied with probability 0.7; limit=15 is presumably the
#     maximum rotation angle in degrees (confirm in the albumentations docs)
# NOTE(review): no random seed is set in the visible cells, so the augmented
# output can differ from run to run.
transform_augmentation = albumentations.Compose([
    albumentations.HorizontalFlip(p=0.5),
    albumentations.Rotate(limit=15, p=0.7),
])

In [69]:
# Produce one augmented variant of Bob's avatar and save it as a JPEG.
image_origin = cv2.imread('../resources/bob_avatar.png')
# cv2.imread signals a missing or unreadable file by returning None rather than
# raising, so fail here with a clear message instead of inside the augmentation.
if image_origin is None:
    raise FileNotFoundError("Could not read image: ../resources/bob_avatar.png")
augmented = transform_augmentation(image=image_origin)
augmented_image = augmented['image']
# Last expression of the cell: imwrite's success flag is shown as the cell output.
cv2.imwrite('../resources/bob_face_augmented.jpg', augmented_image)

True