In [1]:
import cv2
from PIL import Image
import torch
from ultralytics import YOLO
from utils.system import get_available_device
from facenet_pytorch import InceptionResnetV1
from collections import defaultdict
import torch.nn.functional as F
import os


# Device selected by the project helper; both models below are placed on it.
device = get_available_device()

# Root directory scanned for reference photos of known people.
KNOWN_FACES_PATH = './known_faces'

# Face detector: load the YOLO checkpoint, then move it to the selected device.
detector_model = YOLO("./best_models/yolo_decoder_best.pt")
detector_model = detector_model.to(device)

# Face embedding network, switched to evaluation mode.
# NOTE(review): `num_classes` looks relevant only when `classify=True` is passed,
# which is not the case here - confirm against the facenet-pytorch docs.
classifier_model = InceptionResnetV1(
    pretrained='vggface2',
    num_classes=2,
    device=device,
)
classifier_model = classifier_model.eval()

  from .autonotebook import tqdm as notebook_tqdm


GPU is not available, using CPU instead
Using device: cpu


In [2]:
# Gallery of reference embeddings: maps a person's name to the list of face
# embeddings collected for that person (filled by the known-faces scan below).
known_embeddings = defaultdict(list)

In [3]:
from torchvision.transforms import v2

# Preprocessing applied to every cropped face before it is passed to the
# embedding model. The steps are order-dependent.
transform = v2.Compose([
    v2.ToImage(),  # convert the input (a PIL crop in this notebook) to a tensor image
    v2.ToDtype(torch.uint8, scale=True),
    v2.Resize((160, 160)),  # change the size to match your model
    v2.ToDtype(torch.float32, scale=True),  # cast to float32, rescaling the value range
    # With mean=0.5 and std=0.5 every channel becomes (x - 0.5) / 0.5.
    v2.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])


def compare_embeddings(embedding, threshold=0.7, known=None):
    """Find the known person whose reference embedding is closest to `embedding`.

    Args:
        embedding: Tensor holding a single face embedding.
        threshold: Largest distance still accepted as a match.
        known: Optional mapping ``name -> iterable of reference embeddings``.
            When omitted, the module-level ``known_embeddings`` gallery is used.

    Returns:
        The name of the closest known person when the smallest distance is
        less than or equal to `threshold`; otherwise ``None`` (also when the
        gallery is empty).
    """
    if known is None:
        known = known_embeddings

    best_match = None
    min_distance = float('inf')

    for name, person_embeddings in known.items():
        for known_embedding in person_embeddings:
            # pairwise_distance yields a single-element tensor here; convert it
            # to a plain float so the bookkeeping does not hold on to tensors.
            distance = float(F.pairwise_distance(embedding, known_embedding))

            if distance < min_distance:
                min_distance = distance
                best_match = name

    return best_match if min_distance <= threshold else None
    

def detect_faces(image):
    """Run the face detector on a single image and return its bounding boxes.

    Args:
        image: The image to analyse (a PIL image at both call sites in this
            notebook).

    Returns:
        The ``boxes.xyxy`` attribute of the first detection result; callers
        read each row as ``(x1, y1, x2, y2)``.
    """
    batch = [image]

    # Inference only: disable gradient computation.
    with torch.no_grad():
        detections = detector_model(batch)

    first_result = detections[0]
    return first_result.boxes.xyxy
    
        
def recognize_faces(image, boxes):
    """Compute one embedding per detected face.

    Args:
        image: PIL image the boxes were detected on.
        boxes: Iterable of ``(x1, y1, x2, y2)`` boxes, e.g. the output of
            ``detect_faces``.

    Returns:
        A tensor with one embedding per box, in the same order as `boxes`.
        When `boxes` is empty an empty 1-D tensor is returned, so iterating or
        zipping over the result yields nothing.
    """
    # Nothing to embed: return early without touching the image or the model.
    if len(boxes) == 0:
        return torch.empty(0)

    # The preprocessing and the model work on 3-channel input; reference photos
    # may be RGBA, grayscale or palette images.
    if image.mode != "RGB":
        image = image.convert("RGB")

    cropped_faces = [
        transform(image.crop([float(coord) for coord in bbox]))
        for bbox in boxes
    ]

    # The classifier lives on `device`, so the batch has to be moved there too;
    # otherwise the forward pass fails as soon as a GPU is selected.
    face_tensors = torch.stack(cropped_faces).to(device)

    # Inference only: without this every embedding keeps its autograd graph
    # alive, including the ones stored in the known-faces gallery.
    with torch.no_grad():
        embeddings = classifier_model(face_tensors)

    return embeddings

In [4]:
from utils.system import get_last_dirname


# Rebuild the gallery from scratch so that re-running this cell does not
# append duplicate embeddings.
known_embeddings.clear()

for path_dir, dir_list, file_list in os.walk(KNOWN_FACES_PATH):
    for file_path in file_list:
        image_path = os.path.join(path_dir, file_path)

        try:
            # The context manager closes the file handle once the embedding
            # has been computed.
            with Image.open(image_path) as image:
                boxes = detect_faces(image)
                embeddings = recognize_faces(image, boxes)
        except OSError as error:
            # Non-image or unreadable files must not abort the whole scan.
            print(f"Skipping {image_path}: {error}")
            continue

        # No face found in the reference photo: indexing [0] would raise.
        if len(embeddings) == 0:
            print(f"Skipping {image_path}: no face detected")
            continue

        # The directory containing the photo is used as the person's name.
        name = get_last_dirname(path_dir)
        # Only the first detected face is enrolled; detach() makes sure the
        # stored reference does not keep an autograd graph alive.
        known_embeddings[name].append(embeddings[0].detach())


0: 448x640 1 face, 75.7ms
Speed: 5.3ms preprocess, 75.7ms inference, 1.3ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 62.7ms
Speed: 3.4ms preprocess, 62.7ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 60.9ms
Speed: 2.5ms preprocess, 60.9ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 60.4ms
Speed: 2.8ms preprocess, 60.4ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 63.2ms
Speed: 2.9ms preprocess, 63.2ms inference, 0.9ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 97.2ms
Speed: 3.9ms preprocess, 97.2ms inference, 1.4ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 75.4ms
Speed: 4.6ms preprocess, 75.4ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 face, 69.9ms
Speed: 5.1ms preprocess, 69.9ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)


In [None]:
import cv2
import time


# Capture video from the webcam.
# 0 is the default webcam index; change it if several cameras are attached.
cap = cv2.VideoCapture(0)

# Fail early if the webcam could not be opened.
if not cap.isOpened():
    cap.release()
    raise IOError("Не удается открыть веб-камеру")

frame_count = 0
start_time = time.time()
fps = 0

try:
    # Main frame-processing loop.
    while True:
        # Read one frame from the webcam; stop when no frame could be read.
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        elapsed_time = time.time() - start_time

        # OpenCV delivers BGR frames; convert to RGB before building the PIL image.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)

        boxes = detect_faces(pil_image)
        embeddings = recognize_faces(pil_image, boxes)

        for bbox, embedding in zip(boxes, embeddings):
            x1, y1, x2, y2 = (int(coord) for coord in bbox[:4])

            name = compare_embeddings(embedding) or 'Unknown'

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the label inside the frame when the face touches the top edge.
            label_y = max(y1 - 10, 15)
            cv2.putText(frame, f"{name}", (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Refresh the FPS estimate once per second.
        if elapsed_time > 1:
            fps = frame_count / elapsed_time
            start_time = time.time()
            frame_count = 0

        # Draw the FPS counter on the frame.
        cv2.putText(frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # Show the annotated frame.
        cv2.imshow('Обнаружение лиц', frame)

        # Leave the loop when the 'q' key is pressed.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera and close the window, even when the loop is
    # stopped by an exception or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()




0: 384x640 1 face, 71.3ms
Speed: 4.4ms preprocess, 71.3ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 60.9ms
Speed: 2.4ms preprocess, 60.9ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 68.1ms
Speed: 2.9ms preprocess, 68.1ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 53.4ms
Speed: 2.2ms preprocess, 53.4ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 49.5ms
Speed: 1.8ms preprocess, 49.5ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 73.2ms
Speed: 2.2ms preprocess, 73.2ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 59.3ms
Speed: 3.1ms preprocess, 59.3ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 76.3ms
Speed: 1.8ms preprocess, 76.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x

KeyboardInterrupt: 

: 