In [1]:
import cv2
import torch
from ultralytics import YOLO
import mediapipe as mp
import numpy as np
from collections import deque

In [2]:
print(f"OpenCV: {cv2.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"MediaPipe: {mp.__version__}")
print(f"NumPy: {np.__version__}")

OpenCV: 4.13.0
PyTorch: 2.2.2
CUDA available: True
MediaPipe: 0.10.9
NumPy: 1.26.4


In [3]:
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,              
    refine_landmarks=True,        
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

In [4]:
# cap = cv2.VideoCapture(0)
# print("Press 'q' to quit")

# while True:
#     ret, frame = cap.read()
#     if not ret:
#         break

#     rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

#     results = face_mesh.process(rgb_frame)
    
#     # Draw landmarks if face detected
#     if results.multi_face_landmarks:
#         for face_landmarks in results.multi_face_landmarks:
#             mp_drawing.draw_landmarks(
#                 image=frame,
#                 landmark_list=face_landmarks,
#                 connections=mp_face_mesh.FACEMESH_TESSELATION,
#                 landmark_drawing_spec=None,
#                 connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
#             )
        
#         # Display status
#         cv2.putText(frame, "Face Detected", (10, 30),
#                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
#     else:
#         cv2.putText(frame, "No Face", (10, 30),
#                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    
#     cv2.imshow('MediaPipe FaceMesh', frame)
    
#     if cv2.waitKey(1) & 0xFF == ord('q'):
#         break

# cap.release()
# cv2.destroyAllWindows()
# face_mesh.close()

In [4]:
model = YOLO('yolov8n.pt')

In [5]:
LEFT_EYE = [33, 160, 158, 133, 153, 144]
RIGHT_EYE = [362, 385, 387, 263, 373, 380]

def calculate_distance(point1, point2):
    return np.sqrt((point1.x - point2.x)**2 + (point1.y - point2.y)**2)

def calculate_ear(eye_landmarks):
    vertical1 = calculate_distance(eye_landmarks[1], eye_landmarks[5])
    vertical2 = calculate_distance(eye_landmarks[2], eye_landmarks[4])
    horizontal = calculate_distance(eye_landmarks[0], eye_landmarks[3])
    ear = (vertical1 + vertical2) / (2.0 * horizontal)
    return ear

def get_eye_landmarks(face_landmarks, eye_indices):
    return [face_landmarks.landmark[i] for i in eye_indices]

# Initialize
model = YOLO('yolov8n.pt')
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

cap = cv2.VideoCapture(0)
EAR_THRESHOLD = 0.2

print("Press 'q' to quit")

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # YOLO Detection
    yolo_results = model(frame)
    phone_detected = False
    for box in yolo_results[0].boxes:
        if int(box.cls[0]) == 67 and float(box.conf[0]) > 0.5:
            phone_detected = True
            break
    
    # MediaPipe Face Detection
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    face_results = face_mesh.process(rgb_frame)
    face_detected = face_results.multi_face_landmarks is not None
    
    # Draw YOLO
    annotated_frame = yolo_results[0].plot()
    
    # Draw face mesh and calculate EAR
    if face_detected:
        face_landmarks = face_results.multi_face_landmarks[0]
        
        # Draw mesh
        mp_drawing.draw_landmarks(
            image=annotated_frame,
            landmark_list=face_landmarks,
            connections=mp_face_mesh.FACEMESH_TESSELATION,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
        )
        
        # Calculate EAR
        left_eye = get_eye_landmarks(face_landmarks, LEFT_EYE)
        right_eye = get_eye_landmarks(face_landmarks, RIGHT_EYE)
        left_ear = calculate_ear(left_eye)
        right_ear = calculate_ear(right_eye)
        avg_ear = (left_ear + right_ear) / 2.0
        
        eyes_closed = avg_ear < EAR_THRESHOLD
        
        # Display EAR
        ear_text = f"EAR: {avg_ear:.3f}"
        ear_color = (0, 0, 255) if eyes_closed else (0, 255, 0)
        cv2.putText(annotated_frame, ear_text, (10, 100),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, ear_color, 2)
        
        blink_text = "EYES CLOSED" if eyes_closed else "EYES OPEN"
        cv2.putText(annotated_frame, blink_text, (10, 135),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, ear_color, 2)
    
    # Display status
    phone_color = (0, 0, 255) if phone_detected else (0, 255, 0)
    phone_text = "PHONE: YES" if phone_detected else "PHONE: NO"
    cv2.putText(annotated_frame, phone_text, (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, phone_color, 2)
    
    face_color = (0, 255, 0) if face_detected else (0, 0, 255)
    face_text = "FACE: YES" if face_detected else "FACE: NO"
    cv2.putText(annotated_frame, face_text, (10, 65),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, face_color, 2)
    
    cv2.imshow('Attention Detection', annotated_frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
face_mesh.close()

Press 'q' to quit

0: 480x640 1 person, 343.0ms
Speed: 32.7ms preprocess, 343.0ms inference, 37.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 19.1ms
Speed: 4.1ms preprocess, 19.1ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 21.5ms
Speed: 1.8ms preprocess, 21.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 16.8ms
Speed: 2.5ms preprocess, 16.8ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 24.8ms
Speed: 2.1ms preprocess, 24.8ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 22.6ms
Speed: 1.8ms preprocess, 22.6ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 23.4ms
Speed: 2.0ms preprocess, 23.4ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 19.6ms
Speed: 2.1ms prepr