In [7]:
import cv2
import torch
from ultralytics import YOLO
import mediapipe as mp
import numpy as np
from collections import deque

In [8]:
print(f"OpenCV: {cv2.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"MediaPipe: {mp.__version__}")
print(f"NumPy: {np.__version__}")

OpenCV: 4.13.0
PyTorch: 2.2.2
CUDA available: True
MediaPipe: 0.10.9
NumPy: 1.26.4


In [9]:
# cap = cv2.VideoCapture(0)
# print("Press 'q' to quit")

# while True:
#     ret, frame = cap.read()
#     if not ret:
#         break

#     rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

#     results = face_mesh.process(rgb_frame)
    
#     # Draw landmarks if face detected
#     if results.multi_face_landmarks:
#         for face_landmarks in results.multi_face_landmarks:
#             mp_drawing.draw_landmarks(
#                 image=frame,
#                 landmark_list=face_landmarks,
#                 connections=mp_face_mesh.FACEMESH_TESSELATION,
#                 landmark_drawing_spec=None,
#                 connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
#             )
        
#         # Display status
#         cv2.putText(frame, "Face Detected", (10, 30),
#                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
#     else:
#         cv2.putText(frame, "No Face", (10, 30),
#                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    
#     cv2.imshow('MediaPipe FaceMesh', frame)
    
#     if cv2.waitKey(1) & 0xFF == ord('q'):
#         break

# cap.release()
# cv2.destroyAllWindows()
# face_mesh.close()

In [10]:
# Eye landmark indices
LEFT_EYE = [33, 160, 158, 133, 153, 144]
RIGHT_EYE = [362, 385, 387, 263, 373, 380]

# 3D model points for head pose
MODEL_POINTS = np.array([
    (0.0, 0.0, 0.0),
    (0.0, -330.0, -65.0),
    (-225.0, 170.0, -135.0),
    (225.0, 170.0, -135.0),
    (-150.0, -150.0, -125.0),
    (150.0, -150.0, -125.0)
], dtype=np.float64)

# Thresholds
EAR_THRESHOLD = 0.25
YAW_THRESHOLD = 32

In [11]:
def calculate_distance(point1, point2):
    return np.sqrt((point1.x - point2.x)**2 + (point1.y - point2.y)**2)

def calculate_ear(eye_landmarks):
    vertical1 = calculate_distance(eye_landmarks[1], eye_landmarks[5])
    vertical2 = calculate_distance(eye_landmarks[2], eye_landmarks[4])
    horizontal = calculate_distance(eye_landmarks[0], eye_landmarks[3])
    ear = (vertical1 + vertical2) / (2.0 * horizontal)
    return ear

def get_eye_landmarks(face_landmarks, eye_indices):
    return [face_landmarks.landmark[i] for i in eye_indices]

In [12]:
def get_2d_points(face_landmarks, img_width, img_height):
    indices = [1, 152, 33, 263, 61, 291]
    points_2d = []
    for idx in indices:
        landmark = face_landmarks.landmark[idx]
        x = int(landmark.x * img_width)
        y = int(landmark.y * img_height)
        points_2d.append([x, y])
    return np.array(points_2d, dtype=np.float64)

def get_head_pose(face_landmarks, img_width, img_height):
    image_points = get_2d_points(face_landmarks, img_width, img_height)
    
    focal_length = img_width
    center = (img_width / 2, img_height / 2)
    camera_matrix = np.array([
        [focal_length, 0, center[0]],
        [0, focal_length, center[1]],
        [0, 0, 1]
    ], dtype=np.float64)
    
    dist_coeffs = np.zeros((4, 1))
    
    success, rotation_vector, translation_vector = cv2.solvePnP(
        MODEL_POINTS,
        image_points,
        camera_matrix,
        dist_coeffs,
        flags=cv2.SOLVEPNP_ITERATIVE
    )
    
    rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
    pose_matrix = cv2.hconcat((rotation_matrix, translation_vector))
    _, _, _, _, _, _, euler_angles = cv2.decomposeProjectionMatrix(pose_matrix)
    
    pitch = euler_angles[0][0]
    yaw = euler_angles[1][0]
    roll = euler_angles[2][0]
    
    return pitch, yaw, roll

In [13]:
# Initialize YOLO
model = YOLO('yolo12m.pt')

# Initialize MediaPipe
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

print("Models initialized!")

Models initialized!


In [14]:
cap = cv2.VideoCapture(0)
print("Press 'q' to quit")

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    img_height, img_width = frame.shape[:2]
    
    # YOLO Detection
    yolo_results = model(frame)
    phone_detected = False
    for box in yolo_results[0].boxes:
        if int(box.cls[0]) == 67 and float(box.conf[0]) > 0.4:
            phone_detected = True
            break
    
    # MediaPipe Face Detection
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    face_results = face_mesh.process(rgb_frame)
    face_detected = face_results.multi_face_landmarks is not None
    
    # Draw YOLO
    annotated_frame = yolo_results[0].plot()
    
    # Process face
    if face_detected:
        face_landmarks = face_results.multi_face_landmarks[0]
        
        # Draw mesh
        mp_drawing.draw_landmarks(
            image=annotated_frame,
            landmark_list=face_landmarks,
            connections=mp_face_mesh.FACEMESH_TESSELATION,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
        )
        
        # Calculate EAR
        left_eye = get_eye_landmarks(face_landmarks, LEFT_EYE)
        right_eye = get_eye_landmarks(face_landmarks, RIGHT_EYE)
        left_ear = calculate_ear(left_eye)
        right_ear = calculate_ear(right_eye)
        avg_ear = (left_ear + right_ear) / 2.0
        eyes_closed = avg_ear < EAR_THRESHOLD
        
        # Calculate head pose
        pitch, yaw, roll = get_head_pose(face_landmarks, img_width, img_height)
        looking_away = abs(yaw) > YAW_THRESHOLD
        
        # Display EAR
        ear_text = f"EAR: {avg_ear:.3f}"
        ear_color = (0, 0, 255) if eyes_closed else (0, 255, 0)
        cv2.putText(annotated_frame, ear_text, (10, 100),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, ear_color, 2)
        
        blink_text = "EYES CLOSED" if eyes_closed else "EYES OPEN"
        cv2.putText(annotated_frame, blink_text, (10, 135),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, ear_color, 2)
        
        # Display head pose
        pose_text = f"Yaw: {yaw:.1f}deg Pitch: {pitch:.1f}deg"
        cv2.putText(annotated_frame, pose_text, (10, 170),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
        
        gaze_color = (0, 0, 255) if looking_away else (0, 255, 0)
        gaze_text = "LOOKING AWAY" if looking_away else "LOOKING FORWARD"
        cv2.putText(annotated_frame, gaze_text, (10, 205),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, gaze_color, 2)
    
    # Display status
    phone_color = (0, 0, 255) if phone_detected else (0, 255, 0)
    phone_text = "PHONE: YES" if phone_detected else "PHONE: NO"
    cv2.putText(annotated_frame, phone_text, (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, phone_color, 2)
    
    face_color = (0, 255, 0) if face_detected else (0, 0, 255)
    face_text = "FACE: YES" if face_detected else "FACE: NO"
    cv2.putText(annotated_frame, face_text, (10, 65),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, face_color, 2)
    
    cv2.imshow('Attention Detection', annotated_frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
face_mesh.close()

Press 'q' to quit

0: 480x640 1 person, 45.7ms
Speed: 5.4ms preprocess, 45.7ms inference, 3.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 28.4ms
Speed: 2.7ms preprocess, 28.4ms inference, 5.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 34.5ms
Speed: 2.5ms preprocess, 34.5ms inference, 3.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 29.8ms
Speed: 1.4ms preprocess, 29.8ms inference, 3.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 30.1ms
Speed: 2.6ms preprocess, 30.1ms inference, 3.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 29.9ms
Speed: 1.7ms preprocess, 29.9ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 27.4ms
Speed: 2.5ms preprocess, 27.4ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 30.2ms
Speed: 2.1ms preprocess, 30.2ms inference, 3.5ms p