#### Determining the landmarks

In [None]:
import cv2
import mediapipe as mp
import numpy as np
from ultralytics import YOLO

In [4]:
def calculate_angle(a, b, c):
    """Return the angle at vertex ``b`` between the rays b->a and b->c.

    Args:
        a: Coordinates of the first end point (any sequence accepted by
            ``np.asarray``).
        b: Coordinates of the vertex the angle is measured at.
        c: Coordinates of the second end point.

    Returns:
        The angle in degrees, in the range [0, 180]. ``nan`` is returned when
        ``a`` or ``c`` coincides with ``b``, because the angle is undefined.
    """
    vertex = np.asarray(b, dtype=float)
    vector1 = np.asarray(a, dtype=float) - vertex
    vector2 = np.asarray(c, dtype=float) - vertex

    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    if norm1 == 0 or norm2 == 0:
        # Degenerate input: avoid the divide-by-zero warning and report the
        # undefined angle explicitly (comparisons against nan are False).
        return np.nan

    cosine = np.dot(vector1, vector2) / (norm1 * norm2)
    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return nan for (anti-)parallel vectors.
    cosine = np.clip(cosine, -1.0, 1.0)
    return np.degrees(np.arccos(cosine))

def get_point(landmark):
    """Return the integer (x, y) coordinates of a landmark.

    Reads the module-level ``landmarks`` collection and the current
    ``frame``: the landmark's ``x`` is multiplied by ``frame.shape[1]`` and
    its ``y`` by ``frame.shape[0]``, then both are truncated to ``int``.

    Args:
        landmark: Index/key used to look up the entry in ``landmarks``.

    Returns:
        A tuple ``(x, y)`` of ints.
    """
    point = landmarks[landmark]
    # presumably x/y are normalized to [0, 1] and the shape is
    # (height, width, ...) — confirm against the caller.
    pixel_x = int(point.x * frame.shape[1])
    pixel_y = int(point.y * frame.shape[0])
    return pixel_x, pixel_y

In [5]:
# MediaPipe Pose & Hands
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

pose = mp_pose.Pose()
hands = mp_hands.Hands()

# YOLO model
model = YOLO("yolo-Weights/yolov8n.pt")

# Target classes: currently detecting "person" and "cell phone"
target_classes = {"person", "cell phone"}


def _inside_any_box(x, y, boxes):
    """Return True if the point (x, y) lies inside any (x1, y1, x2, y2) box."""
    return any(x1 <= x <= x2 and y1 <= y <= y2 for x1, y1, x2, y2 in boxes)


cap = cv2.VideoCapture(0)
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Run MediaPipe on an RGB copy of the frame. The copy is marked
        # read-only while it is processed; it is not needed afterwards
        # because all drawing happens on the original BGR ``frame``.
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        pose_results = pose.process(image)
        hand_results = hands.process(image)

        # YOLO detection: collect and draw boxes of the target classes.
        yolo_results = model(frame, stream=True)
        detected_objects = {"cell phone": []}

        for r in yolo_results:
            for box in r.boxes:
                cls_name = model.names[int(box.cls[0])]
                if cls_name not in target_classes:
                    continue
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                detected_objects.setdefault(cls_name, []).append((x1, y1, x2, y2))
                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 255), 3)
                cv2.putText(frame, cls_name, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Pose detection
        if pose_results.pose_landmarks:
            # NOTE: ``landmarks`` and ``frame`` must stay module-level names;
            # get_point() reads them as globals.
            landmarks = pose_results.pose_landmarks.landmark

            # Extract shoulder, elbow, wrist landmarks for both arms
            shoulder_r = get_point(mp_pose.PoseLandmark.RIGHT_SHOULDER)
            elbow_r = get_point(mp_pose.PoseLandmark.RIGHT_ELBOW)
            wrist_r = get_point(mp_pose.PoseLandmark.RIGHT_WRIST)

            shoulder_l = get_point(mp_pose.PoseLandmark.LEFT_SHOULDER)
            elbow_l = get_point(mp_pose.PoseLandmark.LEFT_ELBOW)
            wrist_l = get_point(mp_pose.PoseLandmark.LEFT_WRIST)

            # A hand counts as "up" when the elbow is bent below 120 degrees
            # and the wrist is above the elbow (smaller y is higher up).
            angle_right = calculate_angle(shoulder_r, elbow_r, wrist_r)
            angle_left = calculate_angle(shoulder_l, elbow_l, wrist_l)

            hand_up = ((angle_right < 120 and wrist_r[1] < elbow_r[1]) or
                       (angle_left < 120 and wrist_l[1] < elbow_l[1]))

            # A phone counts as "held" when any hand landmark falls inside
            # any detected cell-phone box.
            holding_phone = False
            phone_boxes = detected_objects["cell phone"]
            if hand_results.multi_hand_landmarks and phone_boxes:
                frame_h, frame_w = frame.shape[0], frame.shape[1]
                holding_phone = any(
                    _inside_any_box(int(lm.x * frame_w), int(lm.y * frame_h), phone_boxes)
                    for hand_landmarks in hand_results.multi_hand_landmarks
                    for lm in hand_landmarks.landmark
                )

            # Display result
            if hand_up:
                if holding_phone:
                    text, color = "Holding phone", (0, 0, 255)
                else:
                    text, color = "Hand up", (0, 255, 0)
                cv2.putText(frame, text, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)

            # Pose landmarks
            mp_drawing.draw_landmarks(frame, pose_results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        # Draw hand landmarks
        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        cv2.imshow("Pose Detection", frame)
        # Press 'q' to quit.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, windows and MediaPipe graphs, even if a frame
    # raised an exception or the cell was interrupted.
    cap.release()
    cv2.destroyAllWindows()
    pose.close()
    hands.close()

I0000 00:00:1741948270.661878   47104 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1741948270.663738   47854 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) Arc(tm) Graphics (MTL)
I0000 00:00:1741948270.667775   47104 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1741948270.669941   47879 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1ubuntu1~24.04.1), renderer: Mesa Intel(R) Arc(tm) Graphics (MTL)
W0000 00:00:1741948270.691567   47856 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741948270.703487   47862 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741948270.707098   47834 inference_feedback_manager.cc:114] Feedback mana


0: 480x640 1 person, 9.5ms
Speed: 3.3ms preprocess, 9.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 remote, 1 sink, 6.2ms
Speed: 1.7ms preprocess, 6.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.2ms
Speed: 1.6ms preprocess, 7.2ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 4.7ms
Speed: 1.5ms preprocess, 4.7ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 3.8ms
Speed: 1.1ms preprocess, 3.8ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 tv, 3.3ms
Speed: 1.3ms preprocess, 3.3ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 3.5ms
Speed: 1.0ms preprocess, 3.5ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 3.7ms
Speed: 1.0ms preprocess, 3.7ms inference, 0.9ms postprocess per image at shape (