In [29]:
from ultralytics import YOLO
import cv2
import numpy as np

In [2]:
model = YOLO("yolov8n-pose.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-pose.pt to 'yolov8n-pose.pt'...


100%|██████████| 6.52M/6.52M [00:00<00:00, 9.94MB/s]


In [89]:
skeleton_colors = {
    (5, 7): (255, 0, 0),    # Left arm
    (7, 9): (255, 100, 0),
    (6, 8): (0, 255, 0),    # Right arm
    (8, 10): (0, 255, 100),
    (5, 6): (255, 255, 0),  # Shoulder
    (5, 11): (100, 100, 255),
    (6, 12): (100, 255, 255),
    (11, 13): (255, 0, 255),  # Left leg
    (13, 15): (200, 0, 200),
    (12, 14): (0, 100, 255),  # Right leg
    (14, 16): (0, 0, 255),
    (11, 12): (150, 150, 150),  # Hip
}

In [145]:
def calculate_angle(a, b, c):
    a = np.array(a)
    b = np.array(b)
    c = np.array(c)

    radians = np.arctan2(c[1] - b[1], c[0] - b[0]) - \
              np.arctan2(a[1] - b[1], a[0] - b[0])
    angle = np.abs(np.degrees(radians))

    if angle > 180.0:
        angle = 360 - angle
    return angle

In [151]:
def get_arm_angle(keypoints):
    # left_shoulder = keypoints[5]
    # left_elbow = keypoints[7]
    # left_wrist = keypoints[9]

    right_shoulder = keypoints[6]
    right_elbow = keypoints[8]
    right_wrist = keypoints[10]

    # left_arm_angle = calculate_angle(left_shoulder, left_elbow, left_wrist)
    right_arm_angle = calculate_angle(right_shoulder, right_elbow, right_wrist)


    return right_arm_angle



In [168]:
def check_standing_by_y_diff(keypoints, y_threshold=20):
    """
    Checks if a person is standing based on the Y-coordinate difference between shoulders and hips.
    Args:
        keypoints (np.array): Array of keypoint coordinates.
        y_threshold (int): Minimum Y-difference for standing.

    Returns:
        str: "Standing" or "Lying".
    """
    if keypoints is None:
        return "Unknown"

    left_shoulder_y = keypoints[6][1]
    right_shoulder_y = keypoints[7][1]
    left_hip_y = keypoints[12][1]
    right_hip_y = keypoints[13][1]

    avg_shoulder_y = (left_shoulder_y + right_shoulder_y) / 2
    avg_hip_y = (left_hip_y + right_hip_y) / 2

    # In image coordinates, Y increases downwards.
    # So for standing, hip Y should be greater than shoulder Y.
    y_diff = avg_hip_y - avg_shoulder_y

    if y_diff > y_threshold:
        return "Standing"
    else:
        return "Lying"

In [171]:
keypoint_names = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle"
]

skeleton = [
    (5, 7), (7, 9),     # Left arm
    (6, 8), (8, 10),    # Right arm
    (5, 6),             # Shoulders
    (5, 11), (6, 12),   # Torso to hips
    (11, 13), (13, 15), # Left leg
    (12, 14), (14, 16), # Right leg
    (11, 12)            # Hips
]

def draw_pose(frame, keypoints, draw_skeleton=True, draw_labels=True):
    for idx, (x, y) in enumerate(keypoints):
        if x == 0 and y == 0:
            continue  # bỏ điểm không xác định

        pt = (int(x), int(y))

        # Vẽ điểm
        cv2.circle(frame, pt, 4, (0, 255, 0), -1)

        # Ghi label
        if draw_labels:
            label = f"{idx}:{keypoint_names[idx]}"
            cv2.putText(frame, label, (pt[0] + 5, pt[1] - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 0, 0), 1)

    if draw_skeleton:
        for a, b in skeleton:
            x1, y1 = keypoints[a]
            x2, y2 = keypoints[b]
            if x1 > 0 and y1 > 0 and x2 > 0 and y2 > 0:
                pt1 = (int(x1), int(y1))
                pt2 = (int(x2), int(y2))
                color = (255, 255, 255)
                cv2.line(frame, pt1, pt2, color, 2)

def display_pose(input_path, output_path):
    map_id_pose = {}
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"
    w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
    counter = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        results = model.track(frame, persist=True, tracker="bytetrack.yaml")[0]
        for box in results.boxes:
            if hasattr(box, 'id') and box.id is not None:
                track_id = int(box.id[0])
                for person_kp in results[0].keypoints.xy:

                    draw_pose(frame,person_kp)
                    if check_standing_by_y_diff(person_kp, y_threshold=20) == "Standing":
                        angle = get_arm_angle(person_kp)
                        if angle > 140:
                            map_id_pose[track_id] = "pu"
                        elif angle < 100 and map_id_pose[track_id] == 'pu':
                            map_id_pose[track_id] = "pd"
                            counter +=1
                    cv2.putText(frame, f'{counter}', (100, 200),
                                cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 0), 2)
        # Vẽ kết quả lên frame
        out.write(frame)
    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [164]:
def draw_only_skeleton(frame_shape, keypoints_xy):
    h, w = frame_shape[:2]
    output = np.zeros((h, w, 3), dtype=np.uint8)  # Nền đen

    for keypoints in keypoints_xy:
        for a, b in skeleton:
            x1, y1 = keypoints[a]
            x2, y2 = keypoints[b]
            if x1 > 0 and y1 > 0 and x2 > 0 and y2 > 0:
                pt1 = (int(x1), int(y1))
                pt2 = (int(x2), int(y2))
                color = (255, 255, 255)
                # color = skeleton_colors.get((a, b), skeleton_colors.get((b, a), (255, 255, 255)))
                cv2.line(output, pt1, pt2, color, 2)
    return output
def display_pose_raw_skeleton(input_path, output_path):
    map_id_pose = {}
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"
    w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))


    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        results = model.track(frame, persist=True, tracker="bytetrack.yaml")[0]
        for box in results.boxes:
            if hasattr(box, 'id') and box.id is not None:
                track_id = int(box.id[0])
                map_id_pose[track_id] = 1
        keypoints_xy = results[0].keypoints.xy.cpu().numpy()
        # Vẽ kết quả lên frame
        skeleton_frame = draw_only_skeleton(frame.shape, keypoints_xy)
        out.write(skeleton_frame)
    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [172]:
# display_pose_raw_skeleton('../resources/Video_pose_push_up.mp4','../resources/Video_pose_push_up_yolo_raw_skeleton.mp4')
display_pose('../resources/Video_pose_push_up.mp4','../resources/Video_pose_push_up_yolo.mp4')


0: 384x640 1 person, 41.9ms
Speed: 2.2ms preprocess, 41.9ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 40.4ms
Speed: 1.5ms preprocess, 40.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 38.2ms
Speed: 1.0ms preprocess, 38.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 38.6ms
Speed: 1.1ms preprocess, 38.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 39.0ms
Speed: 1.3ms preprocess, 39.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)


  a = np.array(a)
  b = np.array(b)
  c = np.array(c)



0: 384x640 1 person, 43.1ms
Speed: 1.7ms preprocess, 43.1ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 37.2ms
Speed: 1.0ms preprocess, 37.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 36.3ms
Speed: 1.5ms preprocess, 36.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 37.2ms
Speed: 1.4ms preprocess, 37.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 38.4ms
Speed: 1.1ms preprocess, 38.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 37.8ms
Speed: 1.2ms preprocess, 37.8ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 38.9ms
Speed: 1.1ms preprocess, 38.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 39.0ms
Speed: 1.1ms preprocess, 39.0ms inference, 0.4ms postprocess per image at shape (1, 3, 38