In [2]:
%pip install ultralytics

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ultralytics import YOLO
import os
import json
import cv2

In [4]:
current_working_directory = os.getcwd()

YOLO_path = os.path.join(current_working_directory, "models/yolov8n-pose.pt") #path to the YOLO model
datasetDir = os.path.join(current_working_directory, "resources", "test_videos") #change here to the other dataset like RFC-2000
newKeypointsDir = os.path.join(current_working_directory, "Keypoints_Dataset") #The new json data are saved here
os.makedirs(newKeypointsDir, exist_ok=True)

In [5]:

model = YOLO(YOLO_path) #staring the model

In [6]:
#making the labels in the COCO format for YOLO model
labeledKeypoints = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle"
]

In [7]:
for video_file in os.listdir(datasetDir):
    if video_file.endswith(('.avi', '.mp4')):
        videoName = os.path.splitext(video_file)[0]
        videoPath = os.path.join(datasetDir, video_file)
        jsonOutputPath = os.path.join(newKeypointsDir, f"{videoName}.json")
        
        videoCapture = cv2.VideoCapture(videoPath)
        frameIndex = 0
        keyPointsData = []

        while videoCapture.isOpened():
            ret, frame = videoCapture.read()
            if not ret:
                break
            
            results = model(frame) #start YOLO on a frame
            newFrameData = []
            
            # We go trough every person in the frame
            for result in results:
                boxes = result.boxes
                keypoints = result.keypoints

                if boxes is not None and len(boxes) > 0:
                    for i in range(len(boxes)):
                        box_data = boxes.xyxy[i].cpu().numpy()
                        confidence = boxes.conf[i].cpu().item()
                        box = {
                            "x1": float(box_data[0]),
                            "y1": float(box_data[1]),
                            "x2": float(box_data[2]),
                            "y2": float(box_data[3])
                        }

                        keypoints_data = []
                        if keypoints is not None:
                            keypoints_array = keypoints.data[i].cpu().numpy()
                            for j, (x, y, conf) in enumerate(keypoints_array):
                                keypoints_data.append({
                                    "label": labeledKeypoints[j],
                                    "coordinates": {"x": float(x), "y": float(y)},
                                    "confidence": float(conf)
                                })

                        newFrameData.append({
                            "person_id": i + 1,
                            "confidence": confidence,
                            "box": box,
                            "keypoints": keypoints_data
                        })

            keyPointsData.append({"frame": frameIndex, "detections": newFrameData})
            frameIndex += 1
        
        videoCapture.release()

        # Save the Json data
        with open(jsonOutputPath, 'w') as json_file:
            json.dump(keyPointsData, json_file, indent=4)

print("Done")



0: 384x640 9 persons, 83.0ms
Speed: 9.2ms preprocess, 83.0ms inference, 4.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 68.0ms
Speed: 9.9ms preprocess, 68.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 71.4ms
Speed: 9.8ms preprocess, 71.4ms inference, 4.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 73.3ms
Speed: 12.5ms preprocess, 73.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 73.9ms
Speed: 8.8ms preprocess, 73.9ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 74.5ms
Speed: 3.5ms preprocess, 74.5ms inference, 3.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 78.1ms
Speed: 6.1ms preprocess, 78.1ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 77.4ms
Speed: 5.5ms preprocess, 77.4ms inference, 0.0ms postprocess per image at s