This notebook uses YOLOv8 for person detection and MediaPipe for pose estimation. It first detects every person in a video frame, then passes each detected person, one at a time, to MediaPipe for single-person pose estimation. The resulting landmarks are overlaid on each frame, producing a video with multi-person pose estimation.

# Install libraries

In [None]:
# install mediapipe
!pip install mediapipe
!pip install ultralytics

Collecting mediapipe
  Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency confli

In [None]:

# Sanity-check the installation: prints the Ultralytics, Python and torch versions
# together with a short hardware summary (CPU/GPU, RAM, disk).
import ultralytics
ultralytics.checks()

Ultralytics YOLOv8.2.21 🚀 Python-3.10.12 torch-2.3.0+cu121 CPU (Intel Xeon 2.20GHz)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 30.0/107.7 GB disk)


In [None]:
# import dependencies
import os
import subprocess
import IPython
from IPython.display import Video, display

import numpy as np
import pandas as pd

import cv2
import mediapipe as  mp

# YOLOv8 + Mediapipe MultiShot MultiPose Estimation

## Approach
- We use YOLOv8 object detection to first find the bounding boxes of all the persons in the video.
- We use the bounding boxes to crop each frame individually, so that every crop contains only one person.
- We run MediaPipe single-pose estimation on each cropped frame to estimate that person's pose landmarks.
- We carry out this single pose estimation for each cropped bounding box in each frame, and finally annotate the video with the estimated pose landmarks.

### Function to display video in Google Colab

In [None]:
def play_video(video_path: str):
    """Embed the video file at `video_path` in the notebook output.

    The player is sized to 65% of a 1280x720 frame.
    """
    display_scale = 0.65  # fraction of 1280x720 used for the player
    player = Video(
        data=video_path,
        embed=True,
        width=int(1280 * display_scale),
        height=int(720 * display_scale),
    )
    display(player)

### Perform detections on video file using YOLOv8

Our video is saved at location `/content/two_ppl.mp4`.

In [None]:
video_path = '/content/two_ppl.mp4'

from ultralytics import YOLO

# Load a pretrained YOLOv8n model
model = YOLO("yolov8n.pt")

# Run inference on the same file that is later passed to the pose-estimation step.
# save_txt=True writes one label file per frame into the 'labels' folder of the run directory.
results = model.predict(video_path, save_txt=True, imgsz=640, show=True)

# Ultralytics creates a fresh run directory (predict, predict2, ...) on every call, so take the
# directory of THIS run from the results instead of assuming the first one. Fall back to the
# default location if the run directory is not exposed on the results.
run_dir = getattr(results[0], 'save_dir', None) if results else None
bbox_labels_path = os.path.join(run_dir, 'labels') if run_dir else '/content/runs/detect/predict/labels'

### Function to convert bounding boxes labels into a dataframe format

In [None]:
def bbox_labels_to_dataframe(bbox_labels_path: str) -> pd.DataFrame:
    """
    Collect the person bounding boxes from a folder of YOLO label files.

    Each label file is expected to be named '<video stem>_<frame>.txt' and to
    contain one detection per line in the form
    'class_id center_x center_y width height' (extra trailing fields are ignored).
    Only detections with class_id 0 (person) are kept.

    Args:
        bbox_labels_path: directory containing the per-frame label files.

    Returns:
        DataFrame with columns video_name, frame, class_id, center_x, center_y,
        width and height, one row per person box, ordered by video_name and frame.
        Files that do not follow the naming pattern and lines with fewer than
        five fields are skipped.
    """
    columns = ['video_name', 'frame', 'class_id', 'center_x', 'center_y', 'width', 'height']
    records = []

    for filename in os.listdir(bbox_labels_path):
        stem, ext = os.path.splitext(filename)
        # split on the LAST underscore so the video stem may itself contain underscores
        video_stem, sep, frame_part = stem.rpartition('_')
        if ext.lower() != '.txt' or not sep or not frame_part.isdigit():
            continue  # not a per-frame label file

        video_name = video_stem + '.mp4'
        frame = int(frame_part)

        with open(os.path.join(bbox_labels_path, filename), 'r') as f:
            for line in f:
                fields = line.split()  # whitespace split also drops the trailing newline
                if len(fields) < 5:
                    continue  # blank or malformed line

                class_id = int(fields[0])
                if class_id != 0:  # keep persons only
                    continue

                center_x, center_y, width, height = (float(v) for v in fields[1:5])
                records.append((video_name, frame, class_id, center_x, center_y, width, height))

    # os.listdir order is arbitrary; a stable sort makes the result deterministic while
    # keeping the within-file order of the boxes.
    return (
        pd.DataFrame(records, columns=columns)
        .sort_values(['video_name', 'frame'], kind='stable')
        .reset_index(drop=True)
    )

### Function that takes a video path and the bounding-box dataframe and performs multi-person pose estimation using MediaPipe. It returns the path of the processed video file, which contains the overlaid detections.

In [None]:
def _bbox_to_pixel_rect(box, frame_width: int, frame_height: int):
    """Convert a normalized (center_x, center_y, width, height) box to a pixel rect clamped to the frame."""
    center_x, center_y, box_width, box_height = box
    center_x *= frame_width
    center_y *= frame_height
    box_width *= frame_width
    box_height *= frame_height

    # clamp so that boxes reaching past the frame border never produce negative slice indices
    left = max(0, int(center_x - (box_width / 2)))
    top = max(0, int(center_y - (box_height / 2)))
    right = min(frame_width, int(center_x + (box_width / 2)))
    bottom = min(frame_height, int(center_y + (box_height / 2)))
    return left, top, right, bottom


def multi_pose_estimation(video_path:str, bbox_labels: pd.DataFrame, verbose=True, static_image_mode: bool = True) -> str:
    """
    Performs multi-shot multi-pose estimation by obtaining person bboxes from YOLOv8
    and performing single pose estimation on each bbox crop.

    Args:
        video_path: path of the input video.
        bbox_labels: person boxes with columns frame (1-based frame number),
            center_x, center_y, width and height, all normalized to [0, 1].
        verbose: print a progress line for every processed frame.
        static_image_mode: forwarded to MediaPipe Pose. The same Pose instance is fed
            crops of different people one after another, so by default every crop is
            treated as an independent image instead of a continuation of the previous one.

    Returns:
        Path of the H.264 re-encoded output video ("labeled_<video name>", written to
        the current working directory).

    Raises:
        IOError: the input video cannot be opened.
        subprocess.CalledProcessError: the ffmpeg re-encoding step fails.
    """

    # initializing mediapipe utils
    mp_drawing = mp.solutions.drawing_utils
    mp_drawing_styles = mp.solutions.drawing_styles
    mp_pose = mp.solutions.pose

    # video name
    video_name = os.path.basename(video_path)

    # VideoCapture Object; fail early instead of writing an empty output file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f'Error opening video file! ({video_path})')

    # video variables
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # prefer the container's frame count; the labels only cover frames with detections
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames <= 0 and not bbox_labels.empty:
        total_frames = int(bbox_labels['frame'].max())

    # VideoWriter Object
    output_path  = "labeled_" + video_name
    tmp_output_path = 'tmp_' + output_path
    out = cv2.VideoWriter(tmp_output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    # group the boxes once instead of filtering the whole dataframe for every frame
    box_columns = ['center_x', 'center_y', 'width', 'height']
    boxes_by_frame = {
        int(frame_number): group[box_columns]
        for frame_number, group in bbox_labels.groupby('frame')
    }

    # multipose estimation
    try:
        with mp_pose.Pose(
            static_image_mode = static_image_mode,
            min_detection_confidence = 0.4,
            min_tracking_confidence = 0.4) as pose:
            frame = 1
            while True:
                success, image = cap.read()
                if not success:
                    break

                frame_boxes = boxes_by_frame.get(frame)
                if frame_boxes is not None:
                    # crop from an untouched copy so that the skeleton drawn for one person
                    # never ends up inside the crop of an overlapping person
                    clean_image = image.copy()

                    # iterating through bboxes in the frame
                    for box in frame_boxes.itertuples(index=False, name=None):
                        left, top, right, bottom = _bbox_to_pixel_rect(box, width, height)
                        crop_width = right - left
                        crop_height = bottom - top
                        if crop_width <= 0 or crop_height <= 0:
                            continue  # degenerate box, nothing to estimate

                        # pose estimation on the RGB crop
                        image_crop = cv2.cvtColor(clean_image[top:bottom, left:right], cv2.COLOR_BGR2RGB)
                        # set image as not writeable to improve performance
                        image_crop.flags.writeable = False
                        results = pose.process(image_crop)

                        if results.pose_landmarks is None:
                            continue

                        # map landmarks from crop-normalized to frame-normalized coordinates
                        for landmark in results.pose_landmarks.landmark:
                            landmark.x = ((crop_width / width) * landmark.x) + (left / width)
                            landmark.y = ((crop_height / height) * landmark.y) + (top / height)

                        # draw the pose annotations on the output frame
                        mp_drawing.draw_landmarks(
                            image,
                            results.pose_landmarks,
                            mp_pose.POSE_CONNECTIONS,
                            landmark_drawing_spec = mp_drawing_styles.get_default_pose_landmarks_style())

                # save video
                out.write(image)
                if verbose:
                    print(f'Frame: {frame}/{total_frames}')
                frame += 1
    finally:
        # release the handles even if pose estimation fails midway
        cap.release()
        out.release()

    # Not all browsers support the codec, we will re-load the file at tmp_output_path
    # and convert to a codec that is more broadly readable using ffmpeg
    if os.path.exists(output_path):
        os.remove(output_path)
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-i",
                tmp_output_path,
                "-crf",
                "18",
                "-preset",
                "veryfast",
                "-hide_banner",
                "-loglevel",
                "error",
                "-vcodec",
                "libx264",
                output_path,
            ],
            check=True,  # do not return a path to a file that ffmpeg failed to write
        )
    finally:
        if os.path.exists(tmp_output_path):
            os.remove(tmp_output_path)

    return output_path

### Start pose estimation on video file

In [None]:
# Show the folder the YOLO label files are read from
bbox_labels_path

'/content/runs/detect/predict/labels'

In [None]:
# Parse the per-frame YOLO label files into a dataframe of person boxes
bbox_labels = bbox_labels_to_dataframe(bbox_labels_path)


In [None]:
# Inspect the parsed boxes (the notebook view truncates long dataframes)
bbox_labels

Unnamed: 0,video_name,frame,class_id,center_x,center_y,width,height
0,two_ppl_133.txt.mp4,133,0,0.313980,0.600205,0.221859,0.772390
1,two_ppl_133.txt.mp4,133,0,0.501199,0.516606,0.207078,0.927176
2,two_ppl_30.txt.mp4,30,0,0.515949,0.542203,0.182282,0.880402
3,two_ppl_30.txt.mp4,30,0,0.372023,0.636533,0.140908,0.681632
4,two_ppl_106.txt.mp4,106,0,0.296594,0.628202,0.201889,0.724733
...,...,...,...,...,...,...,...
388,two_ppl_163.txt.mp4,163,0,0.397804,0.577408,0.293833,0.807279
389,two_ppl_35.txt.mp4,35,0,0.378950,0.636646,0.159328,0.696024
390,two_ppl_35.txt.mp4,35,0,0.517914,0.543402,0.196994,0.874207
391,two_ppl_89.txt.mp4,89,0,0.468574,0.523782,0.207278,0.925204


In [None]:
# Run pose estimation for every person box and keep the path of the annotated video
output_path = multi_pose_estimation(video_path, bbox_labels)



Frame: 1/189
Frame: 2/189
Frame: 3/189
Frame: 4/189
Frame: 5/189
Frame: 6/189
Frame: 7/189
Frame: 8/189
Frame: 9/189
Frame: 10/189
Frame: 11/189
Frame: 12/189
Frame: 13/189
Frame: 14/189
Frame: 15/189
Frame: 16/189
Frame: 17/189
Frame: 18/189
Frame: 19/189
Frame: 20/189
Frame: 21/189
Frame: 22/189
Frame: 23/189
Frame: 24/189
Frame: 25/189
Frame: 26/189
Frame: 27/189
Frame: 28/189
Frame: 29/189
Frame: 30/189
Frame: 31/189
Frame: 32/189
Frame: 33/189
Frame: 34/189
Frame: 35/189
Frame: 36/189
Frame: 37/189
Frame: 38/189
Frame: 39/189
Frame: 40/189
Frame: 41/189
Frame: 42/189
Frame: 43/189
Frame: 44/189
Frame: 45/189
Frame: 46/189
Frame: 47/189
Frame: 48/189
Frame: 49/189
Frame: 50/189
Frame: 51/189
Frame: 52/189
Frame: 53/189
Frame: 54/189
Frame: 55/189
Frame: 56/189
Frame: 57/189
Frame: 58/189
Frame: 59/189
Frame: 60/189
Frame: 61/189
Frame: 62/189
Frame: 63/189
Frame: 64/189
Frame: 65/189
Frame: 66/189
Frame: 67/189
Frame: 68/189
Frame: 69/189
Frame: 70/189
Frame: 71/189
Frame: 72/189
F

In [None]:
# Embed the annotated video in the notebook output
play_video(output_path)

#### The detections are a bit jittery and can be improved though :(