In [2]:
import cv2
import numpy as np
import mediapipe as mp
import matplotlib.pyplot as plt

Load YOLO Model

In [3]:
# Load the YOLOv3 network from its weights and config files.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable model directory so the notebook runs elsewhere.
YOLO_WEIGHTS_PATH = '/Users/suryanshpatel/Projects/Pose_detection/yolo/yolov3.weights'
YOLO_CFG_PATH = '/Users/suryanshpatel/Projects/Pose_detection/yolo/yolov3.cfg'

net = cv2.dnn.readNet(YOLO_WEIGHTS_PATH, YOLO_CFG_PATH)
layer_names = net.getLayerNames()

# getUnconnectedOutLayers() yields 1-based layer ids and may return either a
# 1D array or a 2D (N, 1) array. Flattening handles both shapes explicitly,
# instead of relying on an IndexError to pick the right indexing form (which
# could also hide an unrelated out-of-range error).
output_layers = [
    layer_names[int(layer_id) - 1]
    for layer_id in np.asarray(net.getUnconnectedOutLayers()).flatten()
]

In [4]:
# Function to compare joint points
def compare_joint_points(user_points, professional_points):
    """Return the Euclidean distance between each pair of corresponding joints.

    Joints are paired positionally with ``zip``, so pairing stops at the end
    of the shorter sequence.

    Args:
        user_points: Sequence of coordinate tuples/arrays for the user.
        professional_points: Sequence of coordinate tuples/arrays for the
            reference performer.

    Returns:
        list: One distance per paired joint, in input order.
    """
    return [
        np.linalg.norm(np.asarray(mine) - np.asarray(reference))
        for mine, reference in zip(user_points, professional_points)
    ]

# MediaPipe handles used by the capture loop:
#   mp_pose    - the pose solution module (also provides POSE_CONNECTIONS)
#   mp_drawing - helpers for drawing landmarks onto an image
#   pose       - a Pose estimator created with the library's default settings
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
pose = mp_pose.Pose()

I0000 00:00:1718565997.649749 4016892 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M2


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1718565997.799171 4017120 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1718565997.805977 4017117 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Main driver Code

In [46]:
def _best_person_box(outs, width, height, person_class_id=0):
    """Pick the most confident person detection and return its clipped box.

    Args:
        outs: Iterable of YOLO output arrays. Each row is read as
            (center_x, center_y, box_w, box_h, <index 4, not used here>,
            per-class scores...), with the first four values expressed as
            fractions of the frame size.
        width: Frame width in pixels.
        height: Frame height in pixels.
        person_class_id: Class index to keep (0 is the person class).

    Returns:
        ``(x, y, w, h)`` as Python ints clipped to the frame, or ``None`` when
        no row has ``person_class_id`` as its top class with a positive score.
        ``w`` or ``h`` may be 0 if the detection lies outside the frame.
    """
    best_confidence = 0
    best_box = None
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if class_id == person_class_id and confidence > best_confidence:
                best_confidence = confidence

                box_w = int(detection[2] * width)
                box_h = int(detection[3] * height)
                left = int(int(detection[0] * width) - box_w / 2)
                top = int(int(detection[1] * height) - box_h / 2)

                # Clip both corners to the frame, then derive the size from
                # the clipped corners. Clamping only the top-left corner would
                # shift/enlarge boxes that overhang the left or top edge.
                x1 = min(max(left, 0), width)
                y1 = min(max(top, 0), height)
                x2 = min(max(left + box_w, 0), width)
                y2 = min(max(top + box_h, 0), height)
                best_box = (x1, y1, max(x2 - x1, 0), max(y2 - y1, 0))
    return best_box


YOLO_SCALE = 0.00392          # pixel scale factor passed to blobFromImage (~1/255)
YOLO_INPUT_SIZE = (416, 416)  # spatial size of the blob fed to the network
ESC_KEY = 27                  # key code that stops the capture loop

cap = cv2.VideoCapture(0)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

all_joint_points = []
best_box_dimensions = []

try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        height, width, channels = frame.shape
        blob = cv2.dnn.blobFromImage(
            frame, YOLO_SCALE, YOLO_INPUT_SIZE, (0, 0, 0), swapRB=True, crop=False
        )
        net.setInput(blob)
        outs = net.forward(output_layers)

        # Bounding box of the person detected with the highest confidence
        best_box = _best_person_box(outs, width, height)

        if best_box is not None:
            x, y, w, h = best_box

            # View into the frame restricted to the bounding box
            roi = frame[y:y + h, x:x + w]

            # An empty ROI is not processed or recorded, but the frame is
            # still displayed below so the preview and the Esc key keep working.
            if roi.size > 0:
                # Run MediaPipe pose estimation on the cropped region
                rgb_frame = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
                results = pose.process(rgb_frame)

                # Landmarks are relative to the ROI; map them back to
                # full-frame pixel coordinates.
                joint_points = []
                if results.pose_landmarks:
                    joint_points = [
                        (landmark.x * w + x, landmark.y * h + y)
                        for landmark in results.pose_landmarks.landmark
                    ]

                # Points are stored un-normalized, together with the box
                # dimensions, so normalization within the box can be done later.
                all_joint_points.append(joint_points)
                best_box_dimensions.append((x, y, w, h))

                # Draw the bounding box
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

                # Draw the pose annotation; roi is a view, so this draws on frame
                if results.pose_landmarks:
                    mp_drawing.draw_landmarks(roi, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        # Optionally display the image
        cv2.imshow('Pose Detection', frame)
        if cv2.waitKey(5) & 0xFF == ESC_KEY:  # Press 'Esc' to exit
            break
finally:
    # Release the camera and close windows even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

# Convert to NumPy arrays.
# Frames without landmarks contribute an empty list, so the per-frame lists
# can differ in length. Filling an object array element by element keeps the
# result a 1D array of lists regardless of whether the lengths happen to match.
joint_points_per_frame = np.empty(len(all_joint_points), dtype=object)
for frame_idx, frame_points in enumerate(all_joint_points):
    joint_points_per_frame[frame_idx] = frame_points
all_joint_points = joint_points_per_frame

# reshape keeps the (N, 4) layout even when no box was recorded
best_box_dimensions = np.array(best_box_dimensions, dtype=int).reshape(-1, 4)

print(all_joint_points.shape , best_box_dimensions.shape)



(406,) (406, 4)


In [11]:
# Re-wrap the recorded boxes as a NumPy array and display its element dtype
best_box_dimensions = np.array(best_box_dimensions)
best_box_dimensions.dtype

dtype('int64')

In [None]:
# Load professional's joint points (normalized) for comparison
# professional_joint_points = np.load('professional_joint_points.npy')

# Compare user's joint points with professional's
# differences = []
# for user_points in all_joint_points:
#     differences.append(compare_joint_points(user_points, professional_joint_points))

# # Convert differences to NumPy array for easier processing
# differences = np.array(differences)

# # Plot the results
# plt.figure(figsize=(10, 5))
# for i, joint_diff in enumerate(differences.T):
#     plt.plot(joint_diff, label=f'Joint {i+1}')
# plt.xlabel('Frame')
# plt.ylabel('Difference')
# plt.title('Joint Differences between User and Professional')
# plt.legend()
# plt.show()

In [45]:
# Inspect the (x, y) joint coordinates stored at index 8 of the recorded frames
all_joint_points[8]

[(352.6430822610855, 315.939789891243),
 (368.44150614738464, 303.62629356980324),
 (369.49876606464386, 304.2768477201462),
 (373.27290737628937, 307.6255923807621),
 (354.05673015117645, 304.817487180233),
 (351.71358346939087, 301.469540476799),
 (349.4052257537842, 298.1421529054642),
 (363.2787981033325, 322.73301842808723),
 (339.75286316871643, 291.79549875855446),
 (349.47095239162445, 323.88185772299767),
 (341.1703305244446, 313.4078561067581),
 (361.00844407081604, 360.02842622995377),
 (286.57259702682495, 276.7686038017273),
 (397.65850853919983, 363.7676848769188),
 (301.2435783147812, 227.27840754389763),
 (391.50967812538147, 347.5298194885254),
 (317.4966777563095, 255.740179002285),
 (389.32532572746277, 353.32173693180084),
 (316.1062331199646, 264.395151168108),
 (382.24276328086853, 339.88964545726776),
 (324.36191260814667, 272.4565527141094),
 (381.95830857753754, 335.1317709982395),
 (326.0780940055847, 270.24686846137047),
 (293.9131643772125, 421.3800995349884