In [10]:
import os
import cv2
import mediapipe as mp
from tqdm import tqdm

In [11]:
# Paths
input_dir = "../augmented_images"
output_dir = "../hand_images"

In [12]:
# Labels where both hands must be present
labels_with_both_hands = list("ABDEFGHJKMNPQRSTWXYZ")

In [13]:
# Mediapipe setup
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils









W0000 00:00:1738012479.575562  200014 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738012479.632448  200022 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [14]:
# Padding factor (adjust this to add more/less margin around the hand)
PADDING = 0.1  # 10% margin

In [15]:
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

In [16]:
# Function to process and crop images
def process_images(input_dir, output_dir):
    for label in os.listdir(input_dir):
        label_path = os.path.join(input_dir, label)
        if not os.path.isdir(label_path):
            continue

        output_label_path = os.path.join(output_dir, label)
        os.makedirs(output_label_path, exist_ok=True)

        for img_name in tqdm(os.listdir(label_path), desc=f"Processing label {label}"):
            img_path = os.path.join(label_path, img_name)
            img = cv2.imread(img_path)

            if img is None:
                continue

            # Convert the image to RGB
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Detect hand landmarks
            result = hands.process(img_rgb)

            if result.multi_hand_landmarks:
                # Check for the required number of hands
                num_hands = len(result.multi_hand_landmarks)
                if label in labels_with_both_hands and num_hands < 2:
                    continue

                h, w, _ = img.shape

                # Initialize variables to store the overall bounding box for all hands
                x_min, y_min = w, h
                x_max, y_max = 0, 0

                # Update the bounding box to include all hands
                for hand_landmarks in result.multi_hand_landmarks:
                    x_min = min(x_min, int(min([lm.x for lm in hand_landmarks.landmark]) * w))
                    x_max = max(x_max, int(max([lm.x for lm in hand_landmarks.landmark]) * w))
                    y_min = min(y_min, int(min([lm.y for lm in hand_landmarks.landmark]) * h))
                    y_max = max(y_max, int(max([lm.y for lm in hand_landmarks.landmark]) * h))

                # Add padding to the bounding box
                x_min = max(0, int(x_min - PADDING * (x_max - x_min)))
                x_max = min(w, int(x_max + PADDING * (x_max - x_min)))
                y_min = max(0, int(y_min - PADDING * (y_max - y_min)))
                y_max = min(h, int(y_max + PADDING * (y_max - y_min)))

                # Crop the region containing both hands
                cropped_img = img[y_min:y_max, x_min:x_max]

                # Save the cropped image
                output_path = os.path.join(output_label_path, img_name)
                cv2.imwrite(output_path, cropped_img)
            else:
                continue

In [17]:
# Run the process
process_images(input_dir, output_dir)

Processing label V: 100%|██████████| 5000/5000 [02:09<00:00, 38.57it/s]
Processing label S: 100%|██████████| 5000/5000 [02:35<00:00, 32.09it/s]
Processing label J: 100%|██████████| 5000/5000 [02:50<00:00, 29.28it/s]
Processing label W: 100%|██████████| 5000/5000 [02:31<00:00, 32.97it/s]
Processing label 6: 100%|██████████| 5000/5000 [02:08<00:00, 38.82it/s]
Processing label 9: 100%|██████████| 5000/5000 [02:08<00:00, 38.80it/s]
Processing label M: 100%|██████████| 5000/5000 [02:39<00:00, 31.26it/s]
Processing label Q: 100%|██████████| 5000/5000 [02:52<00:00, 28.99it/s]
Processing label K: 100%|██████████| 5000/5000 [02:51<00:00, 29.15it/s]
Processing label E: 100%|██████████| 5000/5000 [02:52<00:00, 29.06it/s]
Processing label R: 100%|██████████| 5000/5000 [02:36<00:00, 32.02it/s]
Processing label X: 100%|██████████| 5000/5000 [02:50<00:00, 29.33it/s]
Processing label D: 100%|██████████| 5000/5000 [02:50<00:00, 29.33it/s]
Processing label F: 100%|██████████| 5000/5000 [02:41<00:00, 31.

In [9]:
# Release Mediapipe resources
hands.close()