In [5]:
import cv2 as cv
import mediapipe as mp
import csv
import copy
import itertools
from collections import deque
from pathlib import Path




In [None]:
def calc_landmark_list(image, landmarks):
    """Convert normalized hand landmarks into pixel coordinates.

    Each landmark's ``x``/``y`` is scaled by the image width/height,
    truncated to an int and capped at the last valid column/row index.
    No lower cap is applied, so negative values pass through unchanged.

    Args:
        image: Object whose ``shape`` starts with ``(height, width)``.
        landmarks: Object exposing a ``landmark`` iterable whose items have
            ``x`` and ``y`` attributes.

    Returns:
        A list of ``[x, y]`` integer pairs, one per landmark, in input order.
    """
    height, width = image.shape[0], image.shape[1]
    last_col, last_row = width - 1, height - 1
    return [
        [min(int(lm.x * width), last_col), min(int(lm.y * height), last_row)]
        for lm in landmarks.landmark
    ]


def pre_process_landmark(landmark_list):
    """Normalize landmarks to a flat list of base-relative coordinates.

    The first landmark is used as the origin: its x/y are subtracted from
    every point, the points are flattened into one list, and every value is
    divided by the largest absolute value so the result lies in [-1, 1].

    Args:
        landmark_list: Non-empty list of ``[x, y, ...]`` lists. Any elements
            after the first two are carried over without translation. The
            input is not modified.

    Returns:
        A flat list of floats. If every value is zero after translation
        (e.g. all landmarks coincide), a list of ``0.0`` is returned instead
        of dividing by zero.

    Raises:
        IndexError: If ``landmark_list`` is empty.
    """
    base_x, base_y = landmark_list[0][0], landmark_list[0][1]

    # Translate and flatten in one pass; building a new list leaves the
    # caller's data untouched without needing a deep copy.
    relative = []
    for point in landmark_list:
        relative.append(point[0] - base_x)
        relative.append(point[1] - base_y)
        relative.extend(point[2:])

    max_value = max(map(abs, relative))
    if max_value == 0:
        # Degenerate hand (all points on the base point): nothing to scale.
        return [0.0] * len(relative)

    return [value / max_value for value in relative]


def draw_remaining_letters(image, collection_counts, target):
    """Render the letters still below ``target`` in a panel at the bottom.

    A filled black rectangle is drawn across the lower part of the frame and
    a single text line lists every letter of ``DYNAMIC_LETTERS`` whose count
    is below ``target``, each followed by its current count in parentheses.

    Args:
        image: Frame to draw on (modified in place by the cv calls).
        collection_counts: Mapping from letter to number of samples so far.
        target: Per-letter sample goal.

    Returns:
        The same ``image`` object.
    """
    frame_height, frame_width = image.shape[0], image.shape[1]
    panel_top = frame_height - 100
    cv.rectangle(image, (10, panel_top), (frame_width - 10, frame_height - 10), (0, 0, 0), -1)

    pending = "".join(
        f"{letter}({collection_counts[letter]}) "
        for letter in DYNAMIC_LETTERS
        if collection_counts[letter] < target
    )
    cv.putText(image, "Remaining: " + pending, (20, panel_top + 30),
               cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

    return image


def draw_landmarks(image, landmark_point):
    """Draw the hand skeleton as outlined line segments.

    Every segment is drawn twice: first a thick black stroke, then a thin
    white stroke on top, giving a white line with a black outline.

    Args:
        image: Frame to draw on (modified in place by ``cv.line``).
        landmark_point: Sequence of ``[x, y]`` points indexed 0-20, or an
            empty sequence, in which case nothing is drawn.

    Returns:
        The same ``image`` object.
    """
    if len(landmark_point) == 0:
        return image

    # Index pairs of connected landmarks, in drawing order.
    bones = (
        # Thumb
        (2, 3), (3, 4),
        # Index finger
        (5, 6), (6, 7), (7, 8),
        # Middle finger
        (9, 10), (10, 11), (11, 12),
        # Ring finger
        (13, 14), (14, 15), (15, 16),
        # Pinky
        (17, 18), (18, 19), (19, 20),
        # Palm
        (0, 1), (1, 2), (2, 5), (5, 9), (9, 13), (13, 17), (17, 0),
    )
    # (colour, thickness) passes: black outline first, white core second.
    strokes = (((0, 0, 0), 6), ((255, 255, 255), 2))

    for start, end in bones:
        for colour, thickness in strokes:
            cv.line(image, tuple(landmark_point[start]), tuple(landmark_point[end]),
                    colour, thickness)

    return image


def draw_collection_info(image, letter, collected, target, is_paused, buffer_size):
    """Draw the collection status panel in the top-left corner.

    With a letter selected, the panel shows the letter, captured/target
    counts, completion percentage, rolling-buffer fill level and the pause
    hint. Without a letter it shows the key instructions instead.

    The buffer capacity and the list of selectable letters are read from the
    module-level ``BUFFER_SIZE`` and ``DYNAMIC_LETTERS`` so the panel stays
    in sync with the configuration.

    Args:
        image: Frame to draw on (modified in place by the cv calls).
        letter: Currently selected letter, or a falsy value if none.
        collected: Number of samples captured for ``letter``.
        target: Per-letter sample goal; ``0`` or less displays 0%.
        is_paused: Whether collection is currently paused.
        buffer_size: Number of frames currently held in the rolling buffer.

    Returns:
        The same ``image`` object.
    """
    # OpenCV colours are BGR tuples.
    white = (255, 255, 255)
    grey = (200, 200, 200)
    green = (0, 255, 0)
    orange = (0, 165, 255)
    font = cv.FONT_HERSHEY_SIMPLEX

    cv.rectangle(image, (10, 10), (400, 180), (0, 0, 0), -1)

    if letter:
        cv.putText(image, f"Letter: {letter}", (20, 40), font, 1.0, white, 2)
        cv.putText(image, f"Captured: {collected}/{target}", (20, 75), font, 0.7, white, 2)
        pct = (collected / target * 100) if target > 0 else 0
        color = green if pct >= 100 else white
        cv.putText(image, f"{pct:.1f}%", (20, 105), font, 0.7, color, 2)

        # Buffer status: green once enough frames are buffered to emit samples.
        # The "filling" colour was (255, 165, 0), i.e. RGB orange, which shows
        # up as blue in BGR; use the BGR orange also used for the pause hint.
        buffer_text = f"Buffer: {buffer_size}/{BUFFER_SIZE}"
        buffer_color = green if buffer_size >= BUFFER_SIZE else orange
        cv.putText(image, buffer_text, (20, 135), font, 0.6, buffer_color, 1)

        # Pause status
        pause_text = "PAUSED - Press SPACE" if is_paused else "Press SPACE to pause"
        pause_color = orange if is_paused else grey
        cv.putText(image, pause_text, (20, 165), font, 0.6, pause_color, 1)
    else:
        cv.putText(image, "Press letter key", (20, 50), font, 0.8, white, 2)
        cv.putText(image, ", ".join(DYNAMIC_LETTERS), (20, 85), font, 0.8, white, 2)
        cv.putText(image, "ESC to quit", (20, 120), font, 0.7, grey, 1)

    return image


def run_collection(output_csv):
    """Run the interactive webcam loop that collects dynamic-letter samples.

    Frames are read from camera 0, mirrored, and passed to MediaPipe Hands.
    While a letter is selected and collection is not paused, each detected
    hand is normalized and pushed into a rolling buffer of ``BUFFER_SIZE``
    frames. Once the buffer is full, every new frame appends one CSV row:
    ``[label_index] + current_landmarks + delta_features``, where the deltas
    are the per-coordinate differences between the newest and the oldest
    buffered frame and ``label_index`` is the letter's offset from ``'A'``.

    Keys: a letter from ``DYNAMIC_LETTERS`` (either case) selects it and
    resumes collection, SPACE toggles pause, ESC quits. The loop also ends
    when the camera stops delivering frames or when every letter has reached
    ``TARGET_SAMPLES``.

    Args:
        output_csv: Path of the CSV file that rows are appended to.

    Returns:
        Dict mapping each letter in ``DYNAMIC_LETTERS`` to the number of
        samples written during this run.
    """
    # Track collection per letter
    collection_counts = {letter: 0 for letter in DYNAMIC_LETTERS}
    current_letter = None
    is_paused = False

    # Rolling buffer for the last BUFFER_SIZE normalized landmark sets
    landmark_buffer = deque(maxlen=BUFFER_SIZE)

    cap = cv.VideoCapture(0)
    hands = None
    try:
        # Setup camera
        cap.set(cv.CAP_PROP_FRAME_WIDTH, 960)
        cap.set(cv.CAP_PROP_FRAME_HEIGHT, 540)

        # Setup MediaPipe
        hands = mp.solutions.hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )

        print("Dynamic letter collection started.")
        print(f"Press letter keys ({', '.join(DYNAMIC_LETTERS)}) to collect.")
        print("Press SPACE to pause/resume. Press ESC to quit.")

        while True:
            ret, image = cap.read()
            if not ret:
                break

            image = cv.flip(image, 1)
            debug_image = copy.deepcopy(image)

            # Process hand detection
            image_rgb = cv.cvtColor(image, cv.COLOR_BGR2RGB)
            image_rgb.flags.writeable = False
            results = hands.process(image_rgb)
            image_rgb.flags.writeable = True

            # Handle key press. Keep only the low byte so backends that set
            # modifier bits still match; "no key" (-1) becomes 255.
            key = cv.waitKey(10) & 0xFF

            if key == 27:  # ESC
                break

            # Toggle pause with SPACE
            if key == 32:  # SPACE
                is_paused = not is_paused
                if is_paused:
                    landmark_buffer.clear()  # Clear buffer when pausing
                print(f"Collection {'paused' if is_paused else 'resumed'}")

            # Check for letter keys; accept both cases so Caps Lock / Shift
            # does not make the key press silently do nothing.
            if 65 <= key <= 90 or 97 <= key <= 122:
                letter = chr(key).upper()
                if letter in DYNAMIC_LETTERS:
                    current_letter = letter
                    is_paused = False
                    landmark_buffer.clear()  # Clear buffer when switching letters
                    print(f"Now collecting: {letter}")

            # Process and collect sample if hand detected, letter selected, and not paused.
            # NOTE(review): the buffer is not cleared when the hand is lost, so
            # deltas may span a detection gap - confirm this is intended.
            if results.multi_hand_landmarks and current_letter and not is_paused:
                for hand_landmarks in results.multi_hand_landmarks:
                    landmark_list = calc_landmark_list(debug_image, hand_landmarks)

                    # Draw skeleton
                    debug_image = draw_landmarks(debug_image, landmark_list)

                    # Normalize current landmarks and add them to the buffer
                    landmark_buffer.append(pre_process_landmark(landmark_list))

                    # Write sample if buffer is full
                    if len(landmark_buffer) >= BUFFER_SIZE:
                        # Oldest buffered frame is BUFFER_SIZE - 1 frames
                        # before the newest one.
                        old_landmarks = landmark_buffer[0]
                        current_landmarks = landmark_buffer[-1]

                        # Compute delta features
                        delta_features = [
                            curr - old
                            for curr, old in zip(current_landmarks, old_landmarks)
                        ]

                        # Prepare row: [label_index, current_landmarks, delta_features]
                        label_index = ord(current_letter) - ord('A')
                        row = [label_index] + current_landmarks + delta_features

                        # Append per row so collected data survives a crash.
                        with open(output_csv, 'a', newline='', encoding='utf-8') as f:
                            csv.writer(f).writerow(row)

                        collection_counts[current_letter] += 1

            # Check completion
            if all(collection_counts[letter] >= TARGET_SAMPLES for letter in DYNAMIC_LETTERS):
                print("\nAll dynamic letters complete!")
                break

            # Draw UI
            debug_image = draw_collection_info(
                debug_image,
                current_letter,
                collection_counts.get(current_letter, 0) if current_letter else 0,
                TARGET_SAMPLES,
                is_paused,
                len(landmark_buffer)
            )

            # Draw remaining letters list
            debug_image = draw_remaining_letters(debug_image, collection_counts, TARGET_SAMPLES)

            cv.imshow('Dynamic Hand Gesture Collection', debug_image)
    finally:
        # Always free the detector, camera and windows - exactly once, and
        # also when the loop is left through an exception or interrupt.
        if hands is not None:
            hands.close()
        cap.release()
        cv.destroyAllWindows()

    print("\nCollection Summary:")
    for letter in DYNAMIC_LETTERS:
        count = collection_counts[letter]
        print(f"{letter}: {count}/{TARGET_SAMPLES}")

    return collection_counts




In [6]:
# Configuration
# Letters to collect; drives the per-letter counters, key handling and the
# remaining-letters overlay.
DYNAMIC_LETTERS = ['H', 'J', 'U', 'X', 'Z']
# Samples per letter required before run_collection stops on its own.
TARGET_SAMPLES = 2000
# Length of the rolling landmark buffer; samples are written only once it is full.
BUFFER_SIZE = 5
# CSV file that run_collection appends rows to (path is relative to the
# working directory).
OUTPUT_CSV = Path('../data/dataset/_NEW_ngt_dynamic_keypoint.csv')

# Create the output directory up front so appending the first row cannot
# fail on a missing folder.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)


In [None]:
# Start the interactive webcam session. As the last expression of the cell,
# the returned per-letter counts are displayed as the cell output.
run_collection(OUTPUT_CSV)