In [1]:
import subprocess
import os
import cv2
import json



## Convert webm video to mp4

Convert Vera de Kok's video from WebM to MP4 for ease of use with OpenCV.

In [2]:
def convert_webm_to_mp4(input_path, output_path=None):
    """
    Convert a WebM video to MP4 (H.264 video, AAC audio) by running ffmpeg.

    Args:
        input_path: Path to the input .webm file (str or os.PathLike).
        output_path: Path for the output .mp4 file (optional). When omitted,
            the input path with its final extension swapped for '.mp4' is used.

    Returns:
        The output path on success. None when the conversion did not happen:
        ffmpeg is not installed, ffmpeg exited with an error, or the output
        path would be the input file itself.
    """
    if output_path is None:
        # Swap only the real extension. str.replace would also rewrite '.webm'
        # inside a directory name, and would leave a path without that
        # extension untouched (making output == input).
        output_path = os.path.splitext(os.fspath(input_path))[0] + '.mp4'

    same_file = (
        os.path.abspath(os.fspath(output_path))
        == os.path.abspath(os.fspath(input_path))
    )
    if same_file:
        # '-y' below forces overwriting, so never point ffmpeg at its own input.
        print(f"Conversion failed: output path equals input path ({input_path})")
        return None

    command = [
        'ffmpeg',
        '-i', input_path,
        '-c:v', 'libx264',  # H264 codec
        '-preset', 'medium',  # Balance speed/quality
        '-crf', '23',  # Quality (lower = better, 18-28 typical)
        '-c:a', 'aac',  # Audio codec
        '-b:a', '128k',  # Audio bitrate
        '-y',  # Overwrite output file
        output_path
    ]

    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised by subprocess when the 'ffmpeg' executable cannot be found.
        print("Conversion failed: ffmpeg executable not found on PATH")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Conversion failed: {e}")
        return None

    print(f"Conversion successful: {output_path}")
    return output_path



In [None]:
# only need to do this once!
# convert_webm_to_mp4('../assets/NGT_handalfabet.webm', '../assets/NGT_handalfabet.mp4')

## Make timestamp markers

Divide the video by timestamp markers to make playback easier

In [None]:
import cv2
import json

def create_timestamp_markers(video_path, output_json='letter_timestamps.json'):
    """
    Interactive tool to mark timestamp boundaries for each letter A-Z.
    Uses live preview with looping playback.

    Controls:
    - SPACE: Replay current segment
    - H/L: Adjust start time -/+ 0.05s (vim left/right)
    - J/K: Adjust end time -/+ 0.05s (vim down/up)
    - Shift+H/L: Adjust start time -/+ 0.5s (coarse)
    - Shift+J/K: Adjust end time -/+ 0.5s (coarse)
    - Ctrl+N: Accept and move to next letter
    - Ctrl+P: Go back to previous letter
    - Q: Quit and save

    Args:
        video_path: Path of the video to annotate.
        output_json: File the {letter: {"start": s, "end": e}} mapping is
            written to (times in seconds).

    Returns:
        Dict mapping each accepted letter to its {"start", "end"} times.

    Raises:
        ValueError: If the video cannot be opened or reports a non-positive
            FPS / frame count (nothing is written in that case).
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if not cap.isOpened() or fps <= 0 or total_frames <= 0:
        # Without a valid FPS every time<->frame conversion below divides by zero.
        cap.release()
        raise ValueError(f"Could not open video or read its FPS/frame count: {video_path}")
    duration = total_frames / fps
    # waitKey(0) means "wait forever", so never let the per-frame delay reach 0.
    frame_delay_ms = max(1, int(1000 / fps))

    letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    timestamps = {}  # Will store {letter: {"start": x, "end": y}}

    # key code -> (which boundary to move, signed step in seconds)
    key_adjustments = {
        ord('h'): ('start', -0.05),
        ord('l'): ('start', 0.05),
        ord('j'): ('end', -0.05),
        ord('k'): ('end', 0.05),
        ord('H'): ('start', -0.5),
        ord('L'): ('start', 0.5),
        ord('J'): ('end', -0.5),
        ord('K'): ('end', 0.5),
    }

    current_letter_idx = 0

    # Initialize first letter with reasonable defaults
    current_start = 0.0
    current_end = min(2.0, duration)

    print(f"Video: {duration:.1f}s, {fps:.1f} FPS")
    print("\nControls:")
    print("  SPACE: Replay segment")
    print("  H/L: Start time -/+ 0.05s")
    print("  J/K: End time -/+ 0.05s")
    print("  Shift+H/L: Start time -/+ 0.5s")
    print("  Shift+J/K: End time -/+ 0.5s")
    print("  Ctrl+N: Next letter")
    print("  Ctrl+P: Previous letter")
    print("  Q: Quit and save")
    print(f"\n{'='*60}")

    stopped_early = False  # set on Q or when the video cannot be read

    try:
        cv2.namedWindow('Mark Timestamps')

        while current_letter_idx < len(letters) and not stopped_early:
            letter = letters[current_letter_idx]

            # If returning to a previously marked letter, load its values
            if letter in timestamps:
                current_start = timestamps[letter]["start"]
                current_end = timestamps[letter]["end"]

            print(f"\nMarking letter: {letter} ({current_letter_idx + 1}/26)")
            print(f"  Start: {current_start:.2f}s | End: {current_end:.2f}s")

            adjusting = True

            while adjusting:
                # Play segment in loop
                start_frame = int(current_start * fps)
                end_frame = int(current_end * fps)

                # Ensure valid range (at least one frame, inside the video)
                start_frame = max(0, min(start_frame, total_frames - 1))
                end_frame = max(start_frame + 1, min(end_frame, total_frames))

                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                current_frame = start_frame

                while current_frame < end_frame and adjusting:
                    ret, frame = cap.read()
                    if not ret:
                        if current_frame == start_frame:
                            # Even the first frame of the segment is unreadable;
                            # rewinding again would spin forever with no key
                            # handling, so stop and keep what was marked.
                            print(f"  Could not read frame {start_frame}; stopping")
                            stopped_early = True
                            adjusting = False
                            break
                        current_frame = start_frame
                        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                        continue

                    # Display info
                    display_frame = frame.copy()
                    current_time = current_frame / fps

                    cv2.putText(display_frame, f"Letter: {letter} ({current_letter_idx + 1}/26)",
                               (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    cv2.putText(display_frame, f"Start: {current_start:.2f}s | End: {current_end:.2f}s | Dur: {current_end - current_start:.2f}s",
                               (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
                    cv2.putText(display_frame, f"Current: {current_time:.2f}s",
                               (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (200, 200, 200), 1)

                    # Show marked letters
                    marked_count = len(timestamps)
                    cv2.putText(display_frame, f"Marked: {marked_count}/26",
                               (10, display_frame.shape[0] - 10),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 1)

                    cv2.imshow('Mark Timestamps', display_frame)

                    # Check for key press
                    key = cv2.waitKey(frame_delay_ms) & 0xFF

                    if key == ord(' '):
                        # Replay: the capture position must be rewound too,
                        # otherwise only the counter restarts while the
                        # displayed frames keep advancing.
                        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                        current_frame = start_frame
                        continue

                    if key in key_adjustments:
                        boundary, step = key_adjustments[key]
                        if boundary == 'start':
                            moved = round(current_start + step, 2)
                            # Start stays >= 0 and at least 0.1s before the end
                            if step < 0:
                                current_start = max(0, moved)
                            else:
                                current_start = min(current_end - 0.1, moved)
                            print(f"  Start: {current_start:.2f}s")
                        else:
                            moved = round(current_end + step, 2)
                            # End stays at least 0.1s after start, within the video
                            if step < 0:
                                current_end = max(current_start + 0.1, moved)
                            else:
                                current_end = min(duration, moved)
                            print(f"  End: {current_end:.2f}s")
                        # Leave the playback loop so the segment restarts
                        # with the new boundaries.
                        break

                    if key == 14:  # Ctrl+N
                        # Save and move to next
                        timestamps[letter] = {"start": current_start, "end": current_end}
                        print(f"  Saved {letter}: {current_start:.2f}s - {current_end:.2f}s")
                        current_letter_idx += 1
                        # Set next letter to start where this one ended
                        current_start = current_end
                        current_end = min(current_start + 2.0, duration)
                        adjusting = False
                        break

                    if key == 16 and current_letter_idx > 0:  # Ctrl+P
                        # Go to previous letter (ignored on the first letter)
                        current_letter_idx -= 1
                        adjusting = False
                        break

                    if key in (ord('q'), ord('Q')):
                        # Quit; saving happens in the finally block below
                        stopped_early = True
                        adjusting = False
                        break

                    current_frame += 1

                    # Loop back to start
                    if current_frame >= end_frame:
                        current_frame = start_frame
                        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
    finally:
        # Always free the capture/window and persist whatever was marked,
        # even if the session is interrupted by an exception.
        cap.release()
        cv2.destroyAllWindows()
        with open(output_json, 'w') as f:
            json.dump(timestamps, f, indent=2)

    if stopped_early:
        print(f"\nSaved {len(timestamps)} timestamps to {output_json}")
    else:
        print(f"\nSaved all 26 timestamps to {output_json}")
    return timestamps



In [4]:
# only need to run this once or to adjust markers
# timestamps = create_timestamp_markers('../assets/NGT_handalfabet.mp4', '../assets/letter_timestamps.json')

## Proof of Concept Video Players

### Simple Hint Player

In [20]:
# Asset locations shared by the player and extractor cells of this notebook
timestamps_path = '../assets/letter_timestamps.json'  # per-letter start/end times, read with json.load
video_path = '../assets/NGT_handalfabet.mp4'  # reference video opened with cv2.VideoCapture
landmarks_pkl = '../assets/video_landmarks.pkl'  # pickled landmark sequences (written by the extractor)

In [14]:
import cv2
import json

class VideoHintPlayer:
    """
    Plays video segments as hints for finger spelling letters.

    The segment boundaries come from a JSON file mapping each letter to
    {"start": seconds, "end": seconds}.
    """
    def __init__(self, video_path, timestamps_path='letter_timestamps.json'):
        """
        Args:
            video_path: Path of the video containing all letters.
            timestamps_path: JSON file with the per-letter start/end times.
        """
        self.video_path = video_path

        # Load timestamps
        with open(timestamps_path, 'r') as f:
            self.timestamps = json.load(f)

        print(f"Loaded {len(self.timestamps)} letter hints")

    def play_hint(self, letter, window_name='Hint'):
        """
        Play video segment for specified letter.

        Args:
            letter: Letter to show (A-Z, case-insensitive)
            window_name: OpenCV window name

        Returns:
            True if at least one frame was shown. False if the letter has no
            timestamp, the video could not be opened, or the segment yielded
            no frames.
        """
        letter = letter.upper()

        if letter not in self.timestamps:
            print(f"No timestamp for letter: {letter}")
            return False

        cap = cv2.VideoCapture(self.video_path)
        frames_shown = 0
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not cap.isOpened() or fps <= 0:
                # A zero FPS would make the frame-delay computation divide by zero.
                print(f"Could not open video: {self.video_path}")
                return False

            # Convert the segment's start/end times to frame indices
            segment = self.timestamps[letter]
            start_frame = int(segment["start"] * fps)
            end_frame = int(segment["end"] * fps)
            # waitKey(0) blocks until a key press, so keep the delay >= 1 ms
            frame_delay_ms = max(1, int(1000 / fps))

            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            for _ in range(start_frame, end_frame):
                ret, frame = cap.read()
                if not ret:
                    break

                # Add letter label
                cv2.putText(frame, f"Letter: {letter}",
                           (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                cv2.imshow(window_name, frame)
                frames_shown += 1

                # Wait appropriate time for playback speed
                # Allow 'q' to skip hint early
                if cv2.waitKey(frame_delay_ms) & 0xFF == ord('q'):
                    break
        finally:
            # Release the capture even if reading/drawing raised.
            cap.release()
            # Only destroy a window that imshow actually created; destroying
            # a non-existent window is an error on some HighGUI backends.
            if frames_shown:
                cv2.destroyWindow(window_name)

        return frames_shown > 0

In [15]:
# Smoke-test the plain video hint player on a handful of letters
hint_player = VideoHintPlayer(video_path=video_path, timestamps_path=timestamps_path)
for letter in ('T', 'A', 'C', 'O'):
    print(f"Playing hint for {letter}...")
    hint_player.play_hint(letter)

Loaded 26 letter hints
Playing hint for T...
Playing hint for A...
Playing hint for C...
Playing hint for O...


### Video Player with Landmarks

Add skeleton landmarks for playback

In [19]:
import cv2
import json
import mediapipe as mp

class AnnotatedVideoHintPlayer:
    """
    Plays video segments with hand landmark annotations to help users
    understand finger positioning.
    """
    def __init__(self, video_path, timestamps_path='letter_timestamps.json'):
        """
        Args:
            video_path: Path of the video containing all letters.
            timestamps_path: JSON file mapping each letter to
                {"start": seconds, "end": seconds}.
        """
        self.video_path = video_path

        # Load timestamps
        with open(timestamps_path, 'r') as f:
            self.timestamps = json.load(f)

        # Initialize MediaPipe
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.5
        )

        print(f"Loaded {len(self.timestamps)} letter hints")

    def play_hint(self, letter, window_name='Hint - Watch Hand Position', show_landmarks=True):
        """
        Play video segment with optional hand landmark overlay.

        Args:
            letter: Letter to show (A-Z, case-insensitive)
            window_name: OpenCV window name
            show_landmarks: Whether to overlay hand skeleton

        Returns:
            True if at least one frame was shown. False if the letter has no
            timestamp, the video could not be opened, or the segment yielded
            no frames.
        """
        letter = letter.upper()

        if letter not in self.timestamps:
            print(f"No timestamp for letter: {letter}")
            return False

        cap = cv2.VideoCapture(self.video_path)
        frames_shown = 0
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not cap.isOpened() or fps <= 0:
                # A zero FPS would make the frame-delay computation divide by zero.
                print(f"Could not open video: {self.video_path}")
                return False

            # Convert the segment's start/end times to frame indices
            segment = self.timestamps[letter]
            start_frame = int(segment["start"] * fps)
            end_frame = int(segment["end"] * fps)
            # waitKey(0) blocks until a key press, so keep the delay >= 1 ms
            frame_delay_ms = max(1, int(1000 / fps))

            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            for _ in range(start_frame, end_frame):
                ret, frame = cap.read()
                if not ret:
                    break

                # Draw on a copy so detection always sees the clean frame
                display_frame = frame.copy()

                # Add hand landmarks if requested
                if show_landmarks:
                    # Convert to RGB for MediaPipe
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = self.hands.process(frame_rgb)

                    # multi_hand_landmarks is falsy when no hand was detected
                    for hand_landmarks in results.multi_hand_landmarks or ():
                        self.mp_drawing.draw_landmarks(
                            display_frame,
                            hand_landmarks,
                            self.mp_hands.HAND_CONNECTIONS,
                            self.mp_drawing_styles.get_default_hand_landmarks_style(),
                            self.mp_drawing_styles.get_default_hand_connections_style()
                        )

                # Add helpful text
                cv2.putText(display_frame, f"Letter: {letter}",
                           (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.putText(display_frame, "Watch the hand position",
                           (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

                cv2.imshow(window_name, display_frame)
                frames_shown += 1

                # Allow early exit
                if cv2.waitKey(frame_delay_ms) & 0xFF == ord('q'):
                    break
        finally:
            # Release the capture even if detection/drawing raised.
            cap.release()
            # Only destroy a window that imshow actually created; destroying
            # a non-existent window is an error on some HighGUI backends.
            if frames_shown:
                cv2.destroyWindow(window_name)

        return frames_shown > 0

    def __del__(self):
        # __init__ may have failed before self.hands was assigned
        if hasattr(self, 'hands'):
            self.hands.close()



In [18]:
# Smoke-test the landmark-annotated hint player on a handful of letters
hint_player = AnnotatedVideoHintPlayer(video_path=video_path, timestamps_path=timestamps_path)
for letter in ('T', 'A', 'C', 'O'):
    print(f"Playing hint for {letter}...")
    hint_player.play_hint(letter)

I0000 00:00:1768937450.188827       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Loaded 26 letter hints
Playing hint for T...
Playing hint for A...
Playing hint for C...
Playing hint for O...


### Extract Landmarks and Playback Skeleton

Extract the landmarks from the video, then play back just the skeleton (or use the landmarks with a hand model).

In [21]:
import cv2
import json
import mediapipe as mp
import pickle

class VideoLandmarkExtractor:
    """
    Extract hand landmarks from reference video segments.
    Saves them for later use in animated hints.
    """
    def __init__(self, video_path, timestamps_path='letter_timestamps.json'):
        """
        Args:
            video_path: Path of the video containing all letters.
            timestamps_path: JSON file mapping each letter to
                {"start": seconds, "end": seconds}.
        """
        self.video_path = video_path

        # Load timestamps
        with open(timestamps_path, 'r') as f:
            self.timestamps = json.load(f)

        # Initialize MediaPipe
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

        print(f"Loaded {len(self.timestamps)} letter segments")

    def extract_landmarks_for_letter(self, letter):
        """
        Extract landmarks from video segment for given letter.

        Frames without a detected hand repeat the previous frame's landmarks;
        such frames at the very start of the segment (before any detection)
        are dropped.

        Args:
            letter: Letter to extract (A-Z, case-insensitive)

        Returns:
            List of landmark frames, each frame being a list of [x, y, z]
            coordinates (one per landmark reported by MediaPipe). Empty when
            the video cannot be opened or no hand was detected. None when the
            letter has no timestamp.
        """
        letter = letter.upper()

        if letter not in self.timestamps:
            print(f"No timestamp for letter: {letter}")
            return None

        cap = cv2.VideoCapture(self.video_path)
        landmarks_sequence = []
        try:
            if not cap.isOpened():
                print(f"Could not open video: {self.video_path}")
                return landmarks_sequence

            fps = cap.get(cv2.CAP_PROP_FPS)

            # Convert the segment's start/end times to frame indices
            segment = self.timestamps[letter]
            start_frame = int(segment["start"] * fps)
            end_frame = int(segment["end"] * fps)

            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            print(f"Extracting landmarks for {letter}...")

            for _ in range(start_frame, end_frame):
                ret, frame = cap.read()
                if not ret:
                    break

                # Process with MediaPipe (expects RGB, OpenCV delivers BGR)
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = self.hands.process(frame_rgb)

                if results.multi_hand_landmarks:
                    # Only the first detected hand is kept
                    hand_landmarks = results.multi_hand_landmarks[0]
                    landmarks_sequence.append(
                        [[point.x, point.y, point.z] for point in hand_landmarks.landmark]
                    )
                elif landmarks_sequence:
                    # No hand detected: hold the previous pose. Copy it so the
                    # padded frame does not alias the previous frame's lists.
                    landmarks_sequence.append(
                        [list(point) for point in landmarks_sequence[-1]]
                    )
                # else: nothing detected yet, so there is no pose to hold
        finally:
            # Release the capture even if MediaPipe/OpenCV raised mid-segment.
            cap.release()

        print(f"  Extracted {len(landmarks_sequence)} frames")

        return landmarks_sequence

    def extract_all_letters(self, output_file='video_landmarks.pkl'):
        """
        Extract landmarks for all letters and save to file.

        Letters whose extraction yields no frames are left out of the result.

        Args:
            output_file: Path to save extracted landmarks (pickle format)

        Returns:
            Dictionary mapping letters to landmark sequences
        """
        all_landmarks = {}

        for letter in sorted(self.timestamps.keys()):
            landmarks = self.extract_landmarks_for_letter(letter)
            if landmarks:
                all_landmarks[letter] = landmarks

        # Save to file
        with open(output_file, 'wb') as f:
            pickle.dump(all_landmarks, f)

        print(f"\nSaved all landmarks to {output_file}")
        print(f"Total letters: {len(all_landmarks)}")

        return all_landmarks

    def __del__(self):
        # __init__ may have failed before self.hands was assigned
        if hasattr(self, 'hands'):
            self.hands.close()

# Extract landmarks from video
# extractor = VideoLandmarkExtractor(video_path, timestamps_path)
# video_landmarks = extractor.extract_all_letters(landmarks_pkl)

In [23]:
import cv2
import numpy as np
import pickle
import mediapipe as mp

class AnimatedSkeletonHintPlayer:
    """
    Plays animated hand skeleton from extracted video landmarks.
    Shows smooth, looping animation of the correct hand pose.
    """
    def __init__(self, landmarks_file='video_landmarks.pkl'):
        """
        Args:
            landmarks_file: Pickle file mapping each letter to a list of
                frames, each frame a list of [x, y, z] landmark coordinates.
        """
        # Load extracted landmarks.
        # NOTE: pickle.load can execute arbitrary code - only load files
        # produced by a trusted source (e.g. this notebook's extractor).
        with open(landmarks_file, 'rb') as f:
            self.landmarks = pickle.load(f)

        # MediaPipe for drawing
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles

        print(f"Loaded landmarks for {len(self.landmarks)} letters")

    def play_animated_hint(self, letter, window_name='Animated Hint',
                          image_size=(640, 480), loop_count=3, playback_speed=1.0,
                          base_fps=30):
        """
        Play animated skeleton for given letter.

        Args:
            letter: Letter to show (A-Z, case-insensitive)
            window_name: OpenCV window name
            image_size: Size of display window as (width, height)
            loop_count: Number of times to loop animation
            playback_speed: Speed multiplier (1.0 = normal, 2.0 = double speed)
            base_fps: Frame rate the landmarks are assumed to have been
                recorded at (approximate original video FPS)

        Returns:
            True if at least one frame was shown (including when the user
            quits early with 'q'). False if the letter has no landmarks or
            nothing was displayed (empty sequence or loop_count < 1).

        Raises:
            ValueError: If playback_speed or base_fps is not positive.
        """
        letter = letter.upper()

        if letter not in self.landmarks:
            print(f"No landmarks for letter: {letter}")
            return False

        if playback_speed <= 0 or base_fps <= 0:
            raise ValueError("playback_speed and base_fps must be positive")

        # Imported once per call instead of once per rendered frame.
        from mediapipe.framework.formats import landmark_pb2

        # Calculate frame delay; waitKey(0) would block until a key press,
        # so very high speeds are clamped to a 1 ms delay.
        frame_delay = max(1, int((1000 / base_fps) / playback_speed))
        width, height = image_size

        # Convert every frame to MediaPipe format once, not once per loop.
        frame_protos = []
        for frame_landmarks in self.landmarks[letter]:
            proto = landmark_pb2.NormalizedLandmarkList()
            for x, y, z in frame_landmarks:
                point = proto.landmark.add()
                point.x = x
                point.y = y
                point.z = z
            frame_protos.append(proto)

        landmark_style = self.mp_drawing_styles.get_default_hand_landmarks_style()
        connection_style = self.mp_drawing_styles.get_default_hand_connections_style()

        frames_shown = 0
        try:
            for loop in range(loop_count):
                for proto in frame_protos:
                    # Create blank canvas (rows = height, cols = width)
                    canvas = np.zeros((height, width, 3), dtype=np.uint8)

                    # Draw on canvas
                    self.mp_drawing.draw_landmarks(
                        canvas,
                        proto,
                        self.mp_hands.HAND_CONNECTIONS,
                        landmark_style,
                        connection_style
                    )

                    # Add text
                    cv2.putText(canvas, f"Letter: {letter}",
                               (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    cv2.putText(canvas, "Watch hand movement",
                               (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
                    cv2.putText(canvas, f"Loop {loop + 1}/{loop_count}",
                               (10, height - 10),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (200, 200, 200), 1)

                    cv2.imshow(window_name, canvas)
                    frames_shown += 1

                    # Allow early exit
                    if cv2.waitKey(frame_delay) & 0xFF == ord('q'):
                        return True
        finally:
            # Only destroy a window that imshow actually created; destroying
            # a non-existent window is an error on some HighGUI backends.
            if frames_shown:
                cv2.destroyWindow(window_name)

        return frames_shown > 0

# # Initialize
# animated_hint_player = AnimatedSkeletonHintPlayer()

In [27]:
# Step 1: Extract landmarks from the reference video and pickle them.
# The result is saved to landmarks_pkl, so this only needs to run once.
extractor = VideoLandmarkExtractor(video_path, timestamps_path)
video_landmarks = extractor.extract_all_letters(output_file=landmarks_pkl)



Loaded 26 letter segments
Extracting landmarks for A...


I0000 00:00:1768937698.100111       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2


  Extracted 23 frames
Extracting landmarks for B...
  Extracted 36 frames
Extracting landmarks for C...
  Extracted 33 frames
Extracting landmarks for D...
  Extracted 40 frames
Extracting landmarks for E...
  Extracted 31 frames
Extracting landmarks for F...
  Extracted 20 frames
Extracting landmarks for G...
  Extracted 36 frames
Extracting landmarks for H...
  Extracted 38 frames
Extracting landmarks for I...
  Extracted 36 frames
Extracting landmarks for J...
  Extracted 28 frames
Extracting landmarks for K...
  Extracted 67 frames
Extracting landmarks for L...
  Extracted 31 frames
Extracting landmarks for M...
  Extracted 28 frames
Extracting landmarks for N...
  Extracted 29 frames
Extracting landmarks for O...
  Extracted 40 frames
Extracting landmarks for P...
  Extracted 75 frames
Extracting landmarks for Q...
  Extracted 84 frames
Extracting landmarks for R...
  Extracted 74 frames
Extracting landmarks for S...
  Extracted 42 frames
Extracting landmarks for T...
  Extracted 

In [28]:
# Step 2: Load the pickled landmarks into the skeleton-only player
animated_hint_player = AnimatedSkeletonHintPlayer(landmarks_file=landmarks_pkl)



Loaded landmarks for 26 letters


In [29]:
# Smoke-test the animated skeleton player (one loop per letter)
for letter in ('T', 'A', 'C', 'O', 'S'):
    print(f"Playing animated hint for {letter}...")
    animated_hint_player.play_animated_hint(letter, loop_count=1)

Playing animated hint for T...
Playing animated hint for A...
Playing animated hint for C...
Playing animated hint for O...
Playing animated hint for S...


## TESTING