# Dynamic & Static Inference Proof of Concept

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import joblib
from collections import deque
import itertools
import copy




## Configuration

In [2]:
# ============================================================
# CONFIGURATION
# ============================================================

# Locations of the joblib-serialized classifiers. The paths are relative, so
# they resolve against the kernel's current working directory.
STATIC_MODEL_PATH = '../models/ngt_static_classifier.pkl'
DYNAMIC_MODEL_PATH = '../models/ngt_dynamic_classifier.pkl'

# Length of the rolling buffer of normalized landmark frames. predict_dynamic
# refuses to predict until the buffer holds this many frames, then uses the
# difference between the newest and the oldest buffered frame as delta features.
ROLLING_WINDOW_SIZE = 5  # Number of frames for delta calculation

# Split of the fingerspelling alphabet into letters signed with a held pose
# and letters that involve movement.
# NOTE(review): neither list is referenced by the cells shown in this notebook;
# the prediction functions map a class index to a letter with
# chr(index + ord('A')) instead - confirm whether these are still needed.
STATIC_LETTERS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 
                  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
DYNAMIC_LETTERS = ['H', 'J', 'U', 'X', 'Z']

# Supervisor settings: the main loop sums the wrist's frame-to-frame travel
# over the last WRIST_MOTION_WINDOW frames and shows the dynamic model's
# prediction only while that sum exceeds WRIST_MOTION_THRESHOLD.
# The threshold is in the units of the MediaPipe landmark coordinates that the
# main loop feeds in, and it is only the starting value: the main loop
# reassigns it when k/j/K/J is pressed.
WRIST_MOTION_THRESHOLD = 0.1  # Threshold for detecting dynamic gestures
WRIST_MOTION_WINDOW = 10       # Number of frames to track wrist movement
# Expressed in percent: the prediction functions report max(probability) * 100.
MIN_CONFIDENCE_THRESHOLD = 70  # Minimum confidence to display prediction


# Load models
# joblib.load unpickles the file, which can execute arbitrary code - only
# point these paths at model files you trust.
static_model = joblib.load(STATIC_MODEL_PATH)
dynamic_model = joblib.load(DYNAMIC_MODEL_PATH)

## Functions

In [3]:
# ============================================================
# LANDMARK PROCESSING (matches collection.py)
# ============================================================

def calc_landmark_list(image, landmarks):
    """Convert a hand's landmarks into integer pixel coordinates (x, y only).

    Args:
        image: Frame the landmarks belong to. Only ``image.shape`` is read,
            as ``(height, width, ...)``.
        landmarks: Object exposing a ``landmark`` iterable whose items carry
            ``x`` and ``y`` attributes; each is scaled by the image width and
            height respectively.

    Returns:
        A list with one ``[x, y]`` pair of ints per landmark, in input order.
        Values are capped at the last valid column/row index. There is no
        lower clamp, so a landmark outside the frame on the left/top yields a
        negative coordinate.
    """
    height, width = image.shape[0], image.shape[1]
    last_col, last_row = width - 1, height - 1
    return [
        [min(int(point.x * width), last_col), min(int(point.y * height), last_row)]
        for point in landmarks.landmark
    ]


def pre_process_landmark(landmark_list):
    """Turn pixel landmarks into a flat, wrist-relative, scale-normalized vector.

    Args:
        landmark_list: Sequence of ``[x, y]`` points; the first point is used
            as the origin (the wrist in the main loop). The input is left
            untouched.

    Returns:
        A flat list ``[x0, y0, x1, y1, ...]`` of offsets from the first point,
        divided by the largest absolute offset so every value lies in
        ``[-1, 1]``. When all offsets are zero the unscaled offsets are
        returned as-is to avoid dividing by zero.
    """
    origin_x, origin_y = landmark_list[0][0], landmark_list[0][1]

    # Shift to the origin and flatten in one pass; any components beyond
    # x and y are carried through unshifted.
    flattened = []
    for point in landmark_list:
        flattened.append(point[0] - origin_x)
        flattened.append(point[1] - origin_y)
        flattened.extend(point[2:])

    # Scale by the largest magnitude so the result is size-invariant.
    peak = max(abs(value) for value in flattened)
    if peak > 0:
        return [value / peak for value in flattened]
    return flattened

In [4]:
# ============================================================
# PREDICTION FUNCTIONS
# ============================================================

def predict_static(normalized_landmarks):
    """Classify a single frame with the static model.

    Args:
        normalized_landmarks: Flat feature vector for one frame, as produced
            by ``pre_process_landmark``.

    Returns:
        ``(letter, confidence)`` where ``confidence`` is the highest class
        probability expressed in percent.
    """
    batch = [normalized_landmarks]  # the model API expects a batch of samples
    class_index = static_model.predict(batch)[0]
    class_probabilities = static_model.predict_proba(batch)[0]

    # The predicted label is treated as an offset into the alphabet (0 -> 'A').
    # NOTE(review): this presumes the model was trained on such indices - confirm.
    return chr(class_index + ord('A')), max(class_probabilities) * 100


def predict_dynamic(landmark_buffer):
    """Classify a moving sign from the newest frame plus its change over the buffer.

    Args:
        landmark_buffer: Indexable sequence of normalized landmark vectors,
            oldest first.

    Returns:
        ``(letter, confidence)`` with ``confidence`` as the highest class
        probability in percent, or ``(None, 0.0)`` while the buffer holds
        fewer than ``ROLLING_WINDOW_SIZE`` frames.
    """
    if len(landmark_buffer) < ROLLING_WINDOW_SIZE:
        return None, 0.0

    newest = landmark_buffer[-1]
    oldest = landmark_buffer[0]

    # Per-feature change between the oldest and the newest buffered frame.
    deltas = [now - before for now, before in zip(newest, oldest)]

    # Feature layout: the newest frame's values followed by their deltas.
    batch = [newest + deltas]
    class_index = dynamic_model.predict(batch)[0]
    class_probabilities = dynamic_model.predict_proba(batch)[0]

    # The predicted label is treated as an offset into the alphabet (0 -> 'A').
    # NOTE(review): this presumes the model was trained on such indices - confirm.
    return chr(class_index + ord('A')), max(class_probabilities) * 100

In [5]:
def calculate_wrist_motion(wrist_positions):
    """Return the length of the path traced by the wrist across the window.

    Args:
        wrist_positions: Sized iterable of coordinate sequences, oldest first.

    Returns:
        The sum of Euclidean distances between consecutive positions, or
        ``0.0`` when fewer than two positions are available.
    """
    if len(wrist_positions) < 2:
        return 0.0

    points = [np.array(position) for position in wrist_positions]

    # Walk consecutive pairs and accumulate the step lengths.
    path_length = 0.0
    for start, end in zip(points, points[1:]):
        path_length += np.linalg.norm(end - start)

    return path_length

In [7]:
# ============================================================
# MAIN LOOP
# ============================================================

# Key code -> signed step applied to WRIST_MOTION_THRESHOLD (vim-style keys:
# lower case for fine steps, upper case for coarse steps).
_THRESHOLD_STEPS = {
    ord('k'): 0.01,
    ord('j'): -0.01,
    ord('K'): 0.05,
    ord('J'): -0.05,
}


def _draw_debug_panel(frame, wrist_motion, motion_threshold,
                      static_letter, static_conf, dynamic_letter, dynamic_conf):
    """Draw the translucent debug panel (motion, both model outputs, key hints).

    Args:
        frame: BGR image to draw on; modified in place.
        wrist_motion: Current wrist path length over the motion window.
        motion_threshold: Threshold the supervisor compares the motion against.
        static_letter, static_conf: Static model prediction and confidence (%).
        dynamic_letter, dynamic_conf: Dynamic model prediction and confidence
            (%); ``dynamic_letter`` is None while the landmark buffer fills.
    """
    panel_x = 20
    panel_y = 230
    panel_width = 450
    panel_height = 120

    # Dark background: paint a filled rectangle on a copy, then blend the copy
    # back into the frame so the video stays faintly visible underneath.
    overlay = frame.copy()
    cv2.rectangle(overlay, (panel_x, panel_y),
                  (panel_x + panel_width, panel_y + panel_height),
                  (0, 0, 0), -1)
    cv2.addWeighted(overlay, 0.7, frame, 0.3, 0, frame)

    y_pos = panel_y + 25
    cv2.putText(frame, f"Motion: {wrist_motion:.3f} | Threshold: {motion_threshold:.3f}",
                (panel_x + 10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)
    y_pos += 30

    cv2.putText(frame, f"Static:  {static_letter} ({static_conf:.0f}%)",
                (panel_x + 10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (200, 200, 200), 1)
    y_pos += 30

    if dynamic_letter:
        cv2.putText(frame, f"Dynamic: {dynamic_letter} ({dynamic_conf:.0f}%)",
                    (panel_x + 10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 100, 255), 1)
    else:
        cv2.putText(frame, "Dynamic: waiting for buffer...",
                    (panel_x + 10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (100, 100, 100), 1)
    y_pos += 30

    # Keybinding hints
    cv2.putText(frame, "k/j: threshold +/- 0.01 | K/J: +/- 0.05",
                (panel_x + 10, y_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (150, 150, 150), 1)


def _draw_motion_bar(frame, wrist_motion, motion_threshold, is_dynamic_motion):
    """Draw the horizontal wrist-motion gauge with its threshold marker.

    Args:
        frame: BGR image to draw on; modified in place.
        wrist_motion: Current wrist path length over the motion window.
        motion_threshold: Threshold shown as a red vertical line.
        is_dynamic_motion: Whether the motion currently exceeds the threshold;
            selects the fill color.
    """
    bar_x = 20
    bar_y = frame.shape[0] - 60
    bar_width = 300
    bar_height = 30
    max_display_motion = 0.3  # motion value that fills the whole bar

    # Background
    cv2.rectangle(frame, (bar_x, bar_y),
                  (bar_x + bar_width, bar_y + bar_height),
                  (80, 80, 80), -1)

    # Fill, saturating once the motion reaches max_display_motion
    fill_ratio = min(1.0, wrist_motion / max_display_motion)
    fill_width = int(bar_width * fill_ratio)
    # Green when above threshold, gray when below
    fill_color = (0, 255, 0) if is_dynamic_motion else (100, 100, 100)
    if fill_width > 0:
        cv2.rectangle(frame, (bar_x, bar_y),
                      (bar_x + fill_width, bar_y + bar_height),
                      fill_color, -1)

    # Threshold line (red). Clamped so the marker stays on the bar even after
    # the threshold has been raised beyond the displayed range.
    threshold_ratio = min(1.0, motion_threshold / max_display_motion)
    threshold_x = bar_x + int(bar_width * threshold_ratio)
    cv2.line(frame, (threshold_x, bar_y),
             (threshold_x, bar_y + bar_height),
             (0, 0, 255), 2)

    # Border
    cv2.rectangle(frame, (bar_x, bar_y),
                  (bar_x + bar_width, bar_y + bar_height),
                  (200, 200, 200), 2)


# Setup MediaPipe
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Rolling buffers: normalized landmarks feed the dynamic model, wrist
# positions feed the motion estimate used by the supervisor.
landmark_buffer = deque(maxlen=ROLLING_WINDOW_SIZE)
wrist_buffer = deque(maxlen=WRIST_MOTION_WINDOW)

# Setup camera
cap = cv2.VideoCapture(0)

# Everything after acquiring the camera runs under try/finally so the device
# and the preview window are released even when the loop raises or the kernel
# is interrupted; otherwise the camera stays locked until the kernel restarts.
try:
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 960)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 540)

    print("\nNGT Fingerspelling Recognizer")
    print(f"Rolling window size: {ROLLING_WINDOW_SIZE} frames")
    print("Press 'q' to quit\n")

    if not cap.isOpened():
        # The loop below would simply never run; say why instead of
        # finishing silently.
        print("Could not open camera 0 - check that it is connected and not in use")

    with mp_hands.Hands(
        static_image_mode=False,
        max_num_hands=1,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.5
    ) as hands:

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Mirror the image so on-screen movement matches the user's hand
            frame = cv2.flip(frame, 1)

            # Convert to RGB for MediaPipe
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                hand_landmarks = results.multi_hand_landmarks[0]

                # Draw hand skeleton
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                )

                # Process landmarks
                landmark_list = calc_landmark_list(frame, hand_landmarks)
                normalized_landmarks = pre_process_landmark(landmark_list)

                # Track wrist position (landmark 0) using its raw x, y and z
                # landmark values rather than pixel coordinates.
                wrist_landmark = hand_landmarks.landmark[0]
                wrist_buffer.append(
                    [wrist_landmark.x, wrist_landmark.y, wrist_landmark.z]
                )
                wrist_motion = calculate_wrist_motion(wrist_buffer)

                landmark_buffer.append(normalized_landmarks)

                # Static prediction is always available; predict_dynamic
                # returns (None, 0.0) until the landmark buffer is full.
                static_letter, static_conf = predict_static(normalized_landmarks)
                dynamic_letter, dynamic_conf = predict_dynamic(landmark_buffer)

                # SUPERVISOR DECISION LOGIC
                is_dynamic_motion = wrist_motion > WRIST_MOTION_THRESHOLD

                if is_dynamic_motion and dynamic_letter is not None:
                    # Use dynamic prediction
                    final_letter = dynamic_letter
                    final_conf = dynamic_conf
                    source = "DYNAMIC"
                    color = (255, 0, 255)
                else:
                    # Use static prediction
                    final_letter = static_letter
                    final_conf = static_conf
                    source = "STATIC"
                    color = (255, 255, 255)

                # Apply the configured display threshold: a prediction below
                # MIN_CONFIDENCE_THRESHOLD is replaced by a placeholder. The
                # raw per-model predictions remain visible in the debug panel.
                if final_conf >= MIN_CONFIDENCE_THRESHOLD:
                    display_letter = final_letter
                else:
                    display_letter = "?"

                # Display final prediction (large)
                cv2.putText(frame, display_letter,
                            (50, 120), cv2.FONT_HERSHEY_SIMPLEX, 4, color, 8)

                cv2.putText(frame, f"{final_conf:.0f}% - {source}",
                            (50, 170), cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)

                _draw_debug_panel(frame, wrist_motion, WRIST_MOTION_THRESHOLD,
                                  static_letter, static_conf,
                                  dynamic_letter, dynamic_conf)
                _draw_motion_bar(frame, wrist_motion, WRIST_MOTION_THRESHOLD,
                                 is_dynamic_motion)

            else:
                # No hand detected - clear buffers so stale frames from before
                # the gap never contribute to deltas or motion.
                landmark_buffer.clear()
                wrist_buffer.clear()
                cv2.putText(frame, "Show hand",
                            (50, 120), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 4)

            cv2.imshow('NGT Recognizer', frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break

            # Vim-style threshold adjustment; the threshold never goes below 0.
            step = _THRESHOLD_STEPS.get(key)
            if step is not None:
                WRIST_MOTION_THRESHOLD = max(0.0, WRIST_MOTION_THRESHOLD + step)
                direction = "increased" if step > 0 else "decreased"
                print(f"Threshold {direction} to {WRIST_MOTION_THRESHOLD:.3f}")
finally:
    cap.release()
    cv2.destroyAllWindows()


NGT Fingerspelling Recognizer
Rolling window size: 5 frames
Press 'q' to quit

