In [1]:
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import load_model
import time
from collections import Counter
from autocorrect import Speller  # For language correction

In [2]:
# Feature-vector layout; these sizes must match the training pipeline.
COORDS_PER_KEYPOINT = 3  # each landmark contributes (x, y, z)
BODY_KEYPOINTS = 33 * COORDS_PER_KEYPOINT  # 33 pose landmarks -> 99 values
HAND_KEYPOINTS = 2 * 21 * COORDS_PER_KEYPOINT  # 2 hands x 21 landmarks -> 126 values
# Total values per frame: 99 + 126 = 225.
# NOTE(review): confirm 225 is the input length the saved model was trained with.
MAX_LEN = BODY_KEYPOINTS + HAND_KEYPOINTS

In [3]:
@tf.keras.utils.register_keras_serializable()
def sum_over_time(x):
    """Sum ``x`` along axis 1 and return the reduced tensor.

    The function is registered with Keras so that a saved model which
    references it by name can be deserialized.

    Args:
        x: Tensor to reduce. Presumably shaped (batch, time, features), so
            that axis 1 is the time axis -- TODO confirm against the model.

    Returns:
        The tensor produced by summing ``x`` over axis 1.
    """
    reduced = tf.keras.backend.sum(x, axis=1)
    return reduced

In [None]:
# Restore the trained network from disk. The saved model refers to
# `sum_over_time` by name, so the function is supplied for deserialization.
model = load_model(
    "../model/lstm_cnn_model.keras",
    custom_objects={"sum_over_time": sum_over_time},
)

In [5]:
# Read the class labels (assumed to have been saved as a numpy array).
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which
# can run arbitrary code -- only load a label file you trust.
label_classes = np.load("../label_encoder.npy", allow_pickle=True)
# Position in the array -> label, used to decode the model's argmax output.
idx2label = dict(enumerate(label_classes))

In [6]:
# MediaPipe solution modules used for landmark detection.
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands

# Pose detector in video (non-static) mode.
pose = mp_pose.Pose(
    static_image_mode=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# Hand detector in video (non-static) mode, reporting at most two hands.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
)

In [7]:
# Helper function: flatten body and hand keypoint dictionaries into one vector
def normalize_keypoints(body, hands_kps):
    """
    Convert the detected keypoints into a fixed-length array.

    The body and hand segments are each padded with zeros (or truncated) to
    their own fixed size *before* being concatenated. Hand values therefore
    always start at offset BODY_KEYPOINTS, even when the body list is empty,
    shorter, or longer than expected.

    Args:
        body (list): List of body keypoints dictionaries, each with keys "x", "y", "z".
            May be empty or None, in which case the body segment is all zeros.
        hands_kps (list): List of hand keypoints dictionaries, each with keys "x", "y", "z".
            May be empty or None, in which case the hand segment is all zeros.

    Returns:
        np.array: 1D float32 numpy array of length MAX_LEN.
    """

    def _flatten_segment(points, size):
        # Flatten dicts to [x, y, z, ...]; a missing coordinate defaults to 0.0.
        flat = []
        for point in points or []:
            flat.extend((point.get("x", 0.0), point.get("y", 0.0), point.get("z", 0.0)))
        # Force the segment to exactly `size` values so later segments keep
        # their expected offsets.
        flat = flat[:size]
        flat.extend([0.0] * (size - len(flat)))
        return flat

    keypoints = _flatten_segment(body, BODY_KEYPOINTS) + _flatten_segment(hands_kps, HAND_KEYPOINTS)

    # Final guard: keep the documented length even if MAX_LEN is configured
    # differently from BODY_KEYPOINTS + HAND_KEYPOINTS.
    if len(keypoints) < MAX_LEN:
        keypoints.extend([0.0] * (MAX_LEN - len(keypoints)))
    else:
        keypoints = keypoints[:MAX_LEN]

    # No value scaling is applied here; the raw coordinates are passed through.
    return np.array(keypoints, dtype=np.float32)

In [8]:
# Run the MediaPipe detectors on one frame.
# Returns the tuple (normalized_keypoints, hand_present_flag).
def extract_keypoints_from_frame(frame):
    """Detect pose and hand landmarks in a BGR frame.

    Args:
        frame: Image in BGR channel order (it is converted to RGB before
            being handed to the detectors).

    Returns:
        tuple: ``(vector, hand_present)`` where ``vector`` is the result of
        ``normalize_keypoints`` for the detected landmarks and
        ``hand_present`` is True when at least one hand landmark was found.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Body landmarks: stays an empty list when no pose is detected.
    pose_output = pose.process(rgb_frame)
    if pose_output.pose_landmarks:
        body_points = [
            {"x": landmark.x, "y": landmark.y, "z": landmark.z}
            for landmark in pose_output.pose_landmarks.landmark
        ]
    else:
        body_points = []

    # Hand landmarks: every detected hand is appended in the order reported.
    hands_output = hands.process(rgb_frame)
    hand_points = [
        {"x": landmark.x, "y": landmark.y, "z": landmark.z}
        for detected_hand in (hands_output.multi_hand_landmarks or [])
        for landmark in detected_hand.landmark
    ]

    # The flag is False exactly when no hand landmark was collected.
    hand_present = bool(hand_points)

    return normalize_keypoints(body_points, hand_points), hand_present

In [9]:
# --- State for prediction smoothing and word/sentence formation (if needed) ---

# Number of frames used for smoothing.
window_size = 10
# Most recent predictions, used for the majority vote.
prediction_buffer = list()

# Word currently being formed, and the full sentence so far.
current_word = str()
sentence = str()

# Consecutive frames with "No Keypoints"; once the threshold is exceeded,
# the gap is treated as a break between words.
no_keypoint_count = 0
no_keypoint_threshold = 15

In [None]:
# Start webcam capture
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    # Raise instead of calling exit(): exit() would end the whole session
    # rather than just failing this cell.
    raise RuntimeError("Error: Could not open webcam.")

print("Starting real-time prediction. Press 'q' to exit.")
# perf_counter is monotonic and high-resolution, which suits frame timing.
prev_time = time.perf_counter()

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Flip horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # Extract keypoints and hand flag from the frame
        keypoints_vector, hand_present = extract_keypoints_from_frame(frame)

        if not hand_present:
            # Without a hand there is nothing to classify; also drop the vote
            # history so stale predictions do not leak into the next gesture.
            predicted_label = "No Hand Detected"
            prediction_buffer = []
        else:
            # Reshape to match the model's expected input shape: (1, MAX_LEN, 1)
            input_data = keypoints_vector.reshape(1, MAX_LEN, 1)
            # verbose=0 stops Keras from printing a progress bar on every frame.
            pred_prob = model.predict(input_data, verbose=0)
            pred_class = np.argmax(pred_prob, axis=1)[0]
            current_prediction = idx2label.get(pred_class, "Unknown")

            # Keep only the latest `window_size` predictions
            prediction_buffer.append(current_prediction)
            if len(prediction_buffer) > window_size:
                prediction_buffer.pop(0)

            # Majority vote over the buffer smooths frame-to-frame jitter
            predicted_label = Counter(prediction_buffer).most_common(1)[0][0]

        # Calculate FPS; guard against a zero interval between two readings
        current_time = time.perf_counter()
        elapsed = current_time - prev_time
        fps = 1 / elapsed if elapsed > 0 else 0.0
        prev_time = current_time

        # Display prediction and FPS on the frame
        cv2.putText(frame, f"Prediction: {predicted_label}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"FPS: {int(fps)}", (10, 70),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

        # Show the frame
        cv2.imshow("Real-Time Prediction", frame)

        # Press 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()