# Sign Langauge Prediction

In [1]:
import cv2
import numpy as np
import csv
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import pandas as pd
from random import sample
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import zipfile
import json
import joblib

In [2]:
word_classes = np.array([
    'a', 'about', 'aim', 'all', 'and', 'audio', 'b', 'barrier', 'break', 
    'c', 'can', 'communication', 'creative', 'd', 'detect', 'developed', 
    'e', 'f', 'g', 'h', 'have', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'our', 
    'p', 'project', 'q', 'r', 's', 'sign language', 'solution', 't', 'team', 
    'text', 'that', 'to', 'translate', 'u', 'v', 'w', 'what', 'x', 'y', 'you', 'z'
])


In [3]:
# Initialize MediaPipe Hands and Pose
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

In [4]:
# Initialize MediaPipe Hands and Pose
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=2,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5)

In [5]:
model_save_path = "models/my_vit_model.h5"

In [6]:
# Load the saved model
model = tf.keras.models.load_model(model_save_path)

# Verify the model structure
model.summary()




In [7]:
# Update constants
NUM_HANDS = 2
NUM_LANDMARKS = 21
FEATURES = 3  # x, y, z
TIMESTEPS = 50

In [8]:
# Zero placeholder
ZERO_LANDMARK = [[0, 0, 0]] * NUM_LANDMARKS

In [9]:
def get_landmarks(frame_rgb):
    """
    Process the frame to extract hand and pose landmarks.
    :param frame_rgb: RGB frame from video
    :return: Hand landmarks, Pose landmarks
    """
    hand_results = hands.process(frame_rgb)
    return hand_results

In [10]:
def draw_landmarks(frame, hand_results):
    """
    Draw hand and pose landmarks on the frame.
    :param frame: Original frame from video
    :param hand_results: Hand landmarks from MediaPipe
    :return: Frame with drawn landmarks
    """
    # Draw hand landmarks
    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2),
                mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2)
            )

    return frame

In [11]:
# Load the saved scaler
scaler = joblib.load(scaler_save_path)
print("Scaler loaded.")

In [12]:
def preprocess_landmarks(captured_landmarks):
    """
    Normalize landmarks and maintain 3D structure for model input.
    """
    # Convert to a numpy array
    captured_landmarks = np.array(captured_landmarks)  # Shape: (50, 2, 21, 3)
    
    # Reshape to (timesteps, features) where features = NUM_HANDS * NUM_LANDMARKS * 3
    reshaped_landmarks = captured_landmarks.reshape(TIMESTEPS, NUM_HANDS * NUM_LANDMARKS * 3)
    
    # Normalize between 0 and 1 (optional)
    normalized_landmarks = reshaped_landmarks / np.max(reshaped_landmarks)
    
    return normalized_landmarks


In [13]:

# Data capture variables
captured_landmarks = []  # List to store landmarks for all frames
frame_counter = 0  # Tracks the number of frames collected
recording = False  # Indicates when to start recording frames
hands_detected_once = False  # Flag to track if hands were detected at least once

In [14]:
# Video capture
cap = cv2.VideoCapture(0)


while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame for a mirrored view
    frame = cv2.flip(frame, 1)

    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Get landmarks
    hand_results = get_landmarks(frame_rgb)

    # Draw landmarks on the frame
    frame = draw_landmarks(frame, hand_results)

    # Check if hands are detected
    if hand_results.multi_hand_landmarks:
        if not recording and not hands_detected_once:
            # Start recording if hands are detected
            print("Hands detected. Starting to collect frames.")
            hands_detected_once = True
            recording = True

        # Extract landmarks for detected hands
        frame_landmarks = [ZERO_LANDMARK] * NUM_HANDS
        for idx, hand_landmark in enumerate(hand_results.multi_hand_landmarks):
            if idx < NUM_HANDS:
                frame_landmarks[idx] = [[lm.x, lm.y, lm.z] for lm in hand_landmark.landmark]

        # Append frame landmarks
        captured_landmarks.append(frame_landmarks)
        frame_counter += 1
    elif recording:
        # If no hands are detected, use zero matrix
        captured_landmarks.append([ZERO_LANDMARK] * NUM_HANDS)
        frame_counter += 1

    if recording and frame_counter >= TIMESTEPS:
        # Preprocess collected landmarks
        preprocessed_input = preprocess_landmarks(captured_landmarks)

        # Add batch dimension for prediction
        preprocessed_input = np.expand_dims(preprocessed_input, axis=0)

        # Predict
        prediction = model.predict(preprocessed_input)
        predicted_class = np.argmax(prediction)
        confidence = np.max(prediction)

        # Ensure the predicted_class is within the valid range
        print(f"Predicted class index: {predicted_class}, Number of classes: {len(word_classes)}")

        # Access the predicted word
        if predicted_class < len(word_classes):
            predicted_word = word_classes[predicted_class]
            print(predicted_word)
        else:
            print("Error: predicted_class out of bounds.")

        # Display the prediction
        cv2.putText(frame, f"Predicted: {predicted_class}, Confidence: {confidence:.2f}",
                    (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # Reset data for the next cycle
        captured_landmarks = []
        frame_counter = 0
        recording = False
        hands_detected_once = False
        time.sleep(3)  # Brief pause before next collection cycle

    # Show the frame
    cv2.imshow('Hand Gesture Prediction', frame)

    # Break loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()

Hands detected. Starting to collect frames.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
Predicted class index: 28, Number of classes: 50
our
Hands detected. Starting to collect frames.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Predicted class index: 23, Number of classes: 50
k
Hands detected. Starting to collect frames.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Predicted class index: 28, Number of classes: 50
our
