In [3]:
pip install opencv-python numpy mediapipe tensorflow pyttsx3




In [None]:
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
import pyttsx3

# Load the trained model
model = tf.keras.models.load_model("Deep learning Model For sign.h5")

# Define the label map (Ensure it matches the dataset used in training)
label_map = {i: chr(65 + i) for i in range(26)}  # A-Z

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5)
mp_draw = mp.solutions.drawing_utils

# Initialize text-to-speech engine with more realistic settings
engine = pyttsx3.init()

# Set speech rate (Lower value = slower speech)
engine.setProperty("rate", 120)  # Adjust for natural speech

# Set voice (Choose a more human-like voice)
voices = engine.getProperty("voices")
for voice in voices:
    if "female" in voice.name.lower():  # Prioritize a female voice (optional)
        engine.setProperty("voice", voice.id)
        break
else:
    engine.setProperty("voice", voices[0].id)  # Fallback to the first available voice

# Variables to store the current letter and final word
current_letter = ""  # Store the latest predicted letter
final_word = ""  # Store the manually added word
hand_present = False  # Track if the hand is visible

def extract_hand_landmarks(image):
    """
    Extract (x, y) landmarks from the detected hand.
    Also returns the bounding box coordinates of the hand.
    """
    img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            landmarks = []
            x_min, y_min = float('inf'), float('inf')
            x_max, y_max = float('-inf'), float('-inf')

            for lm in hand_landmarks.landmark:
                x, y = int(lm.x * image.shape[1]), int(lm.y * image.shape[0])
                landmarks.append(lm.x)  # X-coordinate
                landmarks.append(lm.y)  # Y-coordinate

                # Update bounding box
                x_min, y_min = min(x_min, x), min(y_min, y)
                x_max, y_max = max(x_max, x), max(y_max, y)

            return np.array(landmarks), (x_min, y_min, x_max, y_max)  # Return landmarks and bounding box

    return None, None  # No hand detected

# Start video capture
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame horizontally
    frame = cv2.flip(frame, 1)

    # Extract hand landmarks and bounding box
    landmarks, bbox = extract_hand_landmarks(frame)

    if landmarks is not None:
        hand_present = True  # Hand is visible

        # Reshape for model input
        input_data = np.array(landmarks).reshape(1, -1)  # Shape should match training data

        # Predict using model
        prediction = model.predict(input_data)
        predicted_index = np.argmax(prediction)

        # Check if confidence is too low (handle "nothing" case)
        if np.max(prediction) < 0.5:  # Adjust confidence threshold as needed
            predicted_letter = " "
        else:
            predicted_letter = label_map.get(predicted_index, " ")

        # Store the current predicted letter
        current_letter = predicted_letter

        # Display prediction
        cv2.putText(frame, f'Prediction: {predicted_letter}', (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Draw landmarks on hand
        for i in range(0, len(landmarks), 2):
            x, y = int(landmarks[i] * frame.shape[1]), int(landmarks[i + 1] * frame.shape[0])
            cv2.circle(frame, (x, y), 5, (0, 0, 255), -1)

        # Draw bounding box around the hand
        if bbox:
            x_min, y_min, x_max, y_max = bbox
            cv2.rectangle(frame, (x_min - 10, y_min - 10), (x_max + 10, y_max + 10), (255, 0, 0), 2)

    else:
        # If hand disappears, speak the final word and reset
        if hand_present and final_word.strip():  # Avoid speaking if empty
            print("Speaking:", final_word)
            engine.say(final_word)
            engine.runAndWait()
            final_word = ""  # Reset the stored word

        hand_present = False  # Hand is not visible

    # Store the letter when "S" is pressed
    key = cv2.waitKey(1) & 0xFF
    if key == ord('s'):
        final_word += current_letter  # Add the letter (or space) to the word
        print("Stored Word:", final_word)

    # Show the final word on screen
    cv2.putText(frame, f'Word: {final_word}', (50, 100),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2, cv2.LINE_AA)

    # Show the output frame
    cv2.imshow("Sign Language Detection", frame)

    # Press 'q' to quit
    if key == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Stored Word: H
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m