In [None]:
# Importing OpenCV for image and video processing
import cv2

# Importing the os module to interact with the operating system
# This is useful for file and directory management, such as checking for existing files or creating directories.
import os

# Importing NumPy for numerical operations
# NumPy is primarily used for handling arrays and performing mathematical operations on them.
import numpy as np

# Importing TensorFlow, a deep learning framework
# TensorFlow is used for building and training machine learning models, especially neural networks.
import tensorflow as tf

# Importing img_to_array from Keras for image preprocessing
# This function converts PIL images to NumPy arrays, which can be fed into neural networks.
from tensorflow.keras.preprocessing.image import img_to_array

# Importing MediaPipe for building machine learning pipelines
# MediaPipe is often used for real-time applications like hand tracking, face detection, and pose estimation.
import mediapipe as mp


In [8]:
# Load the trained model
model = tf.keras.models.load_model('sign_language_detection_model.keras')

In [9]:
# Define labels corresponding to your classes
labels = [
    {'name': 'hello', 'id': 1},
    {'name': 'thanks', 'id': 2},
    {'name': 'yes', 'id': 3},
    {'name': 'no', 'id': 4},
    {'name': 'iloveyou', 'id': 5},
]

In [None]:
# Setting up MediaPipe for hand detection
mp_hands = mp.solutions.hands  # Importing the hands module from MediaPipe for hand tracking
hands = mp_hands.Hands(
    min_detection_confidence=0.5,  # Minimum confidence value ([0, 1]) for detecting hands
    min_tracking_confidence=0.5     # Minimum confidence value ([0, 1]) for tracking the hands
)

mp_drawing = mp.solutions.drawing_utils  # Importing drawing utilities from MediaPipe for rendering hand landmarks and connections


In [None]:
# Function to get the label name corresponding to a given label ID
def get_label_name(label_id):
    # Iterate through the list of labels
    for label in labels:
        # Check if the current label's ID matches the provided label ID
        if label['id'] == label_id:
            return label['name']  # Return the name of the matching label
    return "Unknown"  # Return "Unknown" if no matching label ID is found


In [None]:
# Open the webcam
cap = cv2.VideoCapture(0)  # Initialize webcam capture (0 for the default camera)

while True:
    # Read a frame from the webcam
    ret, frame = cap.read()  # Capture a single frame
    if not ret:
        print("Failed to grab frame.")  # Print error if frame capture fails
        break  # Exit loop if frame capture fails

    # Convert the frame to RGB for processing
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)  # Process the RGB frame with MediaPipe hands

    # Check if any hands are detected in the frame
    if results.multi_hand_landmarks:
        # Initialize bounding box coordinates to extremes
        x_min, y_min = frame.shape[1], frame.shape[0]  # Set to max possible values
        x_max, y_max = 0, 0  # Set to min possible values
        
        # Iterate through each detected hand's landmarks
        for hand_landmarks in results.multi_hand_landmarks:
            # Loop through each landmark to find bounding box coordinates
            for landmark in hand_landmarks.landmark:
                x, y = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])  # Convert normalized coordinates
                x_min = min(x_min, x)  # Update minimum x-coordinate
                y_min = min(y_min, y)  # Update minimum y-coordinate
                x_max = max(x_max, x)  # Update maximum x-coordinate
                y_max = max(y_max, y)  # Update maximum y-coordinate

            # Draw the hand landmarks on the frame
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Ensure bounding box coordinates are within the frame bounds
        x_min, y_min = max(0, x_min), max(0, y_min)
        x_max, y_max = min(frame.shape[1], x_max), min(frame.shape[0], y_max)

        # Draw a bounding box around the detected hand
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        # Extract the region of interest (ROI) for the hand image
        hand_image = frame[y_min:y_max, x_min:x_max]
        if hand_image.size > 0:  # Check if the hand image is valid
            hand_image = cv2.resize(hand_image, (150, 150))  # Resize to match model input size
            hand_image = img_to_array(hand_image) / 255.0  # Normalize pixel values to [0, 1]
            hand_image = np.expand_dims(hand_image, axis=0)  # Expand dimensions for model prediction

            # Debugging: Print the shape and min/max pixel values of the hand image
            print(f"Hand image shape: {hand_image.shape}, min: {hand_image.min()}, max: {hand_image.max()}")

            # Make a prediction using the trained model
            predictions = model.predict(hand_image)
            predicted_class = np.argmax(predictions[0]) + 1  # Get the index of the class with the highest probability
            confidence = predictions[0][predicted_class - 1]  # Get the confidence score for the predicted class
            label_name = get_label_name(predicted_class)  # Map the predicted class ID to a label name

            # Print the prediction and confidence for debugging
            print(f"Prediction: {label_name} with confidence {confidence:.2f}")

            # Display the predicted label and confidence on the frame
            display_text = f"{label_name} ({confidence:.2f})"
            cv2.putText(frame, display_text, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # Show the frame with the bounding box and prediction text
    cv2.imshow("Real-Time Sign Language Detection", frame)

    # Exit the loop if the 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam and close all OpenCV windows
cap.release()  # Release the webcam resource
cv2.destroyAllWindows()  # Close all OpenCV windows


Hand image shape: (1, 150, 150, 3), min: 0.0, max: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
Prediction: hello with confidence 1.00
Hand image shape: (1, 150, 150, 3), min: 0.0, max: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Prediction: hello with confidence 1.00
Hand image shape: (1, 150, 150, 3), min: 0.0, max: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Prediction: hello with confidence 0.99
Hand image shape: (1, 150, 150, 3), min: 0.0, max: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Prediction: hello with confidence 1.00
Hand image shape: (1, 150, 150, 3), min: 0.0, max: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Prediction: hello with confidence 0.95
Hand image shape: (1, 150, 150, 3), min: 0.0, max: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Prediction: hello with confidence 1.00
Han