Importing Libraries

In [60]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import pandas as pd
import time
from collections import deque
from sklearn.preprocessing import LabelEncoder

Model loading

In [50]:
# Load the two pre-trained Keras classifiers from HDF5 files in the working directory.
# mobilenet_model is the image-based classifier: the webcam loop feeds it 128x128 hand crops.
mobilenet_model = tf.keras.models.load_model("sign_language_model_MobileNetV2.h5")
# mlp_model is the landmark-based classifier: the webcam loop feeds it flattened hand landmarks.
mlp_model = tf.keras.models.load_model("asl_mediapipe_mlp_model.h5")


Reading the mediapipe keypoints and defining the class labels

In [51]:
# Fit a LabelEncoder on the "label" column of the keypoints CSV so that MLP
# output indices can be turned back into label strings with inverse_transform.
# NOTE(review): presumably this reproduces the encoding used when the MLP was
# trained on the same CSV — confirm against the training notebook.
df = pd.read_csv("asl_mediapipe_keypoints_dataset.csv")
encoder = LabelEncoder()
encoder.fit(df["label"]) 

# Class names used to decode the MobileNetV2 output (index i -> class_labels[i]).
# NOTE(review): assumes this order matches the class-index order the image model
# was trained with — verify against the training pipeline.
class_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']

Mediapipe for hand keypoints

In [52]:
# Short aliases for the MediaPipe Hands solution and its drawing helpers.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# Hand tracker reused for every webcam frame; detection and tracking both
# require a confidence of at least 0.7.
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7)

I0000 00:00:1739436717.996646       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro


Define how far (in pixels) the hand bounding box is expanded vertically and horizontally before the crop is passed to MobileNetV2 for classification

In [53]:
# Padding, in pixels, added around the landmark bounding box before cropping:
# HEIGHT_EXPAND is applied above and below, WIDTH_EXPAND to the left and right.
HEIGHT_EXPAND, WIDTH_EXPAND = 220, 150

In [None]:
# Transcription state shared with the webcam loop: the text built so far and
# the most recent label that was committed to it.
predicted_sentence, last_predicted_label = "", None

# Repeat suppression: timestamp (seconds) of the last committed label, and the
# cooldown (5 seconds) during which the same label is ignored.
last_prediction_time, cooldown_time = 0, 5

In [None]:
def extract_landmark_features(hand_landmarks, handedness):
    """
    Turn one MediaPipe hand into a single feature row for the landmark MLP.

    Args:
        hand_landmarks: object whose ``.landmark`` sequence yields points
            exposing ``x``, ``y`` and ``z`` attributes.
        handedness: object whose ``.classification[0].label`` names the hand.

    Returns:
        Array of shape ``(1, 3 * number_of_landmarks)`` laid out as
        ``x0, y0, z0, x1, y1, z1, ...``. When the hand is labelled "Right",
        every x value is replaced by ``1 - x`` so it mirrors a left hand.
        No other scaling or normalization is applied.
    """
    coords = np.array([(point.x, point.y, point.z) for point in hand_landmarks.landmark])

    is_right_hand = handedness.classification[0].label == "Right"
    if is_right_hand:
        # Mirror horizontally so right hands look like the left-hand data
        coords[:, 0] = 1 - coords[:, 0]

    # coords is already contiguous, so a plain reshape gives the flattened row
    return coords.reshape(1, -1)

Main logic for comparing the confidence scores of both models - MobileNetV2 and the MediaPipe-based MLP

In [None]:
# Live demo: read webcam frames, classify each detected hand with both the
# landmark MLP and MobileNetV2, keep whichever prediction is more confident,
# and append accepted letters to `predicted_sentence`. Press 'q' to quit.
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and the preview window are released even
# if a frame raises or the kernel is interrupted mid-loop.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Flip frame for a mirrored effect
        frame = cv2.flip(frame, 1)
        frame_height, frame_width = frame.shape[:2]

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        if results.multi_hand_landmarks:
            for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                ### MLP Prediction (Landmark-based)
                landmark_features = extract_landmark_features(hand_landmarks, handedness)
                # verbose=0 stops Keras from printing a progress bar for every frame
                mlp_pred = mlp_model.predict(landmark_features, verbose=0)
                mlp_class_index = np.argmax(mlp_pred)
                mlp_confidence = mlp_pred[0][mlp_class_index]
                mlp_label = encoder.inverse_transform([mlp_class_index])[0]

                ### Bounding Box Extraction for MobileNetV2
                # Landmark coordinates are fractions of the frame size; scale
                # them to pixels, pad the box, and clamp it to the frame.
                xs = [lm.x for lm in hand_landmarks.landmark]
                ys = [lm.y for lm in hand_landmarks.landmark]
                x_min = max(0, int(min(xs) * frame_width - WIDTH_EXPAND))
                y_min = max(0, int(min(ys) * frame_height - HEIGHT_EXPAND))
                x_max = min(frame_width, int(max(xs) * frame_width + WIDTH_EXPAND))
                y_max = min(frame_height, int(max(ys) * frame_height + HEIGHT_EXPAND))

                # NOTE(review): the crop comes from the BGR frame after the
                # landmarks were drawn on it — confirm this matches how the
                # MobileNetV2 training images were prepared.
                hand_crop = frame[y_min:y_max, x_min:x_max]

                ### MobileNetV2 Prediction (Image-based)
                # Start from "no image prediction" so that an empty crop falls
                # back to the MLP instead of raising NameError or silently
                # reusing the values left over from a previous hand/frame.
                mobilenet_label, mobilenet_confidence = None, -1.0
                if hand_crop.size > 0:
                    hand_resized = cv2.resize(hand_crop, (128, 128))
                    hand_resized = np.expand_dims(hand_resized, axis=0) / 255.0

                    mobilenet_pred = mobilenet_model.predict(hand_resized, verbose=0)
                    mobilenet_class_index = np.argmax(mobilenet_pred)
                    mobilenet_confidence = mobilenet_pred[0][mobilenet_class_index]
                    mobilenet_label = class_labels[mobilenet_class_index]

                ### Decision Fusion: Pick Most Confident Prediction
                if mobilenet_confidence > mlp_confidence:
                    final_label = mobilenet_label
                    final_confidence = mobilenet_confidence
                else:
                    final_label = mlp_label
                    final_confidence = mlp_confidence

                ### Logic to Prevent Repeated Predictions
                current_time = time.time()
                is_repeat = final_label == last_predicted_label
                if is_repeat and current_time - last_prediction_time < cooldown_time:
                    final_label = None  # Ignore repeated prediction
                else:
                    # Refresh the timestamp for every accepted label, including
                    # a repeat accepted after the cooldown. Otherwise a held
                    # sign would be appended on every frame once the cooldown
                    # has elapsed.
                    last_predicted_label = final_label
                    last_prediction_time = current_time

                ### Sentence Formation Logic
                if final_label and final_label not in ["nothing", "del", "space"]:
                    predicted_sentence += final_label
                elif final_label == "space":
                    predicted_sentence += " "
                elif final_label == "del":
                    predicted_sentence = predicted_sentence[:-1]  # Remove last character

                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                if final_label:
                    # Keep the caption inside the frame when the box touches the top edge
                    label_y = max(y_min - 10, 30)
                    cv2.putText(frame, f"{final_label} ({final_confidence:.2f})", (x_min, label_y),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Create a black bar for displaying sentence
        bar_height = 60
        cv2.rectangle(frame, (0, frame_height - bar_height), (frame_width, frame_height), (0, 0, 0), -1)
        cv2.putText(frame, predicted_sentence, (50, frame_height - 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        cv2.imshow("Sign Language Recognition (MediaPipe + MobileNetV2)", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()



2025-02-13 14:22:08.566516: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-13 14:22:08.840085: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




From the experiment above, it can be concluded that the model based on a convolutional neural network is not able to predict the signs correctly, and even when its predictions are combined with those of the MediaPipe-based multilayer perceptron, the output is still wrong.

Therefore, fine-tuning the MediaPipe-based approach is the best option available.

Fine tuning the Mediapipe Model

In [64]:
# Load the landmark-based Keras classifier again from its HDF5 file; this
# section of the notebook uses only this model.
mlp_model = tf.keras.models.load_model("asl_mediapipe_mlp_model.h5")

In [65]:
# Refit the LabelEncoder on the "label" column of the keypoints CSV so that MLP
# output indices can be mapped back to label strings.
# NOTE(review): presumably this reproduces the encoding used when the MLP was
# trained — confirm against the training notebook.
df = pd.read_csv("asl_mediapipe_keypoints_dataset.csv")
encoder = LabelEncoder()
encoder.fit(df["label"]) 

In [66]:
# Short aliases for the MediaPipe Hands solution and its drawing helpers.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# Track up to two hands (0.7 detection/tracking confidence) so the loop below
# can notice a second hand and show a warning instead of predicting.
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7, max_num_hands=2)  # Allow max 2 hands

I0000 00:00:1739437709.666773       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro


In [None]:
# Repeat suppression: an identical label is ignored for this many seconds
# after the moment recorded in last_prediction_time.
cooldown_time = 5
last_prediction_time = 0

# Transcription state: the label most recently committed and the text so far.
last_predicted_label = None
predicted_sentence = ""


In [None]:
# Warning flag, recomputed on every frame by the webcam loop.
two_hands_detected = False

# A label is only considered once it fills at least 4 slots of the buffer...
stabilization_threshold = 4

# ...which keeps the 5 most recent predictions (older ones are dropped).
stabilization_window = deque([], 5)

In [69]:
def extract_landmark_features(hand_landmarks, handedness):
    """Flatten MediaPipe hand landmarks into one (1, 3 * n_landmarks) row.

    The row is ordered x0, y0, z0, x1, y1, z1, ... For a hand labelled
    "Right" each x is replaced by 1 - x, mirroring it to look like a left
    hand. No further scaling or normalization is performed.
    """
    hand_label = handedness.classification[0].label
    xyz = np.array([[p.x, p.y, p.z] for p in hand_landmarks.landmark])

    # Mirror right hands horizontally to match left-hand training data
    if hand_label == "Right":
        xyz[:, 0] = 1 - xyz[:, 0]

    return xyz.ravel()[np.newaxis, :]

What is fine tuned

- Adding a stabilization buffer so that the model does not commit predictions made while the hand is changing from one sign to the next
- Only predicting when there is exactly one hand in the frame; when two hands appear, a warning is shown instead

In [None]:
# Live demo using only the landmark MLP: a label is committed to
# `predicted_sentence` once it dominates the stabilization buffer, and no
# prediction is made while two hands are visible. Press 'q' to quit.
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and the preview window are released even
# if a frame raises or the kernel is interrupted mid-loop.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Flip frame for mirrored effect
        frame = cv2.flip(frame, 1)

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        # Warning flag: set while more than one hand is detected, reset otherwise
        two_hands_detected = bool(results.multi_hand_landmarks) and len(results.multi_hand_landmarks) > 1

        if results.multi_hand_landmarks and not two_hands_detected:
            for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                ### MLP Prediction (Landmark-based)
                landmark_features = extract_landmark_features(hand_landmarks, handedness)
                # verbose=0 stops Keras from printing a progress bar for every frame
                mlp_pred = mlp_model.predict(landmark_features, verbose=0)
                mlp_class_index = np.argmax(mlp_pred)
                mlp_label = encoder.inverse_transform([mlp_class_index])[0]

                # Add prediction to stabilization buffer
                stabilization_window.append(mlp_label)

                # Check if stabilization condition is met (consistent prediction)
                if stabilization_window.count(mlp_label) >= stabilization_threshold:

                    ### Logic to Prevent Repeated Predictions
                    current_time = time.time()
                    is_repeat = mlp_label == last_predicted_label
                    if is_repeat and current_time - last_prediction_time < cooldown_time:
                        mlp_label = None  # Ignore repeated prediction
                    else:
                        # Refresh the timestamp for every accepted label,
                        # including a repeat accepted after the cooldown.
                        # Otherwise a held sign would be appended on every
                        # frame once the cooldown has elapsed.
                        last_predicted_label = mlp_label
                        last_prediction_time = current_time

                    ### Sentence Formation Logic
                    if mlp_label and mlp_label not in ["nothing", "del", "space"]:
                        predicted_sentence += mlp_label
                    elif mlp_label == "space":
                        predicted_sentence += " "
                    elif mlp_label == "del":
                        predicted_sentence = predicted_sentence[:-1]  # Remove last character

        # Black bar along the bottom edge for the sentence / warning text
        bar_height = 60
        frame_height, frame_width = frame.shape[:2]
        cv2.rectangle(frame, (0, frame_height - bar_height), (frame_width, frame_height), (0, 0, 0), -1)

        # Display Warning if Two Hands Detected
        if two_hands_detected:
            cv2.putText(frame, "Only One Hand Allowed!", (50, frame_height - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        else:
            cv2.putText(frame, predicted_sentence, (50, frame_height - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        cv2.imshow("Sign Language Recognition (Fine Tuned MediaPipe)", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()



2025-02-13 14:38:50.632853: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




: 