# Realtime Hand Sign translation with CNN

## Initial setup - imports, loading data, utility functions

In [None]:
%pip install mediapipe opencv-python matplotlib tensorflow

In [7]:
# imports for data processing
import os
import mediapipe as mp
import numpy as np
import cv2

In [22]:
# data dir paths
unprocessed_data_dir = 'data/unprocessed'
processed_data_dir = 'data/processed'

# media pipe hands object
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    static_image_mode=False, min_detection_confidence=0.9, min_tracking_confidence=0.9
)

In [12]:
# function to create image with landmarks
def get_hand_landmarks_image(frame):
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False

    # draw landmarks on blank 224x224 white image
    output_img = np.ones((224, 224, 3), np.uint8) * 255

    try:
        results = hands.process(image)
    except ValueError:
        return False, output_img

    mp_drawing = mp.solutions.drawing_utils

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                output_img,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing.DrawingSpec(color=(20,20,20), thickness=3, circle_radius=1),
                connection_drawing_spec=mp_drawing.DrawingSpec(color=(20,20,20), thickness=3, circle_radius=1),
            )
    return not results.multi_hand_landmarks == None, output_img

### Create data with landmarked hand images from data/unprocessed (Optional)

In [24]:
# loop through all directores in unprocessed_data_dir
for dir in os.listdir(unprocessed_data_dir):
    print(f'Processing {dir}...', end='')

    # create output dir
    output_dir = os.path.join(processed_data_dir, dir)
    os.makedirs(output_dir, exist_ok=True)
    
    # loop through all files in dir
    for file in os.listdir(os.path.join(unprocessed_data_dir, dir)):
        # load image
        image = cv2.imread(os.path.join(unprocessed_data_dir, dir, file))
        
        # process image
        ret, processed_image = get_hand_landmarks_image(image)

        # save image
        if ret:
            cv2.imwrite(os.path.join(output_dir, file), processed_image)
        else:
            print(f'  No hands detected in {dir}/{file}')
        
    print(f'Done!')
        
# release hands object
if hands: hands.close()

Processing 8...Done!
Processing 9...Done!


In [13]:
# create blank image when no hands are detected
output_img = np.ones((224, 224, 3), np.uint8) * 255
for i in range(1000):
    cv2.imwrite(os.path.join('data', 'processed', '26', f'{i}.jpg'), output_img)

## ASL CNN model

### Define and train model

In [25]:
# imports for model training/testing
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [26]:
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = train_datagen.flow_from_directory(
    'data/processed',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='training')

validation_generator = train_datagen.flow_from_directory(
    'data/processed',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation')


Found 21400 images belonging to 27 classes.
Found 5348 images belonging to 27 classes.


In [27]:
# define model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(27, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [28]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
model.fit(train_generator, validation_data=validation_generator, epochs=10)

Epoch 1/10


  self._warn_if_super_not_called()


[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m469s[0m 696ms/step - accuracy: 0.7347 - loss: 0.9589 - val_accuracy: 0.9576 - val_loss: 0.2043
Epoch 2/10
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 586ms/step - accuracy: 0.9903 - loss: 0.0281 - val_accuracy: 0.9669 - val_loss: 0.1708
Epoch 3/10
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 580ms/step - accuracy: 0.9933 - loss: 0.0274 - val_accuracy: 0.9721 - val_loss: 0.1787
Epoch 4/10
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 591ms/step - accuracy: 0.9964 - loss: 0.0130 - val_accuracy: 0.9708 - val_loss: 0.1713
Epoch 5/10
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 576ms/step - accuracy: 0.9952 - loss: 0.0156 - val_accuracy: 0.9742 - val_loss: 0.1853
Epoch 6/10
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 577ms/step - accuracy: 0.9960 - loss: 0.0164 - val_accuracy: 0.9669 - val_loss: 0.1938
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x1d2e1fec1d0>

In [30]:
model.save('model3.keras')

### Test model

In [31]:
# load model
model = load_model('model2.keras')

#### Test with static image

In [50]:
# test image
image = cv2.imread('data/processed/5/0.jpg')

added_dim_img = np.expand_dims(image, axis=0)

prediction = model.predict(added_dim_img)
predicted_class = np.argmax(prediction)

print(f'Predicted class: {predicted_class}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
Predicted class: 22


#### Test with realtime video from webcam

In [41]:
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    # Preprocess the frame to match the input size of the model and scale the pixel values
    frame = cv2.resize(frame, (224, 224))

    ret, hand_landmarks_img = get_hand_landmarks_image(frame)
    
    # Add an extra dimension because the model expects batches of images
    added_dim_img = np.expand_dims(hand_landmarks_img, axis=0)

    # Use the model to predict the class of the frame
    prediction = model.predict(added_dim_img)

    # Get the class with the highest probability
    predicted_class = np.argmax(prediction)

    # Display the predicted class on the frame and print it
    cv2.putText(hand_landmarks_img, chr(65 + predicted_class), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Display the resulting frame
    cv2.imshow('Hand Landmarks', hand_landmarks_img)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57

In [15]:
cap.release()
cv2.destroyAllWindows()