# Realtime Hand Sign translation with CNN

## Initial setup - imports, loading data, utility functions

In [None]:
%pip install mediapipe opencv-python matplotlib tensorflow

In [29]:
# imports for data processing
import os
import mediapipe as mp
import numpy as np
import cv2

In [33]:
# data dir paths
unprocessed_data_dir = 'data/unprocessed'
processed_data_dir = 'data/processed'

# media pipe hands object
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    static_image_mode=False, min_detection_confidence=0.9, min_tracking_confidence=0.9
)

In [30]:
# function to create image with landmarks
def get_hand_landmarks_image(frame):
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # draw landmarks on blank 224x224 white image
    output_img = np.ones((224, 224, 3), np.uint8) * 255

    results = hands.process(image_rgb)

    mp_drawing = mp.solutions.drawing_utils

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                output_img,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing.DrawingSpec(color=(20,20,20), thickness=2, circle_radius=1),
                connection_drawing_spec=mp_drawing.DrawingSpec(color=(20,20,20), thickness=2, circle_radius=1),
            )
    return not results.multi_hand_landmarks == None, output_img

### Create data with landmarked hand images from data/unprocessed (Optional)

In [91]:
# loop through all directores in unprocessed_data_dir
for dir in os.listdir(unprocessed_data_dir):
    print(f'Processing {dir}...', end='')

    # create output dir
    output_dir = os.path.join(processed_data_dir, dir)
    os.makedirs(output_dir, exist_ok=True)
    
    # loop through all files in dir
    for file in os.listdir(os.path.join(unprocessed_data_dir, dir)):
        # load image
        image_rgb = cv2.imread(os.path.join(unprocessed_data_dir, dir, file))
        
        # process image
        ret, processed_image = get_hand_landmarks_image(image_rgb)

        # save image
        if ret:
            cv2.imwrite(os.path.join(output_dir, file), processed_image)
        else:
            print(f'  No hands detected in {dir}/{file}')
        
    print(f'Done!')

Processing 14...Done!
Processing 21...Done!


In [13]:
# create blank image when no hands are detected
output_img = np.ones((224, 224, 3), np.uint8) * 255
for i in range(1000):
    cv2.imwrite(os.path.join('data', 'processed', '26', f'{i}.jpg'), output_img)

## ASL CNN model

### Define and train model

In [31]:
# imports for model training/testing
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [34]:
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = train_datagen.flow_from_directory(
    processed_data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='training')

validation_generator = train_datagen.flow_from_directory(
    processed_data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation')

# class labels
class_labels = train_generator.class_indices
class_labels = {v: k for k, v in class_labels.items()}
print('Class labels:', class_labels)

Found 21400 images belonging to 27 classes.
Found 5348 images belonging to 27 classes.
Class labels: {0: '0', 1: '1', 2: '10', 3: '11', 4: '12', 5: '13', 6: '14', 7: '15', 8: '16', 9: '17', 10: '18', 11: '19', 12: '2', 13: '20', 14: '21', 15: '22', 16: '23', 17: '24', 18: '25', 19: '26', 20: '3', 21: '4', 22: '5', 23: '6', 24: '7', 25: '8', 26: '9'}


In [35]:
# define model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(26, activation='softmax')
])

In [36]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [96]:
model.fit(train_generator, validation_data=validation_generator, epochs=10)

Epoch 1/10


  self._warn_if_super_not_called()


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 673ms/step - accuracy: 0.8079 - loss: 0.6264 - val_accuracy: 1.0000 - val_loss: 8.2426e-06
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 575ms/step - accuracy: 1.0000 - loss: 6.6047e-06 - val_accuracy: 1.0000 - val_loss: 3.9736e-09
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 579ms/step - accuracy: 1.0000 - loss: 1.3317e-07 - val_accuracy: 1.0000 - val_loss: 4.3710e-09
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 582ms/step - accuracy: 1.0000 - loss: 3.9022e-05 - val_accuracy: 1.0000 - val_loss: 4.8081e-08
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 595ms/step - accuracy: 0.9987 - loss: 0.0111 - val_accuracy: 1.0000 - val_loss: 2.5431e-06
Epoch 6/10
[1m12/75[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m34s[0m 543ms/step - accuracy: 1.0000 - loss: 8.2774e-06

KeyboardInterrupt: 

In [97]:
model.save('model3.keras')

### Test model

In [37]:
# load model
model = load_model('model2.keras')

#### Test with static image

In [38]:
# test image
image_rgb = cv2.imread(processed_data_dir + '/25/500.jpg')

added_dim_img = np.expand_dims(image_rgb, axis=0)

prediction = model.predict(added_dim_img)
predicted_class = np.argmax(prediction)

print(f'Predicted class: {class_labels[predicted_class]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Predicted class: 25


#### Test with realtime video from webcam

In [39]:
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret: continue

    # Preprocess the frame to match the input size of the model and scale the pixel values
    frame = frame[:, 80:560]
    frame = cv2.resize(frame, (224, 224))

    # get hand landmarks image with mediapipe
    is_hands_detected, hand_landmarks_img = get_hand_landmarks_image(frame)
    
    # Add an extra dimension because the model expects batches of images
    added_dim_img = np.expand_dims(hand_landmarks_img, axis=0)

    # Use the model to predict the class of the frame
    prediction = model.predict(added_dim_img)

    # Get the class with the highest probability
    predicted_class = np.argmax(prediction)
    predicted_class_label = class_labels[predicted_class]
    
    # Display the predicted class on the frame and print it
    sign = ' ' if predicted_class_label == '26' else chr(65 + int(predicted_class_label))
    cv2.putText(hand_landmarks_img, sign, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Display the resulting frame
    cv2.imshow('Hand Landmarks', hand_landmarks_img)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27

In [135]:
cap.release()
cv2.destroyAllWindows()