# Realtime Hand Sign translation with CNN

## Initial setup - imports, loading data, utility functions

In [1]:
# %pip install mediapipe opencv-python matplotlib tensorflow

In [2]:
# imports for data processing
import os
import mediapipe as mp
import numpy as np
import cv2

In [3]:
# data dir paths
unprocessed_data_dir = 'datasets/data/unprocessed'
processed_data_dir = 'datasets/data/processed_landmarked_new'
test_data_dir = 'datasets/data/test_new'

### Functions for preprocessing images

In [4]:
# media pipe hands object
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False, min_detection_confidence=0.9, min_tracking_confidence=0.9
)

# function to create image with landmarks
def get_hand_landmarks_image(frame):
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # draw landmarks on blank 224x224 white image
    output_img = np.ones((224, 224, 3), np.uint8) * 255

    results = hands.process(image_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                output_img,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing.DrawingSpec(color=(20,20,20), thickness=2, circle_radius=2),
                connection_drawing_spec=mp_drawing.DrawingSpec(color=(20,20,20), thickness=2, circle_radius=2),
            )
    return bool(results.multi_hand_landmarks), output_img

# function to get processed image with edge detection of hand by applying grayscale, blur and edge detection
def get_processed_image(frame):
    minValue = 70
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray,(5,5),2)

    th3 = cv2.adaptiveThreshold(blur,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY_INV,11,2)
    ret, res = cv2.threshold(th3, minValue, 255, cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
    res = cv2.cvtColor(res, cv2.COLOR_GRAY2BGR)
    return True, res

### Create dataset from webcam

In [None]:
cap = cv2.VideoCapture(0)

# function for preprocessing images
img_processing_func = [get_hand_landmarks_image, get_processed_image][0]

# create dataset for ASL characters A-Z and blank

class_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'blank']
n_images_per_class = 10

for index, label in enumerate(class_labels):
    os.makedirs(os.path.join(processed_data_dir, str(index)), exist_ok=True)
    curr_img = 0
    print(f"Capturing images for {label}...", end='')
    while curr_img < n_images_per_class:
        ret, frame = cap.read()
        if not ret: continue

        # Preprocess the frame to match the input size of the model and scale the pixel values
        frame = frame[:, 80:560]
        frame = cv2.resize(frame, (224, 224))

        # get hand landmarks image with mediapipe
        success, hand_landmarks_img = img_processing_func(frame)

        # save image
        if success:
            cv2.imwrite(os.path.join(processed_data_dir, str(index), f'{curr_img}.jpg'), hand_landmarks_img)
            curr_img += 1

        preview_img = hand_landmarks_img.copy()
        frame_label = f'Finished {label}. press space' if curr_img == n_images_per_class else f'{label} {curr_img}/{n_images_per_class-1}' 
        cv2.putText(preview_img, frame_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 200), 2, cv2.LINE_AA)
        cv2.imshow('Preview', preview_img)

        if curr_img == n_images_per_class:
            print("Done!")
            while True:
                if cv2.waitKey(1) & 0xFF == ord(' '):
                    break
        
        cv2.waitKey(200)

cap.release()
cv2.destroyAllWindows()

In [20]:
cap.release()
cv2.destroyAllWindows()

### Create data after image processing from unprocessed data (Optional)

In [28]:
# function for preprocessing images
img_processing_func = get_processed_image

# loop through all directores in unprocessed_data_dir
for dir in os.listdir(unprocessed_data_dir):
    print(f'Processing {dir}...', end='')

    # create output dir and test dir
    output_dir = os.path.join(processed_data_dir, dir)
    os.makedirs(output_dir, exist_ok=True)

    # loop through all files in dir
    for file in os.listdir(os.path.join(unprocessed_data_dir, dir)):
        # load image
        image_rgb = cv2.imread(os.path.join(unprocessed_data_dir, dir, file))

        # process image
        ret, processed_image = img_processing_func(image_rgb)

        # save image
        if ret:
            cv2.imwrite(os.path.join(output_dir, file), processed_image)
        else:
            print(f'  Error in pre-processing {dir}/{file}')
    
    print(f'Done!')

Processing 0...Done!
Processing 1...Done!
Processing 10...Done!
Processing 11...Done!
Processing 12...Done!
Processing 13...Done!
Processing 14...Done!
Processing 15...Done!
Processing 16...Done!
Processing 17...Done!
Processing 18...Done!
Processing 19...Done!
Processing 2...Done!
Processing 20...Done!
Processing 21...Done!
Processing 22...Done!
Processing 23...Done!
Processing 24...Done!
Processing 25...Done!
Processing 3...Done!
Processing 4...Done!
Processing 5...Done!
Processing 6...Done!
Processing 7...Done!
Processing 8...Done!
Processing 9...Done!


In [None]:
# create blank image when no hands are detected
n_images = 1000
output_img = np.ones((224, 224, 3), np.uint8) * 255
for i in range(n_images):
    cv2.imwrite(os.path.join(processed_data_dir, '26', f'{i}.jpg'), output_img)

## ASL CNN model

### Define and train model

In [61]:
# imports for model training/testing
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [73]:
# split the data into test, validation train (15 15 70)
import random

# move 15% of files from processed_data_dir to test_data_dir
for dir in os.listdir(processed_data_dir):
    output_dir = os.path.join(test_data_dir, dir)
    os.makedirs(output_dir, exist_ok=True)

    files = os.listdir(os.path.join(processed_data_dir, dir))
    n_files = len(files)
    n_test_files = int(n_files * 0.15)

    random_files = random.sample(files, k=n_test_files)

    for file in random_files:
        os.rename(os.path.join(processed_data_dir, dir, file), os.path.join(test_data_dir, dir, file))

In [80]:
train_datagen = ImageDataGenerator(validation_split=0.1765, rescale=1./255, rotation_range=15, horizontal_flip=True, zoom_range=0.2, height_shift_range=0.2, width_shift_range=0.2, shear_range=0.2)

train_generator = train_datagen.flow_from_directory(
    processed_data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='training')

validation_generator = train_datagen.flow_from_directory(
    processed_data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation')

test_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    test_data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

# class labels
class_labels = train_generator.class_indices
print(class_labels)
class_labels = {v: k for k, v in class_labels.items()}
print('Class labels:', class_labels)

Found 18727 images belonging to 27 classes.
Found 4011 images belonging to 27 classes.
Found 4010 images belonging to 27 classes.
{'0': 0, '1': 1, '10': 2, '11': 3, '12': 4, '13': 5, '14': 6, '15': 7, '16': 8, '17': 9, '18': 10, '19': 11, '2': 12, '20': 13, '21': 14, '22': 15, '23': 16, '24': 17, '25': 18, '26': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26}
Class labels: {0: '0', 1: '1', 2: '10', 3: '11', 4: '12', 5: '13', 6: '14', 7: '15', 8: '16', 9: '17', 10: '18', 11: '19', 12: '2', 13: '20', 14: '21', 15: '22', 16: '23', 17: '24', 18: '25', 19: '26', 20: '3', 21: '4', 22: '5', 23: '6', 24: '7', 25: '8', 26: '9'}


##### Model definition

###### First model

In [75]:
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(27, activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [76]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [77]:
model.fit(train_generator, validation_data=validation_generator, epochs=10)

Epoch 1/10


  self._warn_if_super_not_called()


[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 979ms/step - accuracy: 0.1894 - loss: 2.7515 - val_accuracy: 0.8409 - val_loss: 0.5083
Epoch 2/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m471s[0m 800ms/step - accuracy: 0.8274 - loss: 0.5308 - val_accuracy: 0.9309 - val_loss: 0.2016
Epoch 3/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m460s[0m 782ms/step - accuracy: 0.9105 - loss: 0.2657 - val_accuracy: 0.9509 - val_loss: 0.1548
Epoch 4/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m460s[0m 781ms/step - accuracy: 0.9338 - loss: 0.1968 - val_accuracy: 0.9671 - val_loss: 0.1024
Epoch 5/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 806ms/step - accuracy: 0.9510 - loss: 0.1515 - val_accuracy: 0.9781 - val_loss: 0.0636
Epoch 6/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m472s[0m 802ms/step - accuracy: 0.9536 - loss: 0.1347 - val_accuracy: 0.9743 - val_loss: 0.0683
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x1a47839a2d0>

In [78]:
model.save('model5_generalized.keras')

### Test model

In [79]:
# load model
model = load_model('model5_generalized.keras')

#### Test with static image

In [81]:
# test accuracy of model on test data
test_loss, test_acc = model.evaluate(test_generator)
print('Test accuracy:', test_acc)

[1m  1/126[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:00[0m 484ms/step - accuracy: 0.9688 - loss: 0.0776

  self._warn_if_super_not_called()


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 320ms/step - accuracy: 0.9917 - loss: 0.0276
Test accuracy: 0.9910224676132202


In [83]:
# test image
image_rgb = cv2.imread(processed_data_dir + '/20/499.jpg')

added_dim_img = np.expand_dims(image_rgb, axis=0)

prediction = model.predict(added_dim_img)
predicted_class = np.argmax(prediction)

print(f'Predicted class: {class_labels[predicted_class]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
Predicted class: 20


#### Test with realtime video from webcam

In [84]:
cap = cv2.VideoCapture(0)

# function for preprocessing images
img_processing_func = [get_hand_landmarks_image, get_processed_image][0]

while True:
    ret, frame = cap.read()
    if not ret: continue

    # Preprocess the frame to match the input size of the model and scale the pixel values
    frame = frame[:, 80:560]
    frame = cv2.resize(frame, (224, 224))

    # get hand landmarks image with mediapipe
    _, hand_landmarks_img = img_processing_func(frame)
    
    # Add an extra dimension because the model expects batches of images
    added_dim_img = np.expand_dims(hand_landmarks_img, axis=0)

    # Use the model to predict the class of the frame
    prediction = model.predict(added_dim_img)

    # Get the class with the highest probability
    predicted_class = np.argmax(prediction)
    predicted_class_label = class_labels[predicted_class]
    
    # Display the predicted class on the frame and print it
    sign = ' ' if predicted_class_label == '26' else chr(65 + int(predicted_class_label))
    cv2.putText(hand_landmarks_img, sign, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 128), 2)

    # Display the resulting frame
    cv2.imshow('Hand Landmarks', hand_landmarks_img)
    
    # Check for 'q' or escape key press to exit the loop
    if cv2.waitKey(1) & 0xFF in [ord('q'), ord('Q'), 27]:
        break

cap.release()
cv2.destroyAllWindows()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27

In [54]:
cap.release()
cv2.destroyAllWindows()