## 1. Import and Install Dependencies

In [2]:
# %pip install tensorflow opencv-python mediapipe scikit-learn matplotlib

In [3]:
import cv2
import numpy as np
import os
from matplotlib import pyplot
import time
import mediapipe

## 2. Keypoints using MP Holistic

In [4]:
# Holistic solution module: provides the Holistic model class and the connection sets used for drawing
mp_holistic = mediapipe.solutions.holistic
# Drawing helpers used to render the detected landmarks onto a frame
mp_drawing = mediapipe.solutions.drawing_utils

In [5]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on one OpenCV frame.

    Args:
        image: Frame in OpenCV's BGR channel order (e.g. from ``cap.read()``).
        model: Object exposing ``process(rgb_image)``, such as a Holistic instance.

    Returns:
        Tuple ``(image, results)``: the frame converted back to BGR, and
        whatever ``model.process`` returned for it.
    """
    # The model is fed an RGB image, whereas OpenCV delivers BGR.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Read-only while the model runs (the MediaPipe examples do this as a
    # performance hint); writable again afterwards.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Back to BGR so the frame can be drawn on and shown with OpenCV.
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [6]:
def draw_landmarks(image, results):
    """Draw all four landmark groups onto ``image`` with default styling.

    Args:
        image: Frame handed to ``mp_drawing.draw_landmarks`` as the canvas.
        results: Detection result exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # (attribute on `results`, connection set on `mp_holistic`), in drawing order.
    layers = (
        ('face_landmarks', 'FACEMESH_TESSELATION'),
        ('pose_landmarks', 'POSE_CONNECTIONS'),
        ('left_hand_landmarks', 'HAND_CONNECTIONS'),
        ('right_hand_landmarks', 'HAND_CONNECTIONS'),
    )
    for landmark_attr, connection_attr in layers:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, landmark_attr),
            getattr(mp_holistic, connection_attr),
        )

In [7]:
def draw_styled_landmarks(image, results):
    """Draw all four landmark groups onto ``image`` with per-group colours.

    Args:
        image: Frame handed to ``mp_drawing.draw_landmarks`` as the canvas.
        results: Detection result exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # One row per group, in drawing order:
    #   landmark attribute, connection set,
    #   joint spec (color, thickness, circle_radius),
    #   line spec  (color, thickness, circle_radius)
    styles = (
        ('face_landmarks', 'FACEMESH_TESSELATION',
         ((68, 64, 71), 1, 1), ((119, 155, 0), 1, 1)),
        ('pose_landmarks', 'POSE_CONNECTIONS',
         ((80, 22, 10), 2, 4), ((80, 44, 121), 2, 2)),
        ('left_hand_landmarks', 'HAND_CONNECTIONS',
         ((121, 22, 76), 2, 4), ((121, 44, 250), 2, 2)),
        ('right_hand_landmarks', 'HAND_CONNECTIONS',
         ((245, 117, 66), 2, 4), ((245, 66, 230), 2, 2)),
    )
    for landmark_attr, connection_attr, joint_style, line_style in styles:
        landmarks = getattr(results, landmark_attr)
        connections = getattr(mp_holistic, connection_attr)
        joint_color, joint_thickness, joint_radius = joint_style
        line_color, line_thickness, line_radius = line_style
        mp_drawing.draw_landmarks(
            image,
            landmarks,
            connections,
            mp_drawing.DrawingSpec(color=joint_color, thickness=joint_thickness, circle_radius=joint_radius),
            mp_drawing.DrawingSpec(color=line_color, thickness=line_thickness, circle_radius=line_radius),
        )

In [8]:
# cap = cv2.VideoCapture(0) # access webcam

# # Set mediapipe model
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened(): # while accessing webcam
#         # read feed (the current frame)
#         ret, frame = cap.read() 
        
#         # Make detections
#         image, results = mediapipe_detection(frame, holistic) # holistic: model

#         # Draw landmarks
#         draw_styled_landmarks(image, results)
        
#         # show to screen (frame name, actual frame)
#         cv2.imshow('OpenCV Feed', image) 
        
#         # condition to close gracefully WHEN:
#         #     waited for 0.01 sec for a keypress & keypress is 'q', OR
#         #     the [X] button on the window is clicked
#         if (cv2.waitKey(10) & 0xFF == ord('q')) or (cv2.getWindowProperty('OpenCV Feed', cv2.WND_PROP_VISIBLE) < 1): 
#             break
        
# cap.release()
# cv2.destroyAllWindows()

## 3. Extract Keypoint Values

In [9]:
# pose = []
# for res in results.pose_landmarks.landmark:
#     test = np.array([res.x, res.y, res.z, res.visibility])
#     pose.append(test)

In [10]:
def extract_keypoints(results):
    """Flatten one detection result into a single 1-D feature vector.

    The vector is the concatenation, in this order, of:
      * pose:       33 landmarks x (x, y, z, visibility)
      * face:       468 landmarks x (x, y, z)
      * left hand:  21 landmarks x (x, y, z)
      * right hand: 21 landmarks x (x, y, z)

    A group that is missing (falsy) on ``results`` contributes zeros of the
    size listed above, so the layout does not depend on what was detected.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` iterable.

    Returns:
        1-D ``numpy.ndarray`` with the four groups concatenated.
    """
    def _flatten_group(group, fields, expected_points):
        # Zero placeholder keeps the slot for an undetected body part.
        if not group:
            return np.zeros(expected_points * len(fields))
        rows = [[getattr(point, field) for field in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    parts = [
        _flatten_group(results.pose_landmarks, xyz + ('visibility',), 33),
        _flatten_group(results.face_landmarks, xyz, 468),
        _flatten_group(results.left_hand_landmarks, xyz, 21),
        _flatten_group(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)

## 4. Setup Folders for Collection

In [11]:
# Root folder holding the exported keypoint arrays (.npy files)
DATA_PATH = 'MP_Data'

# Signs the model is trained to recognise.
# Larger vocabulary kept for reference:
# actions = np.array(['hello', 'thanks', 'iloveyou', 'I', 'you', 'deaf', 'hearing', 'what_question', 'what_relative_clause'])
actions = np.array(('deaf', 'hearing', 'thanks'))

# Number of recordings ("sequences") per sign
no_sequences = 30

# Number of frames in each recording
sequence_length = 30


In [12]:
# Create one folder per recording, laid out as
#   <DATA_PATH>/<action>/<sequence index>/
# e.g. MP_Data/deaf/0 ... MP_Data/deaf/29, MP_Data/hearing/0 ...
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True keeps the cell safe to re-run, while real failures
        # (permissions, invalid path, a file in the way) still raise instead
        # of being silently swallowed.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

## 5. Collect Keypoint Values for Training and Testing

In [13]:
# cap = cv2.VideoCapture(0) # access webcam

# # Set mediapipe model
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     breakout = False
#     # Loop through actions
#     for action in actions:
#         if breakout:
#             break
#         # Loop through sequences aka videos
#         for sequence in range(no_sequences):
#             if breakout:
#                 break
#             # Loop through video length aka sequence length
#             for frame_num in range(sequence_length):
            
#                 # read feed (the current frame)
#                 ret, frame = cap.read() 
                
#                 # Make detections
#                 image, results = mediapipe_detection(frame, holistic) # holistic: model
        
#                 # Draw landmarks
#                 draw_styled_landmarks(image, results)
    
#                 # Apply wait logic
#                 if frame_num == 0:
#                     cv2.putText(image, 'STARTING COLLECTION', (120, 200),
#                                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4, cv2.LINE_AA)
#                     cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
#                                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
#                     cv2.waitKey(500)
#                 else:
#                     cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
#                                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
#                 # Export keypoints
#                 keypoints = extract_keypoints(results)
#                 npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
#                 np.save(npy_path, keypoints)
                                
#                 # show to screen (frame name, actual frame)
#                 cv2.imshow('OpenCV Feed', image) 
                
#                 # condition to close gracefully WHEN:
#                 #     waited for 0.01 sec for a keypress & keypress is 'q', OR
#                 #     the [X] button on the window is clicked
#                 if (cv2.waitKey(10) & 0xFF == ord('q')) or (cv2.getWindowProperty('OpenCV Feed', cv2.WND_PROP_VISIBLE) < 1): 
#                     breakout = True
#                     break
            
#     cap.release()
#     cv2.destroyAllWindows()

## 6. Preprocess Data and Create Labels and Features

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [15]:
# Map each action name to its integer class id (its position in `actions`)
label_map = dict(zip(actions, range(len(actions))))

In [16]:
# Show the mapping as a sanity check.
# NOTE(review): the saved output lists two classes while `actions` defines three — re-run to refresh.
label_map

{'deaf': 0, 'hearing': 1}

In [17]:
# Read the saved per-frame arrays back from disk: one window (list of
# `sequence_length` arrays) per recording, plus the recording's class id.
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
        window = [
            np.load(os.path.join(sequence_dir, "{}.npy".format(frame_num)))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [18]:
# Stack all windows into one array; presumably (recordings, sequence_length, 1662)
# to match the model's input_shape — TODO confirm against the saved data
x = np.array(sequences)
# One-hot encode the integer class ids and store them as integers
y = to_categorical(labels).astype(int)

In [19]:
# Hold out 5% of the recordings for testing. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# kernel restarts.
# NOTE(review): the saved confusion matrix covers only 3 test samples, so the
# reported accuracy is very noisy; consider a larger test_size.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)

## 7. Build and Train LSTM Neural Network

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [21]:
# Folder that receives the TensorBoard event files written during training
log_dir = os.path.join('Logs')
# Callback passed to model.fit so training can be monitored in TensorBoard
tb_callback = TensorBoard(log_dir=log_dir)

In [22]:
# Stacked-LSTM classifier over windows of per-frame keypoint vectors.
# The time dimension follows `sequence_length`, so the model stays in sync
# with the data if the config changes.
# 1662 features per frame = 33*4 (pose) + 468*3 (face) + 21*3 + 21*3 (hands),
# matching extract_keypoints.
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, 1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))  # last LSTM emits only its final state
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))  # one probability per action

In [23]:
# Categorical cross-entropy pairs with the one-hot targets produced by to_categorical
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [24]:
# Train for 90 epochs on the training split; tb_callback logs progress for TensorBoard
model.fit(x_train, y_train, epochs=90, callbacks=[tb_callback])

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
E

<keras.src.callbacks.History at 0x20a64413bd0>

In [25]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 64)            442112    
                                                                 
 lstm_1 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
Total params: 596642 (2.28 MB)
Trainable params: 596642 

## 8. Make Predictions

In [26]:
# Softmax outputs for every held-out sample (one row per sample, one column per action)
res = model.predict(x_test)



In [27]:
# Spot check: predicted action for the test sample at index 2
actions[np.argmax(res[2])]

'hearing'

In [28]:
# Spot check: ground-truth action for the same test sample (index 2)
actions[np.argmax(y_test[2])]

'hearing'

In [29]:
# res[np.argmax(res)]

## 9. Save Weights

In [30]:
# model.save('action.keras') # save model

In [31]:
#model.load_weights('action.keras') # reload model after initializing and compiling (7)

## 10. Evaluation using Confusion Matrix and Accuracy

In [32]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [33]:
# Softmax outputs of the model for the held-out set
yhat = model.predict(x_test)



In [34]:
# Collapse one-hot targets and softmax outputs to integer class ids.
# The predictions are recomputed here rather than read back from `yhat`:
# overwriting `yhat` with its own argmax made this cell fail on a second run,
# because `yhat` was by then a flat list with no axis 1.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(model.predict(x_test), axis=1).tolist()

In [35]:
# One 2x2 matrix per class, laid out as [[TN, FP], [FN, TP]] (see the scikit-learn docs)
multilabel_confusion_matrix(ytrue, yhat)

array([[[2, 0],
        [1, 0]],

       [[0, 1],
        [0, 2]]], dtype=int64)

In [36]:
# Fraction of held-out samples whose predicted class matches the true class
accuracy_score(ytrue, yhat)

0.6666666666666666

## 11. Test in Real Time

In [40]:
# 1. Detection variables
sequence = []    # rolling window of the most recent keypoint vectors
sentence = []    # recognised signs shown in the banner (last five at most)
threshold = 0.4  # minimum softmax score for a prediction to be accepted

cap = cv2.VideoCapture(0) # access webcam

try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened(): # while accessing webcam
            # read feed (the current frame)
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera unplugged / stream ended);
                # passing None on to cv2.cvtColor would raise.
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic) # holistic: model

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the latest `sequence_length` frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            # Predict only once a full window is available
            if len(sequence) == sequence_length:
                # verbose=0 suppresses the per-call progress output
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = np.argmax(res)
                print(actions[best])

                # 3. Visualization logic: append a confident prediction unless
                # it merely repeats the sign already at the end of the sentence
                if res[best] > threshold:
                    if len(sentence) > 0:
                        if actions[best] != sentence[-1]:
                            sentence.append(actions[best])
                    else:
                        sentence.append(actions[best])

                if len(sentence) > 5:
                    sentence = sentence[-5:]

            # Banner with the recognised signs
            cv2.rectangle(image, (0,0), (640,40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

            # show to screen (frame name, actual frame)
            cv2.imshow('OpenCV Feed', image)

            # condition to close gracefully WHEN:
            #     waited for 0.01 sec for a keypress & keypress is 'q', OR
            #     the [X] button on the window is clicked
            if (cv2.waitKey(10) & 0xFF == ord('q')) or (cv2.getWindowProperty('OpenCV Feed', cv2.WND_PROP_VISIBLE) < 1):
                break
finally:
    # Always free the camera and close the window, even if a frame raised
    cap.release()
    cv2.destroyAllWindows()

hearing
hearing
hearing
hearing
hearing
hearing
deaf
deaf
hearing
hearing
hearing
deaf
deaf
deaf
hearing
hearing
hearing
deaf
deaf
deaf
deaf
hearing
deaf
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
hearing
deaf
deaf
hearing
hearing
hearing
deaf
deaf
deaf
hearing
deaf
hearing
deaf
hearing
hearing
hearing
deaf
deaf
deaf
hearing
hearing
hearing
deaf
hearing
hearing
deaf
hearing
deaf
hearing
deaf
deaf
deaf
deaf
deaf
hearing
hearing
deaf
deaf
deaf
deaf
deaf
hearing
hearing
deaf
deaf
deaf
deaf
hearing
hearing
hearing
hearing
hearing
hearing


In [38]:
model.predict??

[1;31mSignature:[0m
[0mmodel[0m[1;33m.[0m[0mpredict[0m[1;33m([0m[1;33m
[0m    [0mx[0m[1;33m,[0m[1;33m
[0m    [0mbatch_size[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0msteps[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcallbacks[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmax_queue_size[0m[1;33m=[0m[1;36m10[0m[1;33m,[0m[1;33m
[0m    [0mworkers[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0muse_multiprocessing[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
    [1;33m@[0m[0mtraceback_utils[0m[1;33m.[0m[0mfilter_traceback[0m[1;33m
[0m    [1;32mdef[0m [0mpredict[0m[1;33m([0m[1;33m
[0m        [0mself[0m[1;33m,[0m[1;33m
[0m        [0mx[0m[1;33m,[0m[1;33m
[0m        [0mbatch_size[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0