# Preliminary Notebook

## 1. Import and Install Dependencies

In [1]:
%pip install tensorflow-macos opencv-python mediapipe-silicon sklearn matplotlib
#!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import cv2 # opencv
import numpy as np
import os # easier file path handling
from matplotlib import pyplot as plt # im.show for easy visualization
import time # to insert "sleep" in between frames
import mediapipe as mp # for accessing and reading from webcam

objc[8762]: Class CaptureDelegate is implemented in both /Users/jin-holee/neuefische/Capstone_Project_SignMeUp/.venv/lib/python3.9/site-packages/cv2/cv2.abi3.so (0x12e7b25a0) and /Users/jin-holee/neuefische/Capstone_Project_SignMeUp/.venv/lib/python3.9/site-packages/mediapipe/.dylibs/libopencv_videoio.3.4.16.dylib (0x12aa88860). One of the two will be used. Which one is undefined.
objc[8762]: Class CVWindow is implemented in both /Users/jin-holee/neuefische/Capstone_Project_SignMeUp/.venv/lib/python3.9/site-packages/cv2/cv2.abi3.so (0x12e7b25f0) and /Users/jin-holee/neuefische/Capstone_Project_SignMeUp/.venv/lib/python3.9/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x129cb8a68). One of the two will be used. Which one is undefined.
objc[8762]: Class CVView is implemented in both /Users/jin-holee/neuefische/Capstone_Project_SignMeUp/.venv/lib/python3.9/site-packages/cv2/cv2.abi3.so (0x12e7b2618) and /Users/jin-holee/neuefische/Capstone_Project_SignMeUp/.venv/lib/pytho

## 6. Preprocess Data and Create Labels and Features

In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

### Some Global Stuff

In [4]:
# path for exported data (numpy arrays)
DATA_PATH = os.path.join('MP_Data_test')
# DATA_PATH = os.path.join('MP_Data)

# actions to detect
actions = np.array(['hello', 'thanks', 'iloveyou'])

# 30 videos of data
no_sequences = 30

# each video with 30 frames
sequence_length = 30

In [5]:
# create label map (dict, our .json file)
label_map = {label:num for num, label in enumerate(actions)}

### Loading Data

In [6]:
sequences, labels = [], [] # sequences will be x data, labels will be y data
# loop over all actions (words)
for action in actions: 
    # loop over all sequences (videos)
    for sequence in range(no_sequences): 
        window = [] # represents all frames of particular sequence (video)
        # loop through each frame
        for frame_num in range(sequence_length): 
            # load up current frame (frame_num)
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res) # append to one video
        sequences.append(window) # append all videos to sequence (for a word)
        labels.append(label_map[action])

In [7]:
X = np.array(sequences) # shape: (90, 30, 1662)

In [8]:
y = to_categorical(labels).astype(int) # one-hot encoded labels (words)
# shape: (90, 3)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

## Load Saved Model

In [10]:
# re-initialize the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

# compile the model
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])


In [11]:
model.load_weights('action.h5')

## 10. Evaluation using Confusion Matrix and Accuracy

In [12]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [13]:
yhat = model.predict(X_test)

2023-04-05 19:52:09.790489: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [14]:
# convert one-hot encoded categories to labels, e.g. 0, 1 and 2
# instead of [1,0,0], [0,1,0], [0,0,1]
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [15]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[1, 0],
        [4, 0]],

       [[4, 0],
        [0, 1]],

       [[1, 4],
        [0, 0]]])

In [16]:
accuracy_score(ytrue, yhat)

0.2

## 2. Keypoints using MP Holistic (again needed here)

In [17]:
mp_holistic = mp.solutions.holistic # holistic model
mp_drawing = mp.solutions.drawing_utils # drawing utilities

In [18]:
def mediapipe_detection(image, model): 
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # color conversion BGR to RGB
    image.flags.writeable = False                   # image no longer writeable
    results = model.process(image)                  # make prediction
    image.flags.writeable = True                    # image is writeable again
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # color conversion back to original
    return image, results


In [19]:
def draw_landmarks(image, results): 
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION) # draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # draw right hand connections

In [20]:
def draw_styled_landmarks(image, results): 
    # draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, 
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                              mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
    # draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, 
                              mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                              mp_drawing.DrawingSpec(color=(80,256,121), thickness=2, circle_radius=2)) 
    # draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                              mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                              mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)) 
    # draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                              mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)) 

## 3. Extract Keypoint Values (again needed here)

In [21]:
def extract_keypoints(results): 
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4) # x, y, z and extra value visibility
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])
    # a flattened list with list of all pose, face, lh, rh landmark x, y, z, (+visibility) coordinates

## 11. Test in Real Time

In [22]:
colors = [(245, 117, 16), (117, 245, 16), (16, 117, 245)] # colors for each word (bars)
def prob_viz(res, actions, input_frame, colors): 
    output_frame = input_frame.copy()
    for num, prob in enumerate(res): 
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
    
    return output_frame

In [23]:
# 1. New detection variables 
sequence = [] # to collect all 30 frames for prediction
sentence = [] # history of all predictions (predicted words)
predictions = []
threshold = 0.4 # confidence metrics (only render prediction results, if confidence is above threshold)

cap = cv2.VideoCapture(0) # grabbing webcam
# set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: 
    while cap.isOpened(): # loop through all frames 

        # read feed
        ret, frame = cap.read()

        # make detections 
        image, results = mediapipe_detection(frame, holistic)
        #print(results)

        # draw landmarks
        #draw_landmarks(image, results)
        draw_styled_landmarks(image, results)

        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:] # takes last thirty frames

        if len(sequence) == 30: 
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            # np.expand_dims to adjust input for a single sequence (word) as it would be of shape (30, 1662)
            # but the model.fit() expects something in shape (num_sequences, 30, 1662), e.g. (1, 30, 1662) for a single sequence
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))

        # 3. Visualization logic
            # makes sure the last 15 frames had the same prediction (more stable transition from one sign to another) 
            if np.unique(predictions[-15:])[0]==np.argmax(res): 
                # if the confidence of the most confident prediction is above threshold
                if res[np.argmax(res)] > threshold: 
                    # if there is already a last prediction
                    if len(sentence) > 0: 
                        # only append the predicted word, if it differs from the last prediction (prevent double actions)
                        if actions[np.argmax(res)] != sentence[-1]: 
                            sentence.append(actions[np.argmax(res)])
                    # just append if there is no last prediction (first prediction)
                    else: 
                        sentence.append(actions[np.argmax(res)])

            # limit the history to the last 5 predictions
            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # viz probabilities
            image = prob_viz(res, actions, image, colors)

        # some rendering
        cv2.rectangle(image, (0, 0), (640, 40), (245, 11, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # show to screen
        cv2.imshow("OpenCV Feed", image)

        # break gracefully 
        if cv2.waitKey(10) & 0xFF == ord('q'): 
            break 
    cap.release()
    cv2.destroyAllWindows() 

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
thanks
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou