In [None]:
! pip install tensorflow==2.9.2 tensorflow-gpu==2.9.2 opencv-python mediapipe sklearn matplotlib

In [37]:
import cv2 as cv
import numpy as np
import os
import time
from matplotlib import pyplot as plt
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

%matplotlib inline

### Keypoints using Mediapipe Holistic

In [38]:
# Shorthand handles for MediaPipe's Holistic solution and its drawing utilities;
# the drawing functions and the capture loops in this notebook use these names.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [39]:
def med_detection(image, model):
    """Run `model.process` on a BGR frame and return the frame with the results.

    The frame is converted BGR -> RGB, flagged read-only while the model
    processes it, then made writeable again and converted back RGB -> BGR.

    Args:
        image: BGR frame (as delivered by OpenCV capture).
        model: object exposing `process(rgb_image)`.

    Returns:
        A `(bgr_image, results)` tuple, where `results` is whatever
        `model.process` returned.
    """
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)

    # Lock the buffer for the duration of inference, then unlock it.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR)
    return bgr_frame, results

In [40]:
def draw_landmarks(image, results):
    """Draw face, pose and both hand landmark sets using the default drawing style.

    Drawing is delegated to `mp_drawing.draw_landmarks`, called once per
    landmark set in the order face, pose, left hand, right hand.
    Returns nothing.
    """
    overlays = (
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS),    # Face Connections
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),     # Pose Connections
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),   # Left Hand Connections
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),  # Right Hand Connections
    )
    for landmark_list, connections in overlays:
        mp_drawing.draw_landmarks(image, landmark_list, connections)

In [41]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks with a distinct colour scheme per part.

    Each landmark set is passed to `mp_drawing.draw_landmarks` together with
    two `DrawingSpec`s: the first styles the landmark points, the second the
    connections. Sets are drawn in the order face, pose, left hand, right hand.
    Returns nothing.

    Colour channels are kept within the 8-bit range 0-255 (the previous value
    of 256 in two specs was out of range).
    """
    spec = mp_drawing.DrawingSpec
    # (landmarks, connections, landmark style, connection style)
    layers = (
        # Face connections
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         spec(color=(80, 110, 10), thickness=1, circle_radius=2),
         spec(color=(80, 255, 121), thickness=1, circle_radius=2)),
        # Pose connections
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80, 22, 10), thickness=2, circle_radius=4),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand connections
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=4),
         spec(color=(121, 44, 255), thickness=2, circle_radius=2)),
        # Right hand connections
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=4),
         spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )
    for landmark_list, connections, landmark_spec, connection_spec in layers:
        mp_drawing.draw_landmarks(image, landmark_list, connections,
                                  landmark_spec, connection_spec)

In [42]:
# Live preview: run Holistic on webcam frames and show the styled overlay
# until 'q' is pressed or the camera stops delivering frames.
cap = cv.VideoCapture(0)
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, grabbed = cap.read()
            if not ret:
                # A failed grab yields no usable image; stop instead of
                # passing it on to cvtColor.
                break
            # Only overwrite `frame` with a good image so later cells that
            # reuse `frame` / `results` see the last valid capture.
            frame = grabbed

            image, results = med_detection(frame, holistic)
            if results.left_hand_landmarks:
                # Despite the name, this holds the LEFT-HAND landmark list.
                landmark_list_pose = results.left_hand_landmarks.landmark

            # Draw landmarks
            draw_styled_landmarks(image, results)

            cv.imshow('OpenCV Feed', image)
            if cv.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv.destroyAllWindows()

In [43]:
len(landmark_list_pose)

21

In [None]:
draw_styled_landmarks(frame, results)

In [None]:
plt.imshow(cv.cvtColor(frame, cv.COLOR_BGR2RGB))

### Extract Keypoint Values

In [None]:
def _flatten_landmarks(landmark_list, zero_size, with_visibility=False):
    """Flatten one landmark list to a 1-D float array, or zeros if it is missing.

    Args:
        landmark_list: object with a `.landmark` iterable, or a falsy value
            when that body part was not detected.
        zero_size: length of the zero vector returned for a missing part.
        with_visibility: append each landmark's `visibility` after x, y, z.
    """
    if not landmark_list:
        return np.zeros(zero_size)
    if with_visibility:
        rows = [[lm.x, lm.y, lm.z, lm.visibility] for lm in landmark_list.landmark]
    else:
        rows = [[lm.x, lm.y, lm.z] for lm in landmark_list.landmark]
    # Build the array once instead of growing it with np.append per landmark.
    return np.array(rows, dtype=float).flatten()


def extract_keypoints(results):
    """Concatenate pose, face, left-hand and right-hand keypoints into one vector.

    Layout of the returned 1-D array, in order:
        pose  -> x, y, z, visibility per landmark (zeros(33*4) if missing)
        face  -> x, y, z per landmark             (zeros(468*3) if missing)
        left  -> x, y, z per landmark             (zeros(21*3) if missing)
        right -> x, y, z per landmark             (zeros(21*3) if missing)

    When a part is present its segment length follows the number of landmarks
    supplied; the zero-fill sizes assume 33 / 468 / 21 landmarks, which gives
    1662 values in total.

    Args:
        results: object exposing `pose_landmarks`, `face_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks`.

    Returns:
        np.ndarray: the flattened keypoint vector.
    """
    pose = _flatten_landmarks(results.pose_landmarks, 33 * 4, with_visibility=True)
    face = _flatten_landmarks(results.face_landmarks, 468 * 3)
    lh = _flatten_landmarks(results.left_hand_landmarks, 21 * 3)
    rh = _flatten_landmarks(results.right_hand_landmarks, 21 * 3)
    return np.concatenate([pose, face, lh, rh])

In [None]:
extract_keypoints(results)

In [None]:
extract_keypoints(results).shape

In [None]:
result_test = extract_keypoints(results)

In [None]:
np.save('result_test', result_test)

### Setting up Folders for collection

In [3]:
# Root folder for the exported keypoint arrays (one sub-folder per action, then per sequence)
PATH = os.path.join('MP_Data')

# Actions we try to detect
actions = ['A', 'B', 'C', 'hello', 'thanks', 'iloveyou']


no_sequences = 30 # number of sequences (recordings) collected per action
sequence_length = 30 # number of frames in each sequence

In [4]:
# Create PATH/<action>/<sequence>/ for every action and sequence index.
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True keeps the cell re-runnable without a bare `except`
        # that would also hide real failures (e.g. permission errors).
        os.makedirs(os.path.join(PATH, action, str(sequence)), exist_ok=True)

### Collecting keypoint values for Training and Testing

In [None]:
# Record `sequence_length` frames for each of `no_sequences` sequences of every
# action, saving one keypoint array per frame to PATH/<action>/<sequence>/<frame>.npy.
# Pressing 'q' (or a failed camera read) aborts the WHOLE collection run.
# Sequences left incomplete by an abort will be missing frame files, so
# re-record them before running the loading cell.
cap = cv.VideoCapture(0)
stop_requested = False
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for action in actions:
            for sequence in range(no_sequences):
                for frame_num in range(sequence_length):

                    ret, frame = cap.read()
                    if not ret:
                        # No usable image; stop rather than crash in cvtColor.
                        stop_requested = True
                        break

                    image, results = med_detection(frame, holistic)
                    if results.left_hand_landmarks:
                        landmark_list_pose = results.left_hand_landmarks.landmark

                    # Draw landmarks
                    draw_styled_landmarks(image, results)

                    # Collection logic: the first frame of a sequence shows a
                    # banner and pauses for one second before recording continues.
                    if frame_num == 0:
                        cv.putText(image, 'STARTING COLLECTION', (120,200),
                                   cv.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv.LINE_AA)
                    cv.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv.LINE_AA)
                    cv.imshow('OpenCV Feed', image)
                    if frame_num == 0:
                        cv.waitKey(1000)

                    # Export keypoints
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    if cv.waitKey(10) & 0xFF == ord('q'):
                        stop_requested = True
                        break

                # Propagate the stop out of the nested loops; a bare `break`
                # would only end the current sequence.
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even on error.
    cap.release()
    cv.destroyAllWindows()

### Data Preprocessing and creating labels and features

In [5]:
# Map each action name to its integer class index (its position in `actions`)
label_map = {label:num for num, label in enumerate(actions)}

In [6]:
label_map

{'A': 0, 'B': 1, 'C': 2, 'hello': 3, 'thanks': 4, 'iloveyou': 5}

In [7]:
# Read every saved frame back from disk.
# sequences --> X data: one window of `sequence_length` keypoint arrays per recording
# labels    --> y data: the class index of each window
sequences, labels = [], []
for action in actions:
    class_index = label_map[action]
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(PATH, action, str(sequence))
        window = [
            np.load(os.path.join(sequence_dir, "{}.npy".format(frame_num)))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(class_index)

In [8]:
np.array(sequences).shape

(180, 30, 1662)

In [9]:
X = np.array(sequences)

In [10]:
y = to_categorical(labels).astype(int)

In [11]:
# Hold out 5% of the sequences for testing. The fixed random_state makes the
# split (and every metric computed on it) identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

### LSTM Neural Network training

In [12]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [13]:
np.shape(actions)[0]

6

In [14]:
res = [.7,.2,.1]
actions[np.argmax(res)]

'A'

In [32]:
# Classifier: two stacked LSTM layers followed by two dense layers and a
# softmax output with one unit per entry in `actions`.
model_new = Sequential([
    LSTM(128, return_sequences=True, activation='relu', input_shape=(30,1662)),
    LSTM(128, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(actions), activation='softmax'),
])



In [33]:
np.shape(actions)[0]

6

In [34]:
# Multi-class setup: categorical cross-entropy against the one-hot labels from
# to_categorical, reporting categorical accuracy during training.
model_new.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model_new.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 30, 128)           916992    
                                                                 
 lstm_10 (LSTM)              (None, 128)               131584    
                                                                 
 dense_12 (Dense)            (None, 64)                8256      
                                                                 
 dense_13 (Dense)            (None, 32)                2080      
                                                                 
 dense_14 (Dense)            (None, 6)                 198       
                                                                 
Total params: 1,059,110
Trainable params: 1,059,110
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train on the training split; tb_callback writes TensorBoard logs to `log_dir`.
# NOTE(review): 1500 epochs is an upper bound — the recorded run below was stopped by hand.
model_new.fit(X_train, y_train, epochs=1500, callbacks=[tb_callback])

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500

KeyboardInterrupt: 

Accuracy -> 0.8889 after 180 epochs

In [None]:
# The network built and trained in this notebook is `model_new`; the name
# `model` is never defined here and fails on a fresh kernel.
model_new.summary()

### Predictions

In [None]:
# Class probabilities for the held-out sequences, from the trained `model_new`
# (`model` is never defined in this notebook).
res = model_new.predict(X_test)

In [None]:
actions[np.argmax(res[3])]

In [None]:
actions[np.argmax(y_test[3])]

### Saving the trained model

In [None]:
# Persist the trained `model_new` (`model` is never defined in this notebook).
model_new.save('action.h5')

In [None]:
# del model
# model.load_weights('action.h5')

### Model Evaluation

In [None]:
# Predict with the trained `model_new` (`model` is never defined in this notebook).
y_predicted = model_new.predict(X_test)

In [None]:
# Collapse the one-hot rows of y_test and the per-class scores in y_predicted to class indices.
# NOTE(review): y_predicted is overwritten here, so this cell is not idempotent —
# re-run the predict cell above before re-running this one.
y_true = np.argmax(y_test, axis=1).tolist()
y_predicted = np.argmax(y_predicted, axis=1).tolist()

In [None]:
# One 2x2 confusion matrix per class. Passing `labels` explicitly keeps matrix i
# aligned with actions[i] even when the small test split is missing a class.
multilabel_confusion_matrix(y_true, y_predicted, labels=list(range(len(actions))))

In [None]:
accuracy_score(y_true, y_predicted)

### Real Time Testing