### Install dependencies

In [1]:
!pip install tensorflow tensorflow-gpu opencv-python mediapipe sklearn matplotlib

Collecting tensorflow
  Downloading tensorflow-2.9.1-cp310-cp310-win_amd64.whl (444.1 MB)
     -------------------------------------- 444.1/444.1 MB 1.5 MB/s eta 0:00:00
Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.9.1-cp310-cp310-win_amd64.whl (444.1 MB)
     -------------------------------------- 444.1/444.1 MB 1.3 MB/s eta 0:00:00
Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-win_amd64.whl (35.6 MB)
     ---------------------------------------- 35.6/35.6 MB 1.7 MB/s eta 0:00:00
Collecting mediapipe
  Downloading mediapipe-0.8.10-cp310-cp310-win_amd64.whl (48.6 MB)
     ---------------------------------------- 48.6/48.6 MB 1.7 MB/s eta 0:00:00
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting matplotlib
  Downloading matplotlib-3.5.2-cp310-cp310-win_amd64.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 

### Import dependencies and define helper functions

In [7]:
import os, time, cv2 
import numpy as np 
import mediapipe as mp 
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

# Short aliases for the MediaPipe modules used throughout the notebook:
# the holistic solution (model class and connection sets) and the landmark drawing helpers.
mp_holistic = mp.solutions.holistic 
mp_drawing = mp.solutions.drawing_utils 

def detect_features(image, model):
    """Run ``model`` on a BGR frame and hand back a BGR frame plus the detections.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (here a MediaPipe Holistic instance).

    Returns:
        Tuple ``(bgr_frame, detections)`` where ``bgr_frame`` is the frame converted
        RGB -> BGR again and ``detections`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The frame is flagged read-only only for the duration of the process() call
    # (presumably so the model can use the buffer without copying it).
    rgb_frame.flags.writeable = False
    detections = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detections

def display_landmarks(image, result):
    """Draw the pose and both hand skeletons from ``result`` onto ``image`` in place.

    Args:
        image: Frame to draw on; it is modified directly.
        result: Holistic detection result exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Returns:
        None.
    """
    # One style for landmark points and one for connection lines, shared by all three overlays.
    landmark_style = mp_drawing.DrawingSpec(color=(252,198,3), thickness=2, circle_radius=1)
    connection_style = mp_drawing.DrawingSpec(color=(255,255,255), thickness=1, circle_radius=2)

    # Read from the `result` argument rather than a same-looking global `results`,
    # so the function works regardless of which notebook cells ran before it.
    overlays = (
        (result.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (result.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (result.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmarks, connections in overlays:
        mp_drawing.draw_landmarks(image, landmarks, connections, landmark_style, connection_style)

def extract_keypoints(results):
    """Flatten pose and hand landmarks into one fixed-length feature vector.

    Layout of the returned 1-D array (258 values):
        * 33 pose landmarks  x (x, y, z, visibility) -> 132 values
        * 21 left-hand landmarks  x (x, y, z)        ->  63 values
        * 21 right-hand landmarks x (x, y, z)        ->  63 values

    Any landmark group that is missing (falsy) on ``results`` is filled with zeros.
    """
    def _flatten(landmark_list, fields, count):
        # A missing group becomes a zero block of the size the group would have had.
        if not landmark_list:
            return np.zeros(count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in landmark_list.landmark]
        return np.array(rows).flatten()

    pose = _flatten(results.pose_landmarks, ('x', 'y', 'z', 'visibility'), 33)
    lh = _flatten(results.left_hand_landmarks, ('x', 'y', 'z'), 21)
    rh = _flatten(results.right_hand_landmarks, ('x', 'y', 'z'), 21)
    return np.concatenate([pose, lh, rh])

def prob_viz(res, actions, input_frame):
    """Overlay one probability bar and label per class on a copy of the frame.

    Args:
        res: Iterable of per-class probabilities.
        actions: Sequence of class labels, indexed in step with ``res``.
        input_frame: Frame to annotate; it is copied, not modified.

    Returns:
        The annotated copy. Row ``i`` starts 60 + 40*i pixels from the top; its filled
        white bar is ``int(prob * 100)`` pixels wide and the label is drawn in black.
    """
    canvas = input_frame.copy()
    for row, probability in enumerate(res):
        top = 60 + row * 40
        bar_end = (int(probability * 100), top + 30)
        cv2.rectangle(canvas, (0, top), bar_end, (255, 255, 255), -1)
        cv2.putText(canvas, actions[row], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,0), 2, cv2.LINE_AA)
    return canvas

# Gesture classes handled by the notebook; a label's position in this list is its class index.
labels = [
    '1',
    '2',
    'Amar',
    'Salam',
    'Nam',
]
# Same labels as a NumPy array, for indexing with argmax results.
actions = np.asarray(labels)

### Demo

In [11]:
# Live preview: show the camera feed with the detected landmarks until 'q' is pressed.
video = cv2.VideoCapture(1)  # capture device index 1 (NOTE: machine-specific; the default camera is usually 0)
try:
    with mp_holistic.Holistic(
        model_complexity=1,
        min_detection_confidence=0.6,
        min_tracking_confidence=0.6
    ) as holistic:
        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                # No frame was delivered (camera unplugged/busy); passing None on would crash cvtColor.
                break
            image, results = detect_features(frame, holistic)
            display_landmarks(image, results)
            cv2.imshow("Tracking Demo", image)
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised an exception.
    video.release()
    cv2.destroyAllWindows()

### Setup dataset folder

In [12]:
# Root folder of the dataset; layout is <DATA_PATH>/<label>/<video_number>/<frame_num>.npy
DATA_PATH = os.path.join('Dataset')

number_of_videos = 30   # videos recorded per run of the collection cell
number_of_frames = 30   # frames stored per video

# Label to record in this run; must be one of `labels` (labels.index raises ValueError otherwise).
inputFor = '2'
current_input = labels.index(inputFor)

# Count the existing video folders under DATA_PATH itself (not a separately hard-coded
# 'Dataset/...' string) so that changing DATA_PATH keeps counting and creation in step.
directory = os.path.join(DATA_PATH, inputFor)
start_folder = len(next(os.walk(directory))[1]) if os.path.exists(directory) else 0

# Create the next batch of video folders. makedirs deliberately fails if a folder already
# exists, so a numbering gap cannot silently lead to overwriting earlier recordings.
for video_number in range(start_folder, start_folder + number_of_videos):
    os.makedirs(os.path.join(DATA_PATH, labels[current_input], str(video_number)))

### Collect keypoints as features

In [13]:
# Record `number_of_videos` clips of `number_of_frames` frames each for the current label and
# store the keypoint vector of every frame as <DATA_PATH>/<label>/<video_number>/<frame_num>.npy.
cap = cv2.VideoCapture(1)
try:
    with mp_holistic.Holistic(
        model_complexity=1,
        min_detection_confidence=0.6,
        min_tracking_confidence=0.6
    ) as holistic:
        stop_requested = False
        for video_number in range(start_folder, start_folder + number_of_videos):
            for frame_num in range(number_of_frames):
                ret, frame = cap.read()
                if not ret:
                    # Fail loudly: silently skipping frames would leave an incomplete clip on disk.
                    raise RuntimeError('Could not read a frame from the camera')
                image, results = detect_features(frame, holistic)
                display_landmarks(image, results)

                is_first_frame = frame_num == 0
                if is_first_frame:
                    cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(labels[current_input], video_number), (15,12),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                cv2.imshow('Collecting Features', image)
                if is_first_frame:
                    # Two-second pause before each clip so the signer can get into position.
                    cv2.waitKey(2000)

                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, labels[current_input], str(video_number), str(frame_num))
                np.save(npy_path, keypoints)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    stop_requested = True
                    break
            if stop_requested:
                # 'q' must end the whole session, not just the current clip.
                # NOTE: the interrupted clip folder then holds fewer than number_of_frames files;
                # remove it before running the preprocessing cell, which loads every frame index.
                break
finally:
    # Release the camera and close the window exactly once, also when an error occurs.
    cap.release()
    cv2.destroyAllWindows()

### Preprocess data

In [293]:
# Load every recorded clip into one array X of shape (videos, frames, keypoints) with one-hot labels y.
label_map = {dt:num for num, dt in enumerate(actions)}
videos, label_data = [], []
for action in actions:
    # Build the folder from DATA_PATH so counting and loading always look at the same place.
    directory = os.path.join(DATA_PATH, action)
    # Clip folders are assumed to be named 0..N-1 without gaps (that is how the setup cell creates them).
    video_count = len(next(os.walk(directory))[1])
    for video_number in range(video_count):
        window = [
            np.load(os.path.join(DATA_PATH, action, str(video_number), "{}.npy".format(frame_num)))
            for frame_num in range(number_of_frames)
        ]
        videos.append(window)
        label_data.append(label_map[action])

X = np.array(videos)
# Fix the one-hot width to the number of classes; otherwise it is inferred from the largest
# label present and would be too narrow if the last class has no recordings yet.
y = to_categorical(label_data, num_classes=len(actions)).astype(int)
print(f"\nTotal Videos: {X.shape[0]}\nNumber of Frames(each): {X.shape[1]}\nTotal Keypoints: {X.shape[2]}")
# Seeded split so that a re-run evaluates on the same held-out clips.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
print(f"Number of trainning data: {X_train.shape[0]}\nNumber of test data: {X_test.shape[0]}")
print(X.shape)


Total Videos: 800
Number of Frames(each): 30
Total Keypoints: 258
Number of trainning data: 760
Number of test data: 40
(800, 30, 258)


### Build and Train LSTM Neural Network

In [295]:
# TensorBoard event files are written to ./Logs during training.
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

# Each sample is a sequence of `number_of_frames` keypoint vectors.
sequence_shape = (number_of_frames, X.shape[2])

# Three stacked LSTM layers (the last one returns only its final state) followed by a
# dense classifier head with one softmax output per action.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=sequence_shape),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model.fit(X_train, y_train, epochs=100, callbacks=[tb_callback])
model.summary()

# Persist the trained network so that it can be reloaded later without retraining.
model.save('eashara.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Model: "sequential_45"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_135 (LSTM)             (None, 30, 64)            82688     
                                                                 
 lstm_136 (LSTM)             (None, 30, 128)           98816     
                                                                 
 lstm_137 (LSTM)             (None, 64)                49408     
                                                                 
 dense_135 (Dense)           (None, 64)                4160      
                               

### Evaluation using Confusion Matrix and Accuracy

In [4]:
# Requires `model`, `X_test` and `y_test` from the cells above to exist in the current kernel.
predicted_probabilities = model.predict(X_test)
# Collapse one-hot / probability rows to class indices; `ytrue` and `yhat` are reused by the next cell.
ytrue = y_test.argmax(axis=1).tolist()
yhat = predicted_probabilities.argmax(axis=1).tolist()
multilabel_confusion_matrix(ytrue, yhat)

NameError: name 'model' is not defined

In [267]:
# Accuracy of the predicted class indices (`yhat`) against the true ones (`ytrue`) from the cell above.
accuracy_score(ytrue, yhat)

0.96875

### Test in Real Time 

In [5]:
# Real-time recognition: classify a sliding window of the most recent frames and show the
# recognised words in a banner at the top of the video.
# Imported from tensorflow.keras to match the Keras import path used by the rest of the notebook.
from tensorflow.keras.models import load_model
model = load_model('eashara.h5')

sequence = []          # keypoint vectors of the most recent frames
sentence = []          # recognised labels, most recent last
threshold = 0.99       # minimum probability for a prediction to be accepted
sequence_length = 30   # must equal the number of frames per clip the model was trained on
max_words = 5          # number of recognised labels kept on screen

cap = cv2.VideoCapture(1)
try:
    with mp_holistic.Holistic(
        min_detection_confidence=0.6,
        min_tracking_confidence=0.6
    ) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; passing None on would crash inside cvtColor.
                break
            image, results = detect_features(frame, holistic)

            display_landmarks(image, results)
            keypoints = extract_keypoints(results)

            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                # verbose=0: do not print a progress bar for every single frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = np.argmax(res)

                # Append a word only when the model is confident and the word differs
                # from the one appended last (avoids repeating it on every frame).
                if res[best] >= threshold:
                    word = actions[best]
                    if not sentence or word != sentence[-1]:
                        sentence.append(word)

                if len(sentence) > max_words:
                    sentence = sentence[-max_words:]

                # Per-class probability bars
                image = prob_viz(res, actions, image)

            # Banner with the recognised words
            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow('RealTime Translation', image)
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window exactly once, also when an error occurs.
    cap.release()
    cv2.destroyAllWindows()



