In [1]:
pip install mediapipe

Note: you may need to restart the kernel to use updated packages.


In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
import numpy as np
import mediapipe as mp
import cv2
import os

In [2]:
def extract_keypoints(results):
    face_lm = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    pose_lm = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    left_hand_lm = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    right_hand_lm = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose_lm, face_lm, left_hand_lm, right_hand_lm])


def get_labels(file_name):
    with open(file_name) as r: 
        labels = r.read().splitlines()
    return np.array(labels)

actions = get_labels("labels.txt")

In [3]:
actions

array(['hello', 'please', 'yes', 'no', 'eat', 'food', 'help', 'need',
       'bathroom', 'sick', 'phone', 'I', 'love', 'you', 'who', 'time',
       'happy', 'sad', 'hungry', 'thirsty', 'where', 'restaurant',
       'thank you', 'goodbye', 'what time is it', 'nice to meet you',
       'where are you from', 'see you later'], dtype='<U18')

### First model configurations

In [12]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation = 'relu', input_shape= (30, 1662)))
model.add(LSTM(128, return_sequences=True, activation = 'relu'))
model.add(LSTM(64, return_sequences=False, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(actions.shape[0], activation = 'softmax'))

model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['categorical_accuracy'])


### Fine_tuned version

In [4]:
model = Sequential()
model.add(LSTM(100, return_sequences=True, activation = 'relu', input_shape= (30, 1662)))
model.add(LSTM(100, return_sequences=False, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(actions.shape[0], activation = 'softmax'))

model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [5]:
model.load_weights('Model/sign_detector_finer.h5')

In [6]:
#Model for getting keypoints
mp_holistic = mp.solutions.holistic
#Utilities for drawing keypoints on frames
mp_drawing = mp.solutions.drawing_utils

def collect_landmarks(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results


def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                              mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius = 1),
                             mp_drawing.DrawingSpec(color= (80, 256, 121), thickness=1 , circle_radius=1)
                             )
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80, 22, 10), thickness=1, circle_radius = 1),
                             mp_drawing.DrawingSpec(color= (80, 44, 121), thickness=1 , circle_radius=1)
                             )
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80, 22, 76), thickness=1, circle_radius = 4),
                             mp_drawing.DrawingSpec(color= (80, 44, 250), thickness=1 , circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=1, circle_radius = 4),
                             mp_drawing.DrawingSpec(color= (245, 66, 230), thickness=1 , circle_radius=2)
                             )

In [26]:
sequence = []
sentence = []
threshold = 0.75


cap = cv2.VideoCapture(0)
#Setup mediaPipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence = 0.5) as mp_holistics_model:
    while cap.isOpened():

        #Read video frame
        ret, frame = cap.read()

        #Make keypoint detections
        image, results = collect_landmarks(frame, mp_holistics_model)
        print(results)
        
        #draw_landmarks
        draw_landmarks(image, results)
        
        #Performing predictions
        keypoints = extract_keypoints(results)
        sequence.insert(0, keypoints)
        sequence = sequence[:30]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            
            
        #visualizing predictions
        if res[np.argmax(res)] > threshold:
            if len(sentence) > 0:
                if actions[np.argmax(res)] != sentence[-1]:
                    sentence.append(actions[np.argmax(res)])
            else:
                sentence.append(actions[np.argmax(res)])
                
        if len(sentence) > 5:
            sentence = sentence[-5:]
            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
        
        #Display frames
        cv2.imshow("OpenCV feed", image)
        if cv2.waitKey(10) & 0xFF == ord("q"):
            break

    #release resources
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>


### Attempting to collect keypoints and pass them on model directly

What I am trying to do is:

1. 3-second timer
2. Record 30 frames 
3. Organize data in the right format
4. Feed the model with the preprocessed data

In [16]:
pwd

'C:\\Users\\cachehit1110\\Desktop\\Capstone Project\\Project'

In [115]:
#Current action
sequence = []
test_folder = 'Testing'
sequence_length = 30

cap = cv2.VideoCapture(0)
#Setup mediaPipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence = 0.5) as mp_holistics_model:
    
    
    for frame_num in range(sequence_length):
        #Read video frame
        ret, frame = cap.read()

        #Make keypoint detections
        image, results = collect_landmarks(frame, mp_holistics_model)
        print(results)

        #draw_landmarks
        draw_landmarks(image, results)

        #Organize data collection with timers
        if frame_num == 0:
            cv2.putText(image, 'STARTING COLLECTION', (200, 200),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
            cv2.putText(image, 'Collecting frames for prediction', (15, 12),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

            #Display frames
            cv2.imshow("OpenCV feed", image)
            cv2.waitKey(2000)

        else:
            cv2.putText(image, 'Start signing!', (15, 12),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 4, cv2.LINE_AA)

            #Display frames
            cv2.imshow("OpenCV feed", image)


        #Export keypoints to folders
        keypoints = extract_keypoints(results)
        sequence.insert(0, keypoints)
        sequence = sequence[:30]
        #npy_path = os.path.join(test_folder, str(frame_num))
        #np.save(npy_path, keypoints)

            
        #Display frames
        cv2.imshow("OpenCV feed", image)

        #Break gracefully
        if cv2.waitKey(10) & 0xFF == ord("q"):
            break

    #release resources
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

Predict the recorded sign

In [117]:
if len(sequence) == 30:
        res = model.predict(np.expand_dims(sequence, axis=0))[0]
        print(actions[np.argmax(res)])

thirsty


### Converting the model to tflite 

In [121]:
import tensorflow as tf

model_ = tf.keras.models.load_model('Model/sign_detector_finer.h5')
converter = tf.lite.TFLiteConverter.from_keras_model(model_)
tflite_model = converter.convert()
open("Model/lstm_model.tflite", "wb").write(tflite_model)

INFO:tensorflow:Assets written to: C:\Users\CACHEH~1\AppData\Local\Temp\tmp068hmsev\assets


3270912

In [129]:
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path="lstm_model.tflite")
interpreter.allocate_tensors()
    
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

#print(input_details)
print(output_details)

[{'name': 'Identity', 'index': 55, 'shape': array([ 1, 28]), 'shape_signature': array([-1, 28]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


In [130]:
height = input_details[0]['shape'][1]
width = input_details[0]['shape'][2]
print(height, width)

30 1662
