# 1. TESTING WITH WEBCAM

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import tensorflow as tf

2021-10-26 09:18:44.197965: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-26 09:18:44.197998: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# 2. Keypoints using MP Holistic

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [4]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    # mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
    #                          mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
    #                          mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
    #                          ) 
    # # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

# 3. Extract Keypoint Values

In [6]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    # face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, lh, rh])

In [7]:
actions = os.listdir('./MP_Data')
actions = np.array(actions)

# Path for exported data, numpy arrays
DATA_PATH = os.path.join('./MP_Data') 

# Thirty videos worth of data
no_sequences = 80

# Videos are going to be 30 frames in length
sequence_length = 24

In [8]:
label_map = {label:num for num, label in enumerate(actions)}

In [9]:
label_map

{'hello': 0,
 'how much': 1,
 'thank you': 2,
 'name': 3,
 'cost': 4,
 'your': 5,
 'hungry': 6,
 'phone': 7,
 'beautiful': 8,
 'good job': 9,
 'sad': 10,
 'what': 11,
 'father': 12,
 'i love you': 13}

# Load model

In [27]:
# Load model:
# Recreate the exact same model, including its weights and the optimizer
model = tf.keras.models.load_model('./pretrained_models/model_13classes_261021.h5')

# Show the model architecture
model.summary()

# We need to define its optimizer and loss function again since the h5 file
# does not contain those information :(
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 24, 64)            82688     
_________________________________________________________________
lstm_4 (LSTM)                (None, 24, 128)           98816     
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 14)               

# 11. Text to speak

In [11]:
import pyttsx3

engine = pyttsx3.init()
engine.say('see you later')
engine.runAndWait()

In [12]:
# Show all language
voices = engine.getProperty('voices')
voice_lang = {idx: str(voice.languages[0]).split('5')[-1] for idx, voice in enumerate(voices)}
print(voice_lang)


{0: "af'", 1: "an'", 2: "bg'", 3: "bs'", 4: "ca'", 5: "cs'", 6: "cy'", 7: "da'", 8: "de'", 9: "el'", 10: "en'", 11: "b'\\x02en-gb'", 12: "en-sc'", 13: "en-uk-north'", 14: "en-uk-rp'", 15: "en-uk-wmids'", 16: "b'\\x02en-us'", 17: "en-wi'", 18: "eo'", 19: "es'", 20: "es-la'", 21: "et'", 22: "fa'", 23: "fa-pin'", 24: "fi'", 25: "fr-be'", 26: "fr-fr'", 27: "ga'", 28: "grc'", 29: "hi'", 30: "hr'", 31: "hu'", 32: "hy'", 33: "hy-west'", 34: "id'", 35: "is'", 36: "it'", 37: "jbo'", 38: "ka'", 39: "kn'", 40: "ku'", 41: "la'", 42: "lfn'", 43: "lt'", 44: "lv'", 45: "mk'", 46: "ml'", 47: "ms'", 48: "ne'", 49: "nl'", 50: "no'", 51: "pa'", 52: "pl'", 53: "pt-br'", 54: "pt-pt'", 55: "ro'", 56: "ru'", 57: "sk'", 58: "sq'", 59: "sr'", 60: "sv'", 61: "sw'", 62: "ta'", 63: "tr'", 64: "vi'", 65: "vi-hue'", 66: "vi-sgn'", 67: "zh'", 68: "zh-yue'"}


In [13]:
# Text to speak config
engine = pyttsx3.init()
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[16].id)
engine.say('hello, this is the machine voice')
engine.runAndWait()



# 12. Test in Real Time

In [14]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()

    pred_dict = dict(zip(actions, res))
    # sorting for prediction and get top 5
    prediction = sorted(pred_dict.items(), key=lambda x: x[1])[::-1][:5]

    for num, pred in enumerate(prediction):
        text = '{}: {}'.format(pred[0], pred[1])
        # cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, text, (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,0), 2, cv2.LINE_AA)
        
    return output_frame

In [26]:
# New detection variables
sequence = []
sentence = []
threshold = 0.9
tts = False

# Text to speak config:
engine = pyttsx3.init()
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[16].id)

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
    while cap.isOpened():
        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)

        # sequence.insert(0,keypoints)
        # sequence = sequence[:24]
        sequence.append(keypoints)
        sequence = sequence[-24:]
        
        if len(sequence) == 24:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]

            #3. Viz logic
            if res[np.argmax(res)] > threshold: 
                if len(sentence) > 0: 
                    if actions[np.argmax(res)] != sentence[-1]:
                        # if prediction is 'goodjob' and last value is 'what':
                        if actions[np.argmax(res)] == 'good job' and sentence[-1] == 'what':
                            pass
                        # if prediction is 'goodjob' and last value is 'name':
                        elif actions[np.argmax(res)] == 'good job' and sentence[-1] == 'name':
                            pass
                        # if prediction is 'what' and last value is 'your':
                        elif actions[np.argmax(res)] == 'what' and sentence[-1] == 'your':
                            pass
                        else:
                            sentence.append(actions[np.argmax(res)])
                            tts = True
                else:
                    sentence.append(actions[np.argmax(res)])
                    tts = True

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)

            # Text to speak:
            if tts:
                engine.say(sentence[-1])
                tts = False
            engine.runAndWait()
            
            # 
            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
        
    cap.release()
    cv2.destroyAllWindows()

In [21]:
    cap.release()
    cv2.destroyAllWindows()