In [9]:
# imports
import mediapipe as mp
import numpy as np
import cv2

In [10]:
class MediaPipe:
    """Namespace of helpers around MediaPipe Holistic for hand-based sign recognition.

    The class is never instantiated for its state; every helper is a static
    method so it can be called either as ``MediaPipe.helper(...)`` or through
    an instance.
    """

    MEDIAPIPE_HOLISTIC = mp.solutions.holistic
    MEDIAPIPE_DRAWING = mp.solutions.drawing_utils
    SIGN_LABELS = np.array(['hello', 'yes', 'no', 'peace', 'thumbs_up']) # TODO: MAKE A WAY TO SEE IF NO SIGN
    # SIGN_LABELS = np.array(['hello', 'no'])

    # Length of the flattened (x, y, z) vector produced for one hand; also the
    # size of the zero vector substituted when that hand is not detected.
    HAND_FEATURES = 21 * 3

    # Drawing colours (OpenCV channel order). Each channel must stay in 0..255.
    LANDMARK_COLOR = (80, 110, 10)
    CONNECTION_COLOR = (80, 255, 121)
    BAR_COLOR = (0, 0, 0)
    TEXT_COLOR = (255, 255, 255)


    @staticmethod
    def detect_landmarks(image, holistic_model):
        """Run the holistic model on one BGR frame.

        Args:
            image: frame in BGR channel order (as delivered by OpenCV).
            holistic_model: object exposing ``process(rgb_image)``.

        Returns:
            ``(image, holistic_results)`` where ``image`` is a BGR copy of the
            input (round-tripped through RGB) and ``holistic_results`` is
            whatever ``holistic_model.process`` returned.
        """
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Read-only while the model looks at it -- presumably lets MediaPipe
        # avoid a defensive copy; TODO confirm.
        image.flags.writeable = False
        holistic_results = holistic_model.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        return image, holistic_results


    @staticmethod
    def _hand_vector(hand_landmarks):
        """Flatten one hand's landmarks to a 1-D (x, y, z, x, y, z, ...) array.

        Returns a zero vector of length ``HAND_FEATURES`` when the hand is missing.
        """
        if not hand_landmarks:
            return np.zeros(MediaPipe.HAND_FEATURES)
        return np.array([[coord.x, coord.y, coord.z] for coord in hand_landmarks.landmark]).flatten()


    @staticmethod
    def extract_landmarks(holistic_results):
        """Build the per-frame feature vector: left-hand values followed by right-hand values.

        Only the hands are used. Pose (33 landmarks * 4 values) and face
        (468 landmarks * 3 values) were tried earlier and are left out.
        # TODO: train model only using hand outlines
        # TODO: see if dropping pose/face somehow affects the model
        """
        left_hand = MediaPipe._hand_vector(holistic_results.left_hand_landmarks)
        right_hand = MediaPipe._hand_vector(holistic_results.right_hand_landmarks)
        return np.concatenate([left_hand, right_hand])


    # TODO: make a mode to toggle drawing on and off
    @staticmethod
    def draw_landmarks(image, holistic_results):
        """Draw the left and right hand landmarks and their connections onto ``image`` in place.

        Face-mesh and pose drawing are intentionally not performed.
        """
        # TODO: play with the colors
        drawing = MediaPipe.MEDIAPIPE_DRAWING
        both_hands = (holistic_results.left_hand_landmarks,
                      holistic_results.right_hand_landmarks)
        for hand_landmarks in both_hands:
            drawing.draw_landmarks(
                image, hand_landmarks, MediaPipe.MEDIAPIPE_HOLISTIC.HAND_CONNECTIONS,
                drawing.DrawingSpec(color=MediaPipe.LANDMARK_COLOR, thickness=1, circle_radius=1),
                drawing.DrawingSpec(color=MediaPipe.CONNECTION_COLOR, thickness=1, circle_radius=1))


    @staticmethod
    def probability_visualization(model_results, input_frame):
        """Return a copy of ``input_frame`` with one probability bar per class.

        For class ``num`` a filled bar is drawn in the 30-pixel-high row that
        starts at y = 60 + 40 * num; its width in pixels equals the probability
        as a truncated percentage. The label and the percentage are written on
        top of the bar.

        Args:
            model_results: iterable of per-class probabilities, ordered like
                ``SIGN_LABELS``.
            input_frame: image to annotate; it is not modified.
        """
        output_frame = input_frame.copy()
        for num, prob in enumerate(model_results):
            percent = int(prob*100)
            row_top = 60 + num*40
            # One shared colour: a per-class list would have to be kept in sync
            # with SIGN_LABELS by hand and raises IndexError when it is not.
            cv2.rectangle(output_frame, (0, row_top), (percent, row_top + 30), MediaPipe.BAR_COLOR, -1)
            cv2.putText(output_frame, f"{MediaPipe.SIGN_LABELS[num]} {percent}", (0, row_top + 25),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, MediaPipe.TEXT_COLOR, 2, cv2.LINE_AA)

        return output_frame

In [11]:
from keras.models import load_model

In [12]:
!cd

d:\GitHubProjects\SignLanguageRecognition\CodeForMachineWithCamera


In [13]:
class Model:
    """Thin holder for a Keras model loaded from disk."""

    def __init__(self, model_path):
        """Load the model stored at ``model_path``, print its summary, and keep it as ``self.model``."""
        loaded = load_model(model_path)
        self.model = loaded
        loaded.summary()

In [14]:
class Detection:
    """Live sign detection from a video source."""

    # Number of consecutive frames fed to the recognition model per prediction.
    SEQUENCE_LENGTH = 30

    @staticmethod
    def detect(recognition_model, camera=0): # may be passed as url
        """Read frames from ``camera``, classify the latest window of frames and show the result.

        Every frame is run through MediaPipe Holistic, the hand landmarks are
        drawn on it and appended to a sliding window. Once the window holds
        ``SEQUENCE_LENGTH`` frames, each new frame triggers a prediction whose
        class probabilities are overlaid on the displayed image.

        The loop ends when 'q' is pressed in the window, when the capture is
        no longer open, or when a frame cannot be read. The capture and the
        window are released on every exit path, including exceptions.

        Args:
            recognition_model: object whose ``model.predict`` accepts an array
                shaped (1, SEQUENCE_LENGTH, features) and returns per-class
                probabilities in its first row.
            camera: anything ``cv2.VideoCapture`` accepts (device index or URL).
        """
        sequence = []
        sentence = []
        threshold = 0.8 # if a sign is detected with 80% probability it will be counted

        capture = cv2.VideoCapture(camera)

        try:
            # TODO: play with the min_detection_confidence and min_tracking_confidence values
            with MediaPipe.MEDIAPIPE_HOLISTIC.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic_model:
                while capture.isOpened():
                    frame_ok, frame = capture.read()
                    if not frame_ok or frame is None:
                        # Stream ended or the grab failed; there is no image to
                        # process, and cvtColor would raise on None.
                        break

                    # extract landmarks and process the video feed
                    image, holistic_results = MediaPipe.detect_landmarks(frame, holistic_model)

                    MediaPipe.draw_landmarks(image, holistic_results)

                    landmarks = MediaPipe.extract_landmarks(holistic_results)
                    # keep only the most recent SEQUENCE_LENGTH frames
                    sequence.append(landmarks)
                    sequence = sequence[-Detection.SEQUENCE_LENGTH:]

                    # when there are SEQUENCE_LENGTH frames in the sequence
                    if len(sequence) == Detection.SEQUENCE_LENGTH:
                        # get model result based on the collected frames
                        model_result = recognition_model.model.predict(
                            np.expand_dims(sequence, axis=0))[0]
                        print('model_result:', model_result)

                        best_index = np.argmax(model_result)
                        if model_result[best_index] > threshold:
                            predicted_sign = MediaPipe.SIGN_LABELS[best_index]
                            # record the sign unless it repeats the previous one
                            if not sentence or predicted_sign != sentence[-1]:
                                sentence.append(predicted_sign)

                        # if there are 5 words, remove the first word
                        # if len(sentence) > 5:
                        #     sentence = sentence[-5:]

                        image = MediaPipe.probability_visualization(model_result, image)

                    # cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
                    # cv2.putText(image, ' '.join(sentence), (3,30),
                    #             cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

                    cv2.imshow('Video Feed', image)

                    # press 'q' to exit
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        break
        finally:
            # Always give the camera back and close the window, even on errors.
            capture.release()
            cv2.destroyAllWindows()
                
    

In [15]:
# Run the detection: load a trained recognition model, then start the live
# camera loop (press 'q' in the video window to stop).

# The model path is relative, so it is resolved against the kernel's current
# working directory.
# TODO: try with .keras too
# Other model files that were tried:
# model_path = "../CodeForMachineWithGPU/models/NewModel.keras"
# model_path = "../CodeForMachineWithGPU/models/ActionRecModel.h5"
# model_path = "./CodeForMachineWithGPU/models/Model7.h5"
model_path = "../CodeForMachineWithGPU/models/Model3k.h5" # works
# model_path = "./CodeForMachineWithGPU/models/ActionRecModel.h5"
sign_recognition_model = Model(model_path)

# A stream URL can be passed as `camera` instead of the default device 0, e.g.:
# Detection.detect(recognition_model=sign_recognition_model, camera="https://192.168.175.101:8080/video")
Detection.detect(recognition_model=sign_recognition_model)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 64)            48896     
                                                                 
 lstm_1 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 5)                 165       
                                                                 
Total params: 203525 (795.02 KB)
Trainable params: 20352

In [16]:
# capture.release()
# cv2.destroyAllWindows()