In [57]:
import cv2 as cv
import numpy as np
import mediapipe as mp

In [58]:
class HandDetector:
    """Detect hands with MediaPipe and manage the landmark data set on disk.

    The class wraps a ``mediapipe.solutions.hands.Hands`` model and offers
    helpers to extract landmarks, express them relative to the wrist, flatten
    them into a feature list, and append/count labelled samples in a CSV file.
    """

    def __init__(self, use_static_image, detection_confidence, tracking_confidence, num_of_hands, sign_labels_file_path, data_set_file_path) -> None:
        """Create the MediaPipe model and load the sign labels.

        Args:
            use_static_image: forwarded as ``static_image_mode``.
            detection_confidence: forwarded as ``min_detection_confidence``.
            tracking_confidence: forwarded as ``min_tracking_confidence``.
            num_of_hands: forwarded as ``max_num_hands``.
            sign_labels_file_path: text file with one sign label per line.
            data_set_file_path: CSV file the landmark samples are appended to.
        """
        # useful objects
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.data_set_file_path = data_set_file_path
        self.sign_labels_file_path = sign_labels_file_path
        self.sign_labels = self.get_sign_labels()  # list of sign labels

        # mediapipe model
        self.model = self.mp_hands.Hands(
            static_image_mode=use_static_image,
            min_detection_confidence=detection_confidence,
            min_tracking_confidence=tracking_confidence,
            max_num_hands=num_of_hands
        )

    def mediapipe_detect(self, image):
        """Run the hand model on a BGR frame.

        Returns:
            A ``(image, mediapipe_results)`` tuple; the image is converted
            BGR -> RGB for the model and back to BGR before being returned.
        """
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        # The frame is read-only while the model processes it
        # (presumably so MediaPipe can use it without copying - TODO confirm).
        image.flags.writeable = False
        mediapipe_results = self.model.process(image)
        image.flags.writeable = True
        image = cv.cvtColor(image, cv.COLOR_RGB2BGR)

        return image, mediapipe_results

    def draw_landmarks(self, image, mediapipe_results):
        """Draw every detected hand (landmarks and connections) onto ``image``."""
        if mediapipe_results.multi_hand_landmarks:
            for hand_landmarks in mediapipe_results.multi_hand_landmarks:
                self.mp_drawing.draw_landmarks(
                    image, hand_landmarks, self.mp_hands.HAND_CONNECTIONS,
                    self.mp_drawing.DrawingSpec(color=(153,0,153), thickness=2, circle_radius=1),
                    self.mp_drawing.DrawingSpec(color=(255,255,255), thickness=2, circle_radius=1)
                )

        return image

    def get_landmarks_as_dict(self, mediapipe_results):
        """Return the detected landmarks as a flat list of ``{'x', 'y'}`` dicts.

        Landmarks of all detected hands are concatenated in detection order,
        so the list is longer when more than one hand is visible. An empty
        list is returned when no hand was detected. The ``z`` coordinate is
        intentionally not included.
        """
        detected_hands = mediapipe_results.multi_hand_landmarks or []
        return [
            {'x': landmark.x, 'y': landmark.y}
            for hand_landmarks in detected_hands
            for landmark in hand_landmarks.landmark
        ]

    def normalize_landmarks_to_local_axis(self, landmarks_dict):
        """Translate the landmarks so the first one (the wrist) is the origin.

        Args:
            landmarks_dict: list of ``{'x', 'y'}`` dicts as produced by
                ``get_landmarks_as_dict``.

        Returns:
            A new list of ``{'x', 'y'}`` dicts with the first landmark's
            coordinates subtracted from every entry; ``[]`` for empty input.
            Every entry is taken relative to element 0, so with several hands
            in the list they are all relative to the first hand's wrist.
        """
        if not landmarks_dict:
            # Nothing to normalise; avoids an IndexError on the wrist lookup.
            return []

        wrist = landmarks_dict[0]
        return [
            {
                'x': landmark['x'] - wrist['x'],
                'y': landmark['y'] - wrist['y']
            }
            for landmark in landmarks_dict
        ]

    def convert_normalized_landmarks_to_list(self, normalized_landmarks):
        """Flatten the landmark dicts into ``[x0, y0, x1, y1, ...]``."""
        return [
            coordinate
            for landmark in normalized_landmarks
            for coordinate in landmark.values()
        ]

    def save_landmarks_to_csv_file(self, normalized_landmarks_list, key_input, status):
        """Append one labelled sample to the data set file.

        A row ``<sign index>,<v0>,<v1>,...`` is written only when the app is
        in save mode (``status.MODE == 's'``), a valid sign index is selected
        and the pressed key is ``c``. Otherwise nothing happens.
        """
        if status.MODE != 's' or key_input != ord('c'):
            return

        sign_index = status.sign_to_save_landmarks_for
        if sign_index is None or not 0 <= sign_index < len(self.sign_labels):
            return

        # Only used to tell the user that the file is about to be created.
        try:
            with open(self.data_set_file_path, 'r'):
                pass
        except FileNotFoundError:
            print(f'File : {self.data_set_file_path} not found, it will be created.')

        string_to_save = f"{sign_index}," + ','.join(str(landmark) for landmark in normalized_landmarks_list)

        # 'a' appends (and creates the file when missing); the context manager closes it.
        with open(self.data_set_file_path, 'a') as file:
            file.write(string_to_save + '\n')

    def get_sign_labels(self):
        """Read the sign labels file and return its lines as a list.

        Raises:
            SystemExit: with code 1 when the labels file does not exist.
        """
        try:
            with open(self.sign_labels_file_path, 'r') as file:
                return file.read().splitlines()
        except FileNotFoundError:
            print(f'File : {self.sign_labels_file_path} not found!')
            # Same exception the previous `exit(1)` raised in a plain
            # interpreter, without relying on the `site`-injected helper.
            # FIXME: maybe handle this better ?
            raise SystemExit(1) from None

    def count_number_of_saved_landmark(self):
        """Count the saved samples per sign index in the data set file.

        Returns:
            A list with one counter per entry of ``self.sign_labels``, or
            ``None`` when the data set file does not exist. Rows whose first
            field is not an integer in range (e.g. blank lines) are skipped.
        """
        list_of_counted_signs = [0] * len(self.sign_labels)

        try:
            with open(self.data_set_file_path, 'r') as file:
                for line in file:
                    # The label is the whole first CSV field, not just its
                    # first character, so indices >= 10 are counted correctly.
                    label_field = line.split(',', 1)[0].strip()
                    try:
                        sign_index = int(label_field)
                    except ValueError:
                        continue
                    if 0 <= sign_index < len(list_of_counted_signs):
                        list_of_counted_signs[sign_index] += 1
        except FileNotFoundError:
            print(f'File : {self.data_set_file_path} not found while trying to count!')
            return None

        return list_of_counted_signs

    def find_min_and_max_for_x_and_y(self, landmarks_dict):
        """Return ``(min_x, min_y, max_x, max_y)`` over the given landmarks.

        The search starts from ``min = 1`` and ``max = 0``, so the minimums
        are never above 1 and the maximums never below 0; an empty input
        yields ``(1, 1, 0, 0)``.
        """
        min_x = 1
        min_y = 1
        max_x = 0
        max_y = 0
        for landmark in landmarks_dict:
            min_x = min(min_x, landmark['x'])
            min_y = min(min_y, landmark['y'])
            max_x = max(max_x, landmark['x'])
            max_y = max(max_y, landmark['y'])

        return min_x, min_y, max_x, max_y

In [59]:
class Status:
    """Keyboard-driven application state and the on-screen overlays for it.

    ``MODE`` is ``'s'`` (save), ``'d'`` (detect), ``'f'`` (free camera) or
    ``None`` before any mode key was pressed. ``sign_to_save_landmarks_for``
    holds the digit selected while in save mode, otherwise ``None``.
    """

    def __init__(self) -> None:
        self.MODE = None
        self.sign_to_save_landmarks_for = None

    def set_status_mode(self, key_input):
        """Update the mode and the selected sign from a key code."""
        # key code -> mode: save, detect, free camera
        mode_for_key = {ord('s'): 's', ord('d'): 'd', ord('f'): 'f'}
        self.MODE = mode_for_key.get(key_input, self.MODE)

        if self.MODE != 's':
            # A sign selection only makes sense while saving.
            self.sign_to_save_landmarks_for = None
            return

        if ord('0') <= key_input <= ord('9'):
            self.sign_to_save_landmarks_for = int(chr(key_input))

    def set_status_text(self, image, sign_labels, list_of_counted_signs):
        """Write the current mode in the bottom-left corner of ``image``.

        In save mode with a sign selected (and a count list available) the
        text names the sign and its sample count, or reports an invalid sign
        when the index is outside ``sign_labels``.
        """
        if self.MODE == 's':
            selected = self.sign_to_save_landmarks_for
            if selected is None or list_of_counted_signs is None:
                text = 'Saving landmarks mode'
            elif 0 <= selected < len(sign_labels):
                text = (f'Saving landmarks for sign: {sign_labels[selected]}'
                        + f'({list_of_counted_signs[selected]})')
            else:
                text = 'Invalid sign'  # FIXME maybe put these in the middle ???
        elif self.MODE == 'd':
            text = 'Detect mode'
        else:
            text = 'Free camera mode'

        cv.putText(image, text, (10, image.shape[0] - 15),
                   cv.FONT_HERSHEY_SIMPLEX, 0.75, (153,0,153), 2, cv.LINE_AA)

        return image

    def draw_rectangle(self, image, hands, landmarks_dict, prediction, status):
        """In detect mode, frame the hand and label it with the prediction.

        The decision is based on the ``status`` argument's mode. The image is
        returned untouched in every other mode.
        """
        if status.MODE != 'd':
            return image

        left, top, right, bottom = hands.find_min_and_max_for_x_and_y(landmarks_dict)
        # shape is (rows, columns): x scales with the width (shape[1]) and
        # y with the height (shape[0]).
        height = image.shape[0]
        width = image.shape[1]

        # bounding box with a 10 pixel margin
        top_left = (int(left * width - 10), int(top * height - 10))
        bottom_right = (int(right * width + 10), int(bottom * height + 10))
        cv.rectangle(image, top_left, bottom_right, (153,0,153), 2)

        label = hands.sign_labels[np.argmax(prediction)]
        accuracy = np.max(prediction)
        text_origin = (int(left * width), int(top * height) - 15)
        cv.putText(image, f'{label} ({accuracy:.2f})', text_origin,
                   cv.FONT_HERSHEY_SIMPLEX, 0.75, (153,0,153), 2, cv.LINE_AA)

        # TODO: do something if there is no sign DETECTED

        return image

In [60]:
from keras.models import load_model

In [61]:
class Model:
    """Wrapper around the model loaded from ``model_path`` with ``load_model``."""

    def __init__(self, model_path) -> None:
        self.model = load_model(model_path)

    def make_prediction(self, normalized_landmarks_list, status, hands):
        """Predict for one flattened landmark list, in detect mode only.

        Returns:
            Whatever ``self.model.predict`` returns for a batch holding the
            single sample, or ``None`` when ``status.MODE`` is not ``'d'``.
            ``hands`` is currently unused.
        """
        if status.MODE != 'd':
            return None

        # The model is fed a batch, so wrap the single sample in a list.
        single_sample_batch = np.array([normalized_landmarks_list])
        return self.model.predict(single_sample_batch)

In [62]:
# Application state: current mode and, in save mode, the selected sign.
status = Status()

# Paths use forward slashes: the previous '.\name' literals relied on invalid
# escape sequences (\s, \d, \m) and only worked as paths on Windows.
hands = HandDetector(use_static_image=True,
                     detection_confidence=0.5,
                     tracking_confidence=0.5,
                     num_of_hands=2, # may wanna make this work with only one hand first
                     sign_labels_file_path='./sign_labels.csv',
                     data_set_file_path='./data_set_test.csv')

# Trained classifier used in detect mode.
model = Model('./models/model.h5')

In [63]:
# Main loop: grab webcam frames, react to the keyboard, and depending on the
# mode save landmark samples or show the predicted sign. Press 'q' to quit.
cap = cv.VideoCapture(0)

try:
    while True:
        grabbed, frame = cap.read()
        if not grabbed or frame is None:
            # Camera missing/disconnected: stop instead of crashing in cvtColor.
            print('Could not read a frame from the camera, stopping.')
            break

        key_input = cv.waitKey(10)

        if key_input == ord('q'):
            break

        status.set_status_mode(key_input)
        frame, mediapipe_results = hands.mediapipe_detect(frame)
        list_of_counted_signs = hands.count_number_of_saved_landmark()
        frame = status.set_status_text(frame, hands.sign_labels, list_of_counted_signs)

        if mediapipe_results.multi_hand_landmarks is not None:

            # make a dict of the basic coordinates of the landmarks
            landmarks_dict = hands.get_landmarks_as_dict(mediapipe_results) # values between 0 and 1, starting from the top left corner

            # normalize the landmarks to the local(relative) axis of the hand
            normalized_landmarks = hands.normalize_landmarks_to_local_axis(landmarks_dict)

            # make landmarks visible
            frame = hands.draw_landmarks(frame, mediapipe_results)

            # convert normalized landmarks to list
            normalized_landmarks_list = hands.convert_normalized_landmarks_to_list(normalized_landmarks)

            # save the last landmarks to a csv file
            hands.save_landmarks_to_csv_file(normalized_landmarks_list, key_input, status)

            # make a prediction
            prediction = model.make_prediction(normalized_landmarks_list, status, hands)

            frame = status.draw_rectangle(frame, hands, landmarks_dict, prediction, status)

        cv.imshow('App', frame) # SIRS: Sistem Inteligent de Recunoastere a Semnelor ???
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv.destroyAllWindows()



## Functions that I may need

In [64]:
    # Keep in mind that image coordinates start at the top-left corner, (0, 0).
    # Open question: use the maximum coordinate, or the landmark furthest from the wrist?
    # def get_max_coordinates_from_landmarks(self, landmarks_list):
    #     max_coordinates = [1, 1, 1]
    #     for landmark in landmarks_list:
    #         max_coordinates[0] = min(max_coordinates[0], landmark['x'])
    #         max_coordinates[1] = min(max_coordinates[1], landmark['y'])
    #         max_coordinates[2] = min(max_coordinates[2], landmark['z'])
        
    #     return max_coordinates


    # def get_index_for_max_y_coordinate(self, landmarks_list):
    #     max_y = 1
    #     index = 0

    #     for i, landmark in enumerate(landmarks_list):
    #         if landmark['y'] < max_y: # abs ????
    #             max_y = landmark['y']
    #             index = i
        
    #     return index
    
    # def draw_coordinates(self, image, landmarks_list):
    #     wrist = landmarks_list[0]
    #     heighest_tip = landmarks_list[self.get_index_for_max_y_coordinate(landmarks_list)]
        
    #     wristXY = [wrist['x'], wrist['y']]
    #     heighest_tipXY = [heighest_tip['x'], heighest_tip['y']]
        
    #     cv.circle(image, (int(wristXY[0] * image.shape[1]), int(wristXY[1] * image.shape[0])),
    #               5, (0, 0, 255), cv.FILLED)
    #     cv.circle(image, (int(heighest_tipXY[0] * image.shape[1]), int(heighest_tipXY[1] * image.shape[0])),
    #               5, (0, 0, 255), cv.FILLED)
        
    #     # write near the landmarks the coordinates
    #     cv.putText(image, f'{round(wristXY[0], 2)}, {round(wristXY[1], 2)}',
    #                (int(wristXY[0] * image.shape[1]), int(wristXY[1] * image.shape[0])),
    #                cv.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
    #     cv.putText(image, f'{round(heighest_tipXY[0], 2)}, {round(heighest_tipXY[1], 2)}',
    #                (int(heighest_tipXY[0] * image.shape[1]), int(heighest_tipXY[1] * image.shape[0])),
    #                cv.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
        
    #     return image


    # def get_index_for_max_y_normalized_coordinate(self, normalized_landmarks):
    #     max_y = 0 # change this ?
    #     index = 0

    #     for i, landmark in enumerate(normalized_landmarks):
    #         if abs(landmark['y']) > max_y:
    #             max_y = abs(landmark['y'])
    #             index = i
        
    #     return index


    # def draw_normalized_coordinates(self, image, normalized_landmarks, landmarks_list):
    #     wrist_local = normalized_landmarks[0]
    #     wristXY_local = [wrist_local['x'], wrist_local['y']]
    #     heighest_tip_local = normalized_landmarks[self.get_index_for_max_y_normalized_coordinate(normalized_landmarks)]
    #     # heighest_tip_local = normalized_landmarks[12]
    #     heighest_tipXY_local = [heighest_tip_local['x'], heighest_tip_local['y']]
        
    #     wrist_global = landmarks_list[0]
    #     wristXY_global = [wrist_global['x'], wrist_global['y']]
    #     heighest_tip_global = landmarks_list[self.get_index_for_max_y_coordinate(landmarks_list)]
    #     # heighest_tip_global = landmarks_list[12]
    #     heighest_tipXY_global = [heighest_tip_global['x'], heighest_tip_global['y']]
        
    #     # highlight the wrist and the heighest tip
    #     cv.circle(image, (int(wristXY_global[0] * image.shape[1]), int(wristXY_global[1] * image.shape[0])),
    #               5, (0, 0, 255), cv.FILLED)
    #     cv.circle(image, (int(heighest_tipXY_global[0] * image.shape[1]), int(heighest_tipXY_global[1] * image.shape[0])),
    #               5, (0, 0, 255), cv.FILLED)
        
    #     # write near the landmarks the coordinates
    #     cv.putText(image, f'{round(wristXY_local[0], 2)}, {round(wristXY_local[1], 2)}',
    #                (int(wristXY_global[0] * image.shape[1]), int(wristXY_global[1] * image.shape[0])),
    #                cv.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
    #     cv.putText(image, f'{round(heighest_tipXY_local[0], 2)}, {round(heighest_tipXY_local[1], 2)}',
    #                (int(heighest_tipXY_global[0] * image.shape[1]),int(heighest_tipXY_global[1] * image.shape[0])),
    #                cv.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
    
    #     return image


    # def print_info(self, mediapipe_results):
    #     print("max coord: " + str(self.get_max_coordinates_from_landmarks(mediapipe_results)))
    #     # (0,0)------------------------->(1,0)
    #     # |                                  |
    #     # |                                  |
    #     # |                                  |
    #     # |                                  |
    #     # (0,1)------------------------->(1,1)