## Data Manipulation Class

In [10]:
import cv2 as cv
import numpy as np
import mediapipe as mp

In [11]:
# TODO:
# - make it able to save both hands' landmarks (if one is not detected, save 0.0, 0.0, 0.0 for that hand)
class HandDetector:
    """MediaPipe Hands wrapper plus the CSV bookkeeping of the sign data set.

    The class detects hand landmarks in a frame, draws them, turns them into a
    flat wrist-relative feature list and appends labelled samples to a CSV
    data-set file (one row per sample: ``label_index,x0,y0,x1,y1,...``).
    """

    def __init__(self, use_static_image, detection_confidence, tracking_confidence, num_of_hands, sign_labels_file_path, data_set_file_path):
        """Load the sign labels and build the MediaPipe Hands model.

        Args:
            use_static_image: forwarded as ``static_image_mode``.
            detection_confidence: forwarded as ``min_detection_confidence``.
            tracking_confidence: forwarded as ``min_tracking_confidence``.
            num_of_hands: forwarded as ``max_num_hands``.
            sign_labels_file_path: text file with one sign label per line.
            data_set_file_path: CSV file the landmark samples are appended to.
        """
        # useful things
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.data_set_file_path = data_set_file_path
        self.sign_labels_file_path = sign_labels_file_path
        self.sign_labels = self.get_sign_labels()  # list of sign labels

        # mediapipe model
        self.model = self.mp_hands.Hands(
            static_image_mode=use_static_image,
            min_detection_confidence=detection_confidence,
            min_tracking_confidence=tracking_confidence,
            max_num_hands=num_of_hands
        )

    def mediapipe_detect(self, image):
        """Run the hands model on a BGR frame.

        Returns:
            ``(image, mediapipe_results)`` where ``image`` has been converted
            BGR -> RGB -> BGR and ``mediapipe_results`` is whatever
            ``self.model.process`` returned.
        """
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        # the frame is read-only while MediaPipe processes it
        # (presumably so it can be passed by reference - as in the MediaPipe examples)
        image.flags.writeable = False
        mediapipe_results = self.model.process(image)
        image.flags.writeable = True
        image = cv.cvtColor(image, cv.COLOR_RGB2BGR)

        return image, mediapipe_results

    def draw_landmarks(self, image, mediapipe_results):
        """Draw every detected hand (points + connections) on ``image`` and return it."""
        if mediapipe_results.multi_hand_landmarks:
            for hand_landmarks in mediapipe_results.multi_hand_landmarks:
                self.mp_drawing.draw_landmarks(
                    image, hand_landmarks, self.mp_hands.HAND_CONNECTIONS,
                    self.mp_drawing.DrawingSpec(color=(153,0,153), thickness=2, circle_radius=1),
                    self.mp_drawing.DrawingSpec(color=(255,255,255), thickness=2, circle_radius=1)
                )

        return image

    def get_landmarks_as_dict(self, mediapipe_results):
        """Return the detected landmarks as a flat list of ``{'x': ..., 'y': ...}`` dicts.

        Despite the name, the result is a list. The landmarks of all detected
        hands are concatenated in detection order (MediaPipe Hands reports 21
        landmarks per hand); the list is empty when no hand was detected.
        The z coordinate is intentionally not used.
        """
        if not mediapipe_results.multi_hand_landmarks:
            return []

        return [
            {'x': landmark.x, 'y': landmark.y}
            for hand_landmarks in mediapipe_results.multi_hand_landmarks
            for landmark in hand_landmarks.landmark
        ]

    def convert_landmark_to_list(self, landmarks_dict):
        """Translate the landmarks so that the first one (the wrist) becomes (0, 0).

        Args:
            landmarks_dict: list of ``{'x', 'y'}`` dicts as returned by
                ``get_landmarks_as_dict``.

        Returns:
            A new list of ``{'x', 'y'}`` dicts relative to the first landmark;
            an empty list for empty input.

        NOTE: when two hands are present, both are expressed relative to the
        wrist of the first hand in the list.
        """
        if not landmarks_dict:
            return []

        wrist = landmarks_dict[0]

        return [
            {
                'x': landmark['x'] - wrist['x'],
                'y': landmark['y'] - wrist['y']
            }
            for landmark in landmarks_dict
        ]

    def normalize_landmarks(self, landmarks_dict):
        """Return the wrist-relative landmarks flattened to ``[x0, y0, x1, y1, ...]``."""
        normalized_landmarks = self.convert_landmark_to_list(landmarks_dict)

        # dicts keep insertion order, so each landmark contributes x then y
        return [landmark[key] for landmark in normalized_landmarks for key in landmark]

    def save_landmarks_to_csv_file(self, normalized_landmarks_list, key_input, status):
        """Append one labelled sample to the data-set file when 'c' is pressed.

        Nothing is written unless ``status`` is in save mode ('s') with a sign
        selected whose absolute index is a valid label index. The data-set file
        must already exist; it is never created here.

        Args:
            normalized_landmarks_list: flat feature list to store.
            key_input: key code of the current frame (as returned by ``cv.waitKey``).
            status: frame status object; its ``real_list_index`` is refreshed
                through ``status.set_real_list_index()``.

        Raises:
            SystemExit: with exit status 1 when the data-set file is missing.
        """
        if status.MODE != 's' or status.selected_sub_list_sign is None:
            return

        status.set_real_list_index()
        if status.real_list_index >= len(self.sign_labels) or key_input != ord('c'):
            return

        # opening in append mode would silently create the file, so probe it first
        try:
            with open(self.data_set_file_path, 'r'):
                pass
        except FileNotFoundError:
            print(f'File : {self.data_set_file_path} not found.')
            raise SystemExit(1)  # an error must not report success (was exit(0))

        # label index first, then the comma separated landmark values
        string_to_save = f"{status.real_list_index}," + ','.join(str(landmark) for landmark in normalized_landmarks_list)

        with open(self.data_set_file_path, 'a') as file:  # a = append
            file.write(string_to_save + '\n')

    def get_sign_labels(self):
        """Read the sign labels file and return its lines as a list of labels.

        Raises:
            SystemExit: with exit status 1 when the labels file is missing.
        """
        try:
            with open(self.sign_labels_file_path, 'r') as file:
                return file.read().splitlines()
        except FileNotFoundError:
            print(f'File : {self.sign_labels_file_path} not found!')
            raise SystemExit(1)  # FIXME: maybe handle this better ?

    def count_number_of_saved_landmarks(self):
        """Count the saved samples per sign label.

        Returns:
            A list with one counter per entry of ``self.sign_labels``, or
            ``None`` when the data-set file does not exist. Blank rows and rows
            whose first field is not a valid label index are ignored.
        """
        # use the labels loaded at construction instead of re-reading the
        # labels file on every call (this runs once per frame)
        list_of_counted_signs = [0] * len(self.sign_labels)

        try:
            with open(self.data_set_file_path, 'r') as file:
                for line in file:
                    label_field = line.split(',', 1)[0].strip()
                    if not label_field:
                        continue  # blank row
                    try:
                        label_index = int(label_field)
                    except ValueError:
                        continue  # malformed row
                    if 0 <= label_index < len(list_of_counted_signs):
                        list_of_counted_signs[label_index] += 1
        except FileNotFoundError:
            print(f'File : {self.data_set_file_path} not found while trying to count!')
            return None

        return list_of_counted_signs

    def find_min_and_max_for_x_and_y(self, landmarks_dict):
        """Return the bounding box ``(min_x, min_y, max_x, max_y)`` of the landmarks.

        The real extremes are returned even when coordinates fall outside
        [0, 1]. For empty input the historical sentinel ``(1, 1, 0, 0)`` is
        returned.
        """
        if not landmarks_dict:
            return 1, 1, 0, 0

        xs = [landmark['x'] for landmark in landmarks_dict]
        ys = [landmark['y'] for landmark in landmarks_dict]

        return min(xs), min(ys), max(xs), max(ys)

    # make a list of sub-lists, each 10 elements
    # for easier sign selection while creating the data-set
    def reshape_sign_labels(self):
        """Split ``self.sign_labels`` into consecutive pages of at most 10 labels."""
        return [self.sign_labels[i:i+10] for i in range(0, len(self.sign_labels), 10)]

In [12]:
# Shared detector instance used by the frame status code and the camera loop.
# Raw strings keep the Windows-style paths byte-for-byte while avoiding the
# invalid escape sequences ("\d", "\s") the plain literals contained.
hands = HandDetector(
    use_static_image=False,
    detection_confidence=0.5,
    tracking_confidence=0.5,
    num_of_hands=2,
    sign_labels_file_path=r'.\data\sign_labels_abc.csv',
    data_set_file_path=r'.\data\data_set_abc.csv'
    )

## Frame Status Class

In [13]:
# TODO:
# - make the functions simpler

class FrameStatus:
    """Keyboard-driven UI state of the camera loop.

    Tracks the current mode ('s' save, 'd' detect, 'w' word, 'f' free camera,
    ``None`` before any mode key was pressed), the sign selected for saving
    (page index + position inside the page) and the word built in word mode.
    Several methods use the module-level ``hands`` detector.
    """

    def __init__(self) -> None:
        self.sub_lists_of_signs = []        # pages of sign labels (filled in by the caller)
        self.selected_sub_list_index = 0    # index of the current page
        self.selected_sub_list_sign = None  # position inside the current page, None = nothing selected
        self.real_list_index = 0            # absolute label index of the selected sign

        self.WORD = ''
        self.SENTENCE = ''
        self.MODE = None

        self.DOT = [.5, .5] # center of frame

    def set_status_mode(self, key_input):
        """Switch mode on 's', 'd', 'w' or 'f'; any other key leaves the mode unchanged.

        Entering save or free camera mode also resets ``DOT`` to ``[0, 0]``.
        """
        if key_input == ord('s'):
            self.MODE = 's' # save mode
            self.DOT = [0, 0]
        elif key_input == ord('d'):
            self.MODE = 'd' # detect mode
        elif key_input == ord('w'):
            self.MODE = 'w' # word mode
        elif key_input == ord('f'):
            self.MODE = 'f' # free camera mode
            self.DOT = [0, 0]

    def get_sign_from_key_input(self, key_input):
        """In save mode, select the sign at position 0-9 from a digit key.

        Outside save mode the selection is cleared.
        """
        if self.MODE == 's':
            if ord('0') <= key_input <= ord('9'):
                self.selected_sub_list_sign = int(chr(key_input))
        else:
            self.selected_sub_list_sign = None

    def move_between_sub_lists(self, key_input):
        """Go to the next ('>') or previous ('<') page and select its first sign.

        Nothing happens on the last page for '>' or on the first page for '<'.
        """
        if key_input == ord('>') and self.selected_sub_list_index < (len(self.sub_lists_of_signs) - 1):
            self.selected_sub_list_index += 1
            self.selected_sub_list_sign = 0
        if key_input == ord('<') and self.selected_sub_list_index > 0:
            self.selected_sub_list_index -= 1
            self.selected_sub_list_sign = 0

    def set_real_list_index(self):
        """Compute the absolute label index from the page index and in-page position.

        Requires a selected sign (``selected_sub_list_sign`` must not be None).
        """
        self.real_list_index = 10 * self.selected_sub_list_index + self.selected_sub_list_sign

    def _save_mode_text(self, key_input):
        """Return the status text for save mode, handling page navigation keys."""
        if self.selected_sub_list_sign is None:
            return 'Saving landmarks mode'

        # only read the data-set file when the counter is actually displayed
        list_of_signs_counted = hands.count_number_of_saved_landmarks()
        if list_of_signs_counted is None:
            return 'Saving landmarks mode'

        # no pages at all (empty labels file) -> nothing valid can be selected
        if not 0 <= self.selected_sub_list_index < len(self.sub_lists_of_signs):
            return 'Invalid sign'
        if not 0 <= self.selected_sub_list_sign < len(self.sub_lists_of_signs[self.selected_sub_list_index]):
            return 'Invalid sign'

        self.move_between_sub_lists(key_input=key_input)
        self.set_real_list_index()

        return f'Saving landmarks for sign: {self.sub_lists_of_signs[self.selected_sub_list_index][self.selected_sub_list_sign]}' \
               f'({list_of_signs_counted[self.real_list_index]}), page[{self.selected_sub_list_index + 1}/{len(self.sub_lists_of_signs)}]'

    def set_status_text(self, image, key_input):
        """Draw the white status bar with the text of the current mode and return ``image``.

        In save mode with a valid sign selected this also processes the page
        navigation keys and refreshes ``real_list_index``.
        """
        cv.rectangle(image, (0, 0), (image.shape[1], 35), (255,255,255), -1)
        if self.MODE == 'd':
            text = 'Detect mode'
        elif self.MODE == 'w':
            text = 'Word mode'
        elif self.MODE == 's':
            text = self._save_mode_text(key_input)
        else:
            text = 'Free camera mode'

        # center the text horizontally inside the bar
        text_size = cv.getTextSize(text, cv.FONT_HERSHEY_SIMPLEX, 0.75, 2)[0]
        cv.putText(image, text, (int((image.shape[1] - text_size[0]) / 2), 25),
                   cv.FONT_HERSHEY_SIMPLEX, 0.75, (153,0,153), 2, cv.LINE_AA)

        return image

    # TODO: make it to be a MODE
    def move_dot(self, label, image=None):
        """Move the dot according to the detected sign and optionally draw it.

        The first four sign labels mean, in order: up, down, right, left. Each
        step moves the dot by 0.01 (in relative frame coordinates) as long as
        it stays inside the frame.

        Args:
            label: the detected sign label.
            image: frame to draw the dot on; when omitted only ``DOT`` is updated.

        Returns:
            ``image`` (``None`` when no image was given).
        """
        # (axis, step) for the first four labels: up, down, right, left
        moves = ((1, -.01), (1, .01), (0, .01), (0, -.01))

        # zip stops early when fewer than four labels are configured
        for sign_label, (axis, step) in zip(hands.sign_labels, moves):
            if label != sign_label:
                continue
            can_move = self.DOT[axis] >= 0.01 if step < 0 else self.DOT[axis] <= 0.99
            if can_move:
                self.DOT[axis] += step
            break

        if image is None:
            return None

        cv.circle(image, (int(self.DOT[0] * image.shape[1]), int(self.DOT[1] * image.shape[0])), 5, (255,255,153), -1)

        return image

    def create_word_from_signs(self, label):
        """Append ``label`` to ``WORD`` while in word mode."""
        if self.MODE == 'w':
            self.WORD += label

    def draw_rectangle_around_hand(self, image, landmarks_dict, prediction):
        """Frame the detected hand and print the predicted sign next to it.

        Only active in detect ('d') and word ('w') mode; otherwise ``image`` is
        returned untouched. A prediction is accepted when its best score is
        above 0.8; in word mode an accepted label is appended to ``WORD``.
        """
        # `self.MODE == ('d' or 'w')` only ever compared against 'd'
        if self.MODE in ('d', 'w'):
            min_x, min_y, max_x, max_y = hands.find_min_and_max_for_x_and_y(landmarks_dict)
            # image.shape is (height, width, channels): x scales with the
            # width (shape[1]) and y with the height (shape[0])
            cv.rectangle(image,
                        (int(min_x * image.shape[1] - 10), int(min_y * image.shape[0] - 10)),
                        (int(max_x * image.shape[1] + 10), int(max_y * image.shape[0] + 10)),
                        (153,0,153), 2)

            if np.max(prediction) > 0.8:
                label, accuracy = hands.sign_labels[np.argmax(prediction)], np.max(prediction)
                self.create_word_from_signs(label)
            else:
                label, accuracy = 'Unknown sign', np.prod(1 - prediction) # FIXME if I change how the probabilities are calculated, I need to change this too

            # display the created word # TODO make it to be a MODE, does the same as 'd' + word logic
            cv.putText(image, self.WORD, (10, 50), cv.FONT_HERSHEY_SIMPLEX, 0.75, (153,0,153), 2, cv.LINE_AA)
            # TODO: enable once the dot is a MODE -> image = self.move_dot(label, image)

            # display the label and accuracy
            cv.putText(image, f'{label} ({accuracy:.2f})', (int(min_x * image.shape[1]), int(min_y * image.shape[0]) - 15),
                    cv.FONT_HERSHEY_SIMPLEX, 0.75, (153,0,153), 2, cv.LINE_AA)

        return image

In [14]:
# Shared UI state used by the camera loop.
status = FrameStatus()

# Pages of sign labels (built by hands.reshape_sign_labels()) used for sign selection.
status.sub_lists_of_signs = hands.reshape_sign_labels()

## Sign Detection Class

In [15]:
from keras.models import load_model

In [16]:
class Model:
    """Thin wrapper around the Keras model that classifies the landmarks."""

    def __init__(self, model_path) -> None:
        """Load the trained model stored at ``model_path``."""
        self.model = load_model(model_path)

    def make_prediction(self, normalized_landmarks_list, status):
        """Classify one flat landmark list when a prediction is needed.

        Args:
            normalized_landmarks_list: flat feature list of a single sample.
            status: frame status object; only its ``MODE`` is read.

        Returns:
            The output of ``self.model.predict`` for a batch of one sample in
            detect ('d') and word ('w') mode, otherwise a ``(1, 5)`` array of
            zeros (no sign reaches the acceptance threshold).
        """
        # `status.MODE == ('d' or 'w')` only ever compared against 'd',
        # so no prediction was made in word mode
        if status.MODE in ('d', 'w'):
            return self.model.predict(np.array([normalized_landmarks_list]))

        return np.zeros((1, 5))

In [17]:
# Raw string: same path as before, without the invalid "\m" escape sequence.
model = Model(r'.\models\model18_abc_2.h5')

## Camera Code

In [18]:
# Main loop: grab a frame, update the UI state from the keyboard, detect the
# hand landmarks, optionally save/classify them and show the annotated frame.
cap = cv.VideoCapture(0)

try:
    while True:
        frame_was_read, frame = cap.read()

        # a failed grab (camera missing, unplugged or busy) yields no frame;
        # passing it on would crash inside cv.cvtColor
        if not frame_was_read or frame is None:
            print('Could not read a frame from the camera.')
            break

        key_input = cv.waitKey(10)

        # quit camera
        if key_input == ord('q'):
            break

        status.set_status_mode(key_input)
        status.get_sign_from_key_input(key_input)
        frame, mediapipe_results = hands.mediapipe_detect(frame)
        frame = status.set_status_text(frame, key_input)

        if mediapipe_results.multi_hand_landmarks is not None:

            # make a dict of the basic coordinates of the landmarks
            landmarks_dict = hands.get_landmarks_as_dict(mediapipe_results) # values between 0 and 1, starting from the top left corner

            # make landmarks visible
            frame = hands.draw_landmarks(frame, mediapipe_results)

            # normalize landmarks to local(relative) axis and convert to 1d list
            normalized_landmarks_list = hands.normalize_landmarks(landmarks_dict)

            # save the last landmarks to a csv file
            hands.save_landmarks_to_csv_file(normalized_landmarks_list, key_input, status)

            # FIXME:crashes if there are 2 hands at the same time
            # make a prediction
            prediction = model.make_prediction(normalized_landmarks_list, status)

            # highlight the predicted sign
            frame = status.draw_rectangle_around_hand(frame, landmarks_dict, prediction)

        cv.imshow('App', frame) # window title idea: SIRS (Intelligent Sign Recognition System) ???
finally:
    # always free the camera and close the window, even if a frame failed
    cap.release()
    cv.destroyAllWindows()

1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
1
2
3
4
