# In [1]:
import cv2
import mediapipe as mp

from threading import Thread

from model import TorchVisionModel
from ssd_mobilenetv3 import SSDMobilenet
import numpy as np
import torch
from PIL import Image, ImageOps
from torch import Tensor
from torchvision.transforms import functional as f
import os
from typing import Optional, Tuple

import time



# MediaPipe solution modules shared by the pose and hand detectors.
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Global Variables
# flag_wait is only referenced by commented-out "wait for race" snippets in
# the visible part of this file.
flag_wait = 0

# Detector instances are created once here and reused for every frame.
pose = mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5)
hands = mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5)

# Pose Detection Function

def Pose_detection(result, index, frame=None):
    """Run MediaPipe pose estimation and store the parameters needed to draw it.

    Parameters
    ----------
    result : list
        Shared output list; ``result[index]`` is overwritten with
        ``[pose_landmarks, POSE_CONNECTIONS, landmark_style]``.
        ``pose_landmarks`` is whatever ``pose.process`` reported
        (falsy when no pose was found).
    index : int
        Slot of ``result`` to write to.
    frame : np.ndarray, optional
        BGR frame to analyse. When omitted, the module-level ``image``
        (the frame most recently read by the capture loop) is used, which
        is the behaviour existing callers rely on.
    """
    source = image if frame is None else frame

    # The frames in this script are BGR (see Demo.preprocess), while
    # MediaPipe expects RGB input, so convert before processing.
    results = pose.process(cv2.cvtColor(source, cv2.COLOR_BGR2RGB))

    # Return draw params
    result[index] = [
        results.pose_landmarks,
        mp_pose.POSE_CONNECTIONS,
        mp_drawing_styles.get_default_pose_landmarks_style(),
    ]

    
# Hands Detection function     
def Hands_detection(result, index, frame=None):
    """Run MediaPipe hand tracking and store the parameters needed to draw it.

    Parameters
    ----------
    result : list
        Shared output list; ``result[index]`` is overwritten with
        ``[multi_hand_landmarks, HAND_CONNECTIONS]``.
        ``multi_hand_landmarks`` is whatever ``hands.process`` reported
        (falsy when no hand was found).
    index : int
        Slot of ``result`` to write to.
    frame : np.ndarray, optional
        BGR frame to analyse. When omitted, the module-level ``image``
        (the frame most recently read by the capture loop) is used, which
        is the behaviour existing callers rely on.
    """
    source = image if frame is None else frame

    # The frames in this script are BGR (see Demo.preprocess), while
    # MediaPipe expects RGB input, so convert before processing.
    results = hands.process(cv2.cvtColor(source, cv2.COLOR_BGR2RGB))

    result[index] = [results.multi_hand_landmarks, mp_hands.HAND_CONNECTIONS]

# Classification Detection function    
# Maps the detector's integer class ids to gesture names. Ids start at 1;
# _load_model sizes the network with len(targets) + 1 classes, so id 0 is
# presumably reserved (e.g. background) -- confirm against the model.
targets = dict(
    enumerate(
        (
            "call",
            "dislike",
            "fist",
            "four",
            "like",
            "mute",
            "ok",
            "one",
            "palm",
            "peace",
            "rock",
            "stop",
            "stop inverted",
            "three",
            "two up",
            "two up inverted",
            "three2",
            "peace inverted",
            "no gesture",
        ),
        start=1,
    )
)
# Overlay styling constants (not referenced in the visible part of this file).
FONT = cv2.FONT_HERSHEY_SIMPLEX
COLOR = (0, 255, 0)  # pure green; the middle channel is G in both BGR and RGB

class Demo:
    """Stateless helpers that run the gesture detector on a single frame."""

    @staticmethod
    def preprocess(img: np.ndarray) -> Tuple[Tensor, Tuple[int, int], Tuple[int, int]]:
        """
        Preproc image for model input

        The BGR image is converted to RGB, padded to a square whose side is
        the longer edge, resized to 320x320, converted with
        ``convert_image_dtype`` (library default dtype) and given a leading
        batch axis.

        Parameters
        ----------
        img: np.ndarray
            input image (BGR channel order)

        Returns
        -------
        Tuple[Tensor, Tuple[int, int], Tuple[int, int]]
            The model input tensor, the original ``(width, height)`` and the
            ``(width, height)`` after padding.
        """
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(img)
        width, height = image.size

        # Pad to a square first so the resize below keeps the aspect ratio.
        side = max(width, height)
        image = ImageOps.pad(image, (side, side))
        padded_width, padded_height = image.size
        image = image.resize((320, 320))

        img_tensor = f.pil_to_tensor(image)
        img_tensor = f.convert_image_dtype(img_tensor)
        img_tensor = img_tensor[None, :, :, :]  # add the batch axis
        return img_tensor, (width, height), (padded_width, padded_height)

    @staticmethod
    def run(
        detector: TorchVisionModel,
        num_hands: int = 2,
        threshold: float = 0.5,
        landmarks: bool = False,
        frame: Optional[np.ndarray] = None,
    ) -> list:
        """
        Run the detector on one frame and report a recognised gesture.

        Parameters
        ----------
        detector: TorchVisionModel
            Callable model; ``detector(batch)[0]`` must be a mapping with
            ``"boxes"``, ``"scores"`` and ``"labels"`` entries.
        num_hands: int
            Only the first ``num_hands`` detections are considered.
        threshold: float
            A detection counts only when its score is strictly greater.
        landmarks: bool
            Accepted for interface compatibility; currently unused.
        frame: np.ndarray
            BGR image to analyse.

        Returns
        -------
        list
            ``["NO GESTURES"]`` when no considered detection exceeds
            ``threshold``; otherwise ``[gesture_name, score]`` for the last
            qualifying detection in the detector's output order. ``score``
            is the element taken from ``output["scores"]`` unchanged.

        Raises
        ------
        ValueError
            If ``frame`` is None.
        KeyError
            If a qualifying label id is missing from ``targets``.
        """
        if frame is None:
            raise ValueError("frame must be a BGR image array, got None")

        processed_frame, _size, _padded_size = Demo.preprocess(frame)

        with torch.no_grad():
            output = detector(processed_frame)[0]
        boxes = output["boxes"][:num_hands]
        scores = output["scores"][:num_hands]
        labels = output["labels"][:num_hands]

        # Processing the data
        result = ["NO GESTURES"]
        for i in range(min(num_hands, len(boxes))):
            if scores[i] > threshold:
                # Later qualifying detections overwrite earlier ones.
                result = [targets[int(labels[i])], scores[i]]

        return result

def _load_model(model_path: str, device: str) -> TorchVisionModel:
    ssd_mobilenet = SSDMobilenet(num_classes=len(targets) + 1)
    if not os.path.exists(model_path):
        logging.info(f"Model not found {model_path}")
        raise FileNotFoundError

    ssd_mobilenet.load_state_dict(model_path, map_location=device)
    ssd_mobilenet.eval()
    return ssd_mobilenet

# Load the gesture detector once at import time, mapped to the CPU.
model = _load_model(model_path=os.path.expanduser("SSDLite.pth"), device="cpu")


def Classification(image, result, index):
    """Classify the gesture in a frame, store the outcome and print it.

    Parameters
    ----------
    image : np.ndarray
        BGR frame handed to ``Demo.run``.
    result : list
        Shared output list; ``result[index]`` is overwritten with the list
        returned by ``Demo.run`` (``["NO GESTURES"]`` or ``[name, score]``).
    index : int
        Slot of ``result`` to write to.
    """
    # Classification Result
    detection = Demo.run(model, num_hands=100, threshold=0.75, landmarks=False, frame=image)

    # Publish the outcome in the caller's slot, like the other detector
    # functions do, instead of only rebinding a local name.
    result[index] = detection

    # Check this call's own outcome, not the caller's global results list.
    if detection[0] != "NO GESTURES":
        print(detection[0])
    
# Main capture loop: per frame, run pose, hand and gesture detection in
# parallel threads, draw the landmarks and show the frame until an exit key
# (Esc, q, x or c) is pressed.
EXIT_KEYS = {27, ord('q'), ord('x'), ord('c')}
HAND_DRAWING_SPEC = mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=2, circle_radius=4)

cap = cv2.VideoCapture(0)
try:
    while cap.isOpened():
        success, image = cap.read()
        if not success or image is None:
            # A failed read yields no frame; stop cleanly instead of
            # crashing on the attribute access below.
            print("Failed to read a frame from the camera.")
            break
        image.flags.writeable = True

        # One result slot per worker: 0 = pose, 1 = hands, 2 = classifier.
        results = [None] * 3

        # Define Threads
        # Pose and hands read the module-level `image`. The classifier gets
        # its own copy because it is still running while landmarks are
        # drawn onto `image` below.
        thread_list = [
            Thread(target=Pose_detection, args=(results, 0)),
            Thread(target=Hands_detection, args=(results, 1)),
            Thread(target=Classification, args=(image.copy(), results, 2)),
        ]

        # Start Threads
        for worker in thread_list:
            worker.start()

        # Join only Detectors
        thread_list[0].join()
        thread_list[1].join()

        # Draw Image
        # A slot is still None if its worker raised, so check it first.
        pose_result = results[0]
        if pose_result and pose_result[0]:
            mp_drawing.draw_landmarks(
                image,
                pose_result[0],
                pose_result[1],
                landmark_drawing_spec=pose_result[2])

        hands_result = results[1]
        if hands_result and hands_result[0]:
            for hand in hands_result[0]:
                mp_drawing.draw_landmarks(
                    image,
                    hand,
                    hands_result[1],
                    HAND_DRAWING_SPEC,
                    HAND_DRAWING_SPEC,
                )

        # Display
        cv2.imshow('Hand Tracking', image)

        # Join classifier thread
        thread_list[2].join()

        key = cv2.waitKey(5) & 0xFF
        if key in EXIT_KEYS:
            break
finally:
    # Release the camera and close the window on every exit path.
    cap.release()
    cv2.destroyAllWindows()


# Notebook output: ModuleNotFoundError: No module named 'mediapipe'