In [None]:
%pip install opencv-python numpy openvino-dev==2023.3.0 matplotlib ipywidgets pillow tkinter

In [None]:
import os
import cv2
import numpy as np
import time
import openvino as ov
from openvino import Core, PartialShape
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
import ipywidgets as widgets
import tkinter as tk
from tkinter import Tk, Label, Button, Frame
from PIL import Image, ImageTk


def inspect_model(model_path):
    #Inspect a model's inputs and outputs
    core = Core()
    try:
        model = core.read_model(model_path)
        
        compiled_model = core.compile_model(model, "CPU", {"PERFORMANCE_HINT": "THROUGHPUT"})
        
        print(f"Model: {os.path.basename(model_path)}")
        print("\nInputs:")
        for i, input_node in enumerate(compiled_model.inputs):
            print(f"  [{i}] Name: {input_node.get_any_name()}, partial shape: {input_node.partial_shape}, Type: {input_node.element_type}")
        
        print("\nOutputs:")
        for i, output_node in enumerate(compiled_model.outputs):
            print(f"  [{i}] Name: {output_node.get_any_name()}, partial shape: {input_node.partial_shape}, Type: {output_node.element_type}")
            
        return compiled_model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Inspect each model
model_dir = './models'
models = {
    "hand_detector": os.path.join(model_dir, "hand_detector.xml"),
    "landmark_detector": os.path.join(model_dir, "hand_landmarks_detector.xml"),
    "gesture_embedder": os.path.join(model_dir, "gesture_embedder.xml"),
    "gesture_classifier": os.path.join(model_dir, "canned_gesture_classifier.xml")
}



for name, path in models.items():
    print(f"\n{'='*50}\nInspecting {name}\n{'='*50}")
    inspect_model(path)



Inspecting hand_detector
Model: hand_detector.xml

Inputs:
  [0] Name: input_1, partial shape: [1,192,192,3], Type: <Type: 'float32'>

Outputs:
  [0] Name: Identity, partial shape: [1,192,192,3], Type: <Type: 'float32'>
  [1] Name: Identity_1, partial shape: [1,192,192,3], Type: <Type: 'float32'>

Inspecting landmark_detector
Model: hand_landmarks_detector.xml

Inputs:
  [0] Name: input_1, partial shape: [1,224,224,3], Type: <Type: 'float32'>

Outputs:
  [0] Name: Identity, partial shape: [1,224,224,3], Type: <Type: 'float32'>
  [1] Name: Identity_1, partial shape: [1,224,224,3], Type: <Type: 'float32'>
  [2] Name: Identity_2, partial shape: [1,224,224,3], Type: <Type: 'float32'>
  [3] Name: Identity_3, partial shape: [1,224,224,3], Type: <Type: 'float32'>

Inspecting gesture_embedder
Model: gesture_embedder.xml

Inputs:
  [0] Name: hand, partial shape: [1,21,3], Type: <Type: 'float32'>
  [1] Name: handedness, partial shape: [1,1], Type: <Type: 'float32'>
  [2] Name: world_hand, parti

In [8]:
import cv2
import numpy as np

def draw_landmarks(image, landmarks):
    # Draw landmarks
    h, w = image.shape[:2]
    for i in range(0, len(landmarks), 3):
        x_norm = landmarks[i]
        y_norm = landmarks[i+1]
        x = int(x_norm * w)
        y = int(y_norm * h)
        cv2.circle(image, (x, y), 5, (0, 255, 0), -1)
    return image

class GestureRecognizer:
    def __init__(self, model_dir):
        #Initialize gesture recognizer
        self.core = Core()
        
        # Model paths
        self.models = {
            "hand_detector": os.path.join(model_dir, "hand_detector.xml"),
            "landmark_detector": os.path.join(model_dir, "hand_landmarks_detector.xml"),
            "gesture_embedder": os.path.join(model_dir, "gesture_embedder.xml"),
            "gesture_classifier": os.path.join(model_dir, "canned_gesture_classifier.xml")
        }
        
        # Load hand detector
        try:
            self.hand_detector = self.core.compile_model(self.models["hand_detector"], "CPU")
            self.hand_input = self.hand_detector.input(0)
            self.hand_outputs = []
            for i in range(len(self.hand_detector.outputs)):
                self.hand_outputs.append(self.hand_detector.output(i))
            print("Hand Detector Model Loaded Successfully!")
            
            # Load landmark detector
            self.landmark_detector = self.core.compile_model(self.models["landmark_detector"], "CPU")
            
            self.landmark_input = self.landmark_detector.input(0)
            self.landmark_outputs = []
            for i in range(len(self.landmark_detector.outputs)):
                self.landmark_outputs.append(self.landmark_detector.output(i))
            print("Landmark Detector Model Loaded Successfully!")
            
            # Load gesture embedder
            self.embedder = self.core.compile_model(self.models["gesture_embedder"], "CPU")
            self.embedder_output = self.embedder.output(0)
            print("Gesture Embedder Model Loaded Successfully!")
            
            # Load gesture classifier
            self.classifier = self.core.compile_model(self.models["gesture_classifier"], "CPU")
            self.classifier_input = self.classifier.input(0)
            self.classifier_output = self.classifier.output(0)
            print("Gesture Classifier Model Loaded Successfully!")
            
        except Exception as e:
            print(f"Error loading models: {e}")
            raise
        
        self.gesture_labels = ["Unknown", "Closed_Fist", "Open_Palm", "Pointing_Up", "Thumb_Down", "Thumb_Up", "Victory", "ILoveYou"]
        
        # Smoothing predictions
        self.gesture_history = []
        self.history_size = 5
    
    def preprocess_image(self, frame, input_shape = (1,192,192,3)):
        #Preprocess image
        if frame is None or frame.size == 0:
            return None
            
        h, w = int(input_shape[1]), int(input_shape[2])
        resized = cv2.resize(frame, (w, h))
        tensor = np.expand_dims(resized, axis=0).astype(np.float32)
        tensor = tensor / 255.0
        
        return tensor

    
    def detect_landmarks(self, frame, hand_bbox):
        #Detect hand landmarks
        if frame is None or hand_bbox is None:
            return None

        try:
            x_min, y_min, x_max, y_max = hand_bbox
            
            # Validate coordinates
            x_min, y_min = max(0, x_min), max(0, y_min)
            x_max, y_max = min(frame.shape[1], x_max), min(frame.shape[0], y_max)
            
            # Extract hand region
            hand_roi = frame[y_min:y_max, x_min:x_max]
            
            if hand_roi.size == 0:
                print("Empty hand ROI")
                return None

            input_tensor = self.preprocess_image(hand_roi, (1,224,224,3))
            
            # Run inference
            results = self.landmark_detector([input_tensor])
            
            print("Landmark detector output shapes:")
            for i, output in enumerate(self.landmark_outputs):
                print(f"  Output {i}: {results[output].shape}")
        
            
            try:
                hand_landmarks_flat = np.squeeze(results[self.landmark_outputs[0]])
                world_landmarks_flat = np.squeeze(results[self.landmark_outputs[3]])
                handedness = results[self.landmark_outputs[1]]
                
                print(f"Hand landmarks shape after squeeze: {hand_landmarks_flat.shape}")
                print(f"World landmarks shape after squeeze: {world_landmarks_flat.shape}")
                
                if hand_landmarks_flat.size == 63:
                    hand_landmarks = hand_landmarks_flat.reshape(21, 3)
                else:
                    print(f"Wrong size for hand landmarks: {hand_landmarks_flat.size}")
                    return None, None, None
                    
                if world_landmarks_flat.size == 63:
                    world_landmarks = world_landmarks_flat.reshape(21, 3)
                else:
                    print(f"Wrong size for world landmarks: {world_landmarks_flat.size}")
                    return None, None, None
                    
            except Exception as reshape_error:
                print(f"Error reshaping landmarks: {reshape_error}")
                return None, None, None
            

            roi_height, roi_width = hand_roi.shape[:2]
            hand_landmarks[:, 0] = hand_landmarks[:, 0] * roi_width + x_min
            hand_landmarks[:, 1] = hand_landmarks[:, 1] * roi_height + y_min

            

            print("Hand landmarks (pixel coordinates):")
            for i, (x, y, z) in enumerate(hand_landmarks):
                print(f"  Landmark {i}: ({x:.2f}, {y:.2f}, {z:.2f})")

            return hand_landmarks, world_landmarks, handedness
                
        except Exception as e:
            print(f"Error in landmark detection: {e}")
            import traceback
            traceback.print_exc()
            return None, None, None

    
    def recognize_gesture(self, hand_landmarks, world_landmarks, handedness):
        #Classify gestures
        try:
            if hand_landmarks is None:
                return "Unknown"
                
            hand_tensor = np.expand_dims(hand_landmarks, axis=0).astype(np.float32)
            handedness_tensor = handedness.astype(np.float32)
            world_tensor = np.expand_dims(world_landmarks, axis=0).astype(np.float32)

            print(f"Embedder input shapes:")
            print(f"  hand_landmarks: {hand_tensor.shape}")
            print(f"  handedness: {handedness_tensor.shape}")
            print(f"  world_landmarks: {world_tensor.shape}")

            inputs = {
            self.embedder.input(0): hand_tensor,
            self.embedder.input(1): handedness_tensor,
            self.embedder.input(2): world_tensor
            }

            
            embedding = self.embedder(inputs)[self.embedder_output]
            
            # Classify embedding
            classifier_output = self.classifier([embedding])[self.classifier_output]
            
            # Get predicted gesture
            gesture_id = np.argmax(classifier_output[0])
            confidence = float(classifier_output[0][gesture_id])

            print(f"Predicted gesture: {gesture_id} (confidence: {confidence:.2f})")
            
            print("Gesture confidences:")
            for i, conf in enumerate(classifier_output[0]):
                print(f"  Gesture {i}: {conf:.2f}")
        
            # Smoothing
            if confidence > 0.5:
                self.gesture_history.append(gesture_id)
                if len(self.gesture_history) > self.history_size:
                    self.gesture_history.pop(0)
                
                if self.gesture_history:
                    counts = np.bincount(self.gesture_history)
                    smoothed_id = np.argmax(counts)
                    
                    if smoothed_id < len(self.gesture_labels):
                        return self.gesture_labels[smoothed_id]
            
            return "Unknown"
            
        except Exception as e:
            print(f"Error in gesture recognition: {e}")
            import traceback
            traceback.print_exc()
            return "Error"

    
    def detect_hands(self, frame):
        #Detect hands
        if frame is None:
            return []
        
        # Preprocess
        input_tensor = self.preprocess_image(frame, (1, 192, 192, 3))
        if input_tensor is None:
            print("Preprocessing failed")
            return []
        
        # Inference
        results = self.hand_detector(input_tensor)
        
        boxes = results[self.hand_outputs[0]]
        scores = results[self.hand_outputs[1]]
        
        print(f"Hand detector output shapes: boxes: {boxes.shape}, scores: {scores.shape}")
        # Process results
        hands = []
        frame_h, frame_w = frame.shape[:2]
        
        flat_scores = scores.flatten()
        top_indices = np.argsort(flat_scores)[-5:]  # Top 5 detections
        
        for idx in top_indices:
            score = flat_scores[idx]
            if score > 0.5:  # Confidence threshold
                box = boxes[0, idx]
                
                x_coords = box[0::2]
                y_coords = box[1::2]
                
                x_coords_pixels = x_coords * frame_w
                y_coords_pixels = y_coords * frame_h

                # Debug prints
                print(f"Normalized x_coords: {x_coords[:5]}")
                print(f"Pixel x_coords: {x_coords_pixels[:5]}")

                print(f"Normalized y_coords: {y_coords[:5]}")
                print(f"Pixel y_coords: {y_coords_pixels[:5]}")

                
                x_min = max(0, int(np.min(x_coords_pixels)))
                y_min = max(0, int(np.min(y_coords_pixels)))
                x_max = min(frame_w, int(np.max(x_coords_pixels)))
                y_max = min(frame_h, int(np.max(y_coords_pixels)))
                
                # Padding
                padding = 20
                x_min = max(0, x_min - padding)
                y_min = max(0, y_min - padding)
                x_max = min(frame_w, x_max + padding)
                y_max = min(frame_h, y_max + padding)
                
                if x_max > x_min and y_max > y_min:
                    hands.append({
                        "bbox": (x_min, y_min, x_max, y_max),
                        "score": float(score),
                        "landmarks": list(zip(x_coords_pixels, y_coords_pixels))
                    })

        print(f"Detected hands: {len(hands)}")
        for i, hand in enumerate(hands):
            print(f"  Hand {i}: bbox={hand['bbox']}, score={hand['score']}")
    
        return hands
            
    def process_frame(self, frame):
        #Process frame
        if frame is None:
            return frame, "None"
        
        output_frame = frame.copy()
        
        try:
            hands = self.detect_hands(frame)
            
            if not hands:
                print("No hands detected")
                return output_frame, "No Hand"
                
            hand = max(hands, key=lambda x: x["score"])
            
            x_min, y_min, x_max, y_max = hand["bbox"]
            cv2.rectangle(output_frame, (x_min, y_min), (x_max, y_max), 
                        (0, 255, 0), 2)
            
            for i, (x, y) in enumerate(hand["landmarks"]):
                cv2.circle(output_frame, (int(x), int(y)), 5, (0, 0, 255), -1)
            
            result = self.detect_landmarks(frame, hand["bbox"])
            
            gesture = "Hand Detected"
            
            if result is not None:
                hand_landmarks, world_landmarks, handedness = result
                
                if hand_landmarks is not None and world_landmarks is not None and handedness is not None:
                    for i, point in enumerate(hand_landmarks):
                        x, y = int(point[0]), int(point[1])
                        cv2.circle(output_frame, (x, y), 3, (0, 255, 255), -1)
                    
                    gesture = self.recognize_gesture(hand_landmarks, world_landmarks, handedness)
                    output_frame = draw_landmarks(output_frame, hand_landmarks.flatten())
            
            cv2.putText(output_frame, f"Gesture: {gesture}", (x_min, y_min - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
            
            return output_frame, gesture
                
        except Exception as e:
            print(f"Error processing frame: {str(e)}")
            cv2.putText(output_frame, f"Error: {str(e)[:30]}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
            return output_frame, "Error"

In [9]:
import tkinter as tk

class SimpleCameraGUI:
    def __init__(self, recognizer):
        #Initialize GUI
        self.recognizer = recognizer
        self.root = tk.Tk()
        self.root.title("Gesture Recognition Demo")
        self.root.geometry("800x900")
        
        # Camera feed frame
        self.video_frame = tk.Frame(self.root, width=640, height=480)
        self.video_frame.pack(pady=20)
        
        # Video label
        self.video_label = tk.Label(self.video_frame)
        self.video_label.pack()
        
        # Control frame
        self.control_frame = tk.Frame(self.root)
        self.control_frame.pack(pady=10)
        
        # Start/stop button
        self.camera_button = tk.Button(self.control_frame, text="Start Camera", 
                                     command=self.toggle_camera)
        self.camera_button.pack(side=tk.LEFT, padx=10)
        
        # Exit button
        self.exit_button = tk.Button(self.control_frame, text="Exit", 
                                    command=self.root.destroy)
        self.exit_button.pack(side=tk.LEFT, padx=10)
        
        # Status label
        self.status_label = tk.Label(self.root, text="Ready", font=("Arial", 14))
        self.status_label.pack(pady=10)
        
        # Camera variables
        self.cap = None
        self.is_running = False
    
    def toggle_camera(self):
        #Toggle camera
        if not self.is_running:
            self.start_camera()
        else:
            self.stop_camera()
    
    def start_camera(self):
        #Start camera feed
        self.cap = cv2.VideoCapture(0)
        if not self.cap.isOpened():
            self.status_label.config(text="Error: Could not open camera")
            return
            
        self.is_running = True
        self.camera_button.config(text="Stop Camera")
        self.status_label.config(text="Camera running")
        self.update_frame()
    
    def stop_camera(self):
        """Stop camera feed"""
        self.is_running = False
        if self.cap:
            self.cap.release()
            self.cap = None
        self.camera_button.config(text="Start Camera")
        self.status_label.config(text="Camera stopped")
    
    def update_frame(self):
        """Update video frame"""
        if self.is_running and self.cap:
            ret, frame = self.cap.read()
            if ret:
                # Process frame
                processed_frame, gesture = self.recognizer.process_frame(frame)
                
                # Update status
                self.status_label.config(text=f"Detected: {gesture}")
                
                rgb_frame = cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(rgb_frame)
                img_tk = ImageTk.PhotoImage(image=pil_img)
                
                self.video_label.img_tk = img_tk  # Keep reference
                self.video_label.config(image=img_tk)
                
                self.root.after(33, self.update_frame)  # ~30fps
            else:
                self.status_label.config(text="Error reading from camera")
                self.stop_camera()
    
    def run(self):
        """Run application"""
        self.root.mainloop()
        # Clean up
        self.stop_camera()

In [10]:
try:
    print("Initializing gesture recognizer...")
    recognizer = GestureRecognizer('./models')
    
    print("\nStarting camera interface...")
    gui = SimpleCameraGUI(recognizer)
    gui.run()
    
except Exception as e:
    print(f"\nFATAL ERROR: {str(e)}")
    import traceback
    traceback.print_exc()

Initializing gesture recognizer...
Hand Detector Model Loaded Successfully!
Landmark Detector Model Loaded Successfully!
Gesture Embedder Model Loaded Successfully!
Gesture Classifier Model Loaded Successfully!

Starting camera interface...
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Hand detector output shapes: boxes: (1, 2016, 18), scores: (1, 2016, 1)
Detected hands: 0
No hands detected
Han

Traceback (most recent call last):
  File "/tmp/ipykernel_146888/986801360.py", line 7, in <module>
    gui.run()
  File "/tmp/ipykernel_146888/2052317557.py", line 96, in run
    self.stop_camera()
  File "/tmp/ipykernel_146888/2052317557.py", line 66, in stop_camera
    self.camera_button.config(text="Start Camera")
  File "/usr/lib/python3.12/tkinter/__init__.py", line 1721, in configure
    return self._configure('configure', cnf, kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/tkinter/__init__.py", line 1711, in _configure
    self.tk.call(_flatten((self._w, cmd)) + self._options(cnf))
_tkinter.TclError: invalid command name ".!frame2.!button"
