# Interactive Object Detection Debugging
A simplified notebook for testing zero-shot object detection models (OWL-ViT and GroundingDINO) with interactive controls.


In [None]:
# Setup and imports
import os
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import pipeline, AutoProcessor, AutoModelForZeroShotObjectDetection
from PIL import Image
import ipywidgets as widgets
from IPython.display import display, clear_output
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Report whether PyTorch can see a CUDA device, and name device 0 if it can.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

%matplotlib inline


In [2]:
# Detection Models Setup
class DetectionManager:
    """Load the zero-shot detectors once and dispatch detection calls by name.

    ``detectors`` maps a model name to a callable detector and
    ``available_models`` lists the names that loaded successfully, in load
    order. Both are filled by ``setup_models``, which ``__init__`` calls
    immediately, so constructing the manager performs all model loading.
    """

    # Hugging Face checkpoint used for the GroundingDINO detector.
    GROUNDING_DINO_CHECKPOINT = "IDEA-Research/grounding-dino-tiny"
    # Only this many leading search terms are placed in the GroundingDINO prompt.
    GROUNDING_MAX_LABELS = 15
    # Score cut-off requested from post-processing and re-applied manually.
    GROUNDING_SCORE_THRESHOLD = 0.1

    def __init__(self):
        self.detectors = {}
        self.available_models = []
        self.setup_models()

    def create_owl_detector(self, model_name):
        """Create an OWL-ViT zero-shot detection pipeline for ``model_name``.

        Uses GPU 0 with float16 weights when CUDA is available, otherwise the
        CPU with float32 weights.
        """
        device = 0 if torch.cuda.is_available() else -1
        detector = pipeline(
            "zero-shot-object-detection",
            model=model_name,
            device=device,
            torch_dtype=torch.float16 if device >= 0 else torch.float32,
        )
        return detector

    @staticmethod
    def _post_process_grounding(processor, outputs, input_ids, target_sizes, threshold):
        """Run GroundingDINO post-processing across differing keyword names.

        The box-score keyword is tried as ``threshold`` first, then as
        ``box_threshold``, and finally the call is made with the library's
        default thresholds. A ``TypeError`` moves on to the next spelling; the
        last ``TypeError`` is re-raised if every attempt fails.

        Returns:
            The first (and only) per-image result dict.
        """
        keyword_variants = (
            {"threshold": threshold, "text_threshold": threshold},
            {"box_threshold": threshold, "text_threshold": threshold},
            {},
        )
        last_error = None
        for extra_kwargs in keyword_variants:
            try:
                return processor.post_process_grounded_object_detection(
                    outputs, input_ids, target_sizes=target_sizes, **extra_kwargs
                )[0]
            except TypeError as exc:
                last_error = exc
        raise last_error

    @staticmethod
    def _resolve_label(label, candidate_labels):
        """Turn a post-processed label into display text.

        String labels are returned stripped. Anything else is treated as an
        index into ``candidate_labels``; indices outside ``0..len-1``
        (including negative ones, which would otherwise wrap around) map to
        the generic ``"object"``.
        """
        if isinstance(label, str):
            return label.strip()
        label_idx = int(label)
        if 0 <= label_idx < len(candidate_labels):
            return candidate_labels[label_idx]
        return "object"

    def create_grounding_dino(self):
        """Create the GroundingDINO detector.

        Returns:
            A function ``grounding_detect(image_path, candidate_labels)`` that
            returns a list of ``{"box": {...}, "score": float, "label": str}``
            dicts, or ``None`` (after printing the reason) when the processor
            or model cannot be loaded.
        """
        try:
            # Run on the same kind of device the OWL-ViT pipelines use.
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            processor = AutoProcessor.from_pretrained(self.GROUNDING_DINO_CHECKPOINT)
            model = AutoModelForZeroShotObjectDetection.from_pretrained(
                self.GROUNDING_DINO_CHECKPOINT
            )
            model = model.to(device)

            max_labels = self.GROUNDING_MAX_LABELS
            threshold = self.GROUNDING_SCORE_THRESHOLD
            post_process = self._post_process_grounding
            resolve_label = self._resolve_label

            def grounding_detect(image_path, candidate_labels):
                # Decode fully as RGB inside the context manager so the file
                # handle is closed and alpha/grayscale images are normalised.
                with Image.open(image_path) as source_image:
                    image = source_image.convert("RGB")

                prompt_labels = candidate_labels[:max_labels]
                if len(candidate_labels) > len(prompt_labels):
                    # Make the prompt truncation visible instead of silent.
                    print(f"ℹ️ GroundingDINO: using the first {len(prompt_labels)} "
                          f"of {len(candidate_labels)} search terms")
                text_prompt = " . ".join(prompt_labels) + " ."
                inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(device)

                with torch.no_grad():
                    outputs = model(**inputs)

                # Post-processing expects (height, width); PIL reports (width, height).
                target_sizes = torch.tensor([image.size[::-1]])
                results = post_process(processor, outputs, inputs.input_ids, target_sizes, threshold)

                boxes = results.get("boxes")
                scores = results.get("scores")
                # Prefer the text form of the labels when the result provides one.
                labels = results.get("text_labels")
                if labels is None:
                    labels = results.get("labels")
                if boxes is None or scores is None or labels is None:
                    return []

                detections = []
                for box, score, label in zip(boxes, scores, labels):
                    score_value = float(score)
                    # Re-apply the cut-off in case post-processing fell back
                    # to its default thresholds.
                    if score_value < threshold:
                        continue
                    detections.append({
                        "box": {"xmin": float(box[0]), "ymin": float(box[1]),
                                "xmax": float(box[2]), "ymax": float(box[3])},
                        "score": score_value,
                        "label": resolve_label(label, candidate_labels),
                    })

                return detections

            return grounding_detect
        except Exception as e:
            print(f"GroundingDINO not available: {e}")
            return None

    def setup_models(self):
        """Load every supported model, recording the ones that succeed.

        A model that fails to load is reported and skipped rather than
        aborting the remaining loads.
        """
        owl_models = [
            "google/owlvit-base-patch16",
            "google/owlvit-base-patch32",
            "google/owlvit-large-patch14"
        ]

        for model_name in owl_models:
            try:
                self.detectors[model_name] = self.create_owl_detector(model_name)
                self.available_models.append(model_name)
                print(f"✅ Loaded {model_name}")
            except Exception as e:
                print(f"❌ Failed to load {model_name}: {e}")

        # Try GroundingDINO
        grounding_detector = self.create_grounding_dino()
        if grounding_detector is not None:
            self.detectors["GroundingDINO"] = grounding_detector
            self.available_models.append("GroundingDINO")
            print("✅ Loaded GroundingDINO")

    def detect(self, model_name, image_path, search_terms):
        """Run detection with the specified model.

        Args:
            model_name: Key in ``self.detectors``.
            image_path: Path of the image to analyse.
            search_terms: Candidate labels to look for.

        Returns:
            Whatever the selected detector returns for the image.

        Raises:
            ValueError: If ``model_name`` was not loaded.
        """
        if model_name not in self.detectors:
            raise ValueError(f"Model {model_name} not available")

        detector = self.detectors[model_name]

        # The GroundingDINO wrapper takes the labels positionally; the
        # pipelines take them as the ``candidate_labels`` keyword.
        if model_name == "GroundingDINO":
            return detector(image_path, search_terms)
        else:
            return detector(image_path, candidate_labels=search_terms)

# Initialize detection manager
# Constructing the manager calls setup_models(), which tries to load every
# OWL-ViT checkpoint plus GroundingDINO, so all model loading happens here.
detector_manager = DetectionManager()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


✅ Loaded google/owlvit-base-patch16


Device set to use cuda:0


✅ Loaded google/owlvit-base-patch32


Device set to use cuda:0


✅ Loaded google/owlvit-large-patch14
✅ Loaded GroundingDINO


In [3]:
# Visualization Functions
def visualize_detections(image, detections, title="Detections", confidence_threshold=0.1):
    """Draw detections on a copy of ``image`` and show the result with matplotlib.

    Args:
        image: Image array in OpenCV's BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before display). The input is not modified.
        detections: Dicts with "label", "score" and "box" keys, where "box"
            holds "xmin", "ymin", "xmax" and "ymax".
        title: Prefix for the figure title.
        confidence_threshold: Detections scoring below this are not drawn.

    Returns:
        The detections whose score is at least ``confidence_threshold``, in
        their input order.
    """
    # Drawn on the BGR canvas, so each tuple is read as (blue, green, red).
    palette = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0),
               (255, 0, 255), (0, 255, 255), (255, 128, 0), (128, 255, 0)]
    font = cv2.FONT_HERSHEY_SIMPLEX
    text_scale = 0.6
    text_thickness = 2
    pad = 5  # padding around the label text inside its banner

    img_vis = image.copy()
    filtered_dets = [d for d in detections if d["score"] >= confidence_threshold]
    label_to_color = {}

    for det in filtered_dets:
        label = det["label"]
        score = det["score"]
        box = det["box"]

        # Each distinct label gets the next palette colour, cycling when exhausted.
        if label not in label_to_color:
            label_to_color[label] = palette[len(label_to_color) % len(palette)]
        color = label_to_color[label]

        x1, y1, x2, y2 = (int(box[key]) for key in ("xmin", "ymin", "xmax", "ymax"))
        # Higher-confidence boxes get thicker outlines (never thinner than 2 px).
        thickness = max(2, int(score * 8))
        cv2.rectangle(img_vis, (x1, y1), (x2, y2), color, thickness)

        text = f"{label} {score:.3f}"
        text_w, text_h = cv2.getTextSize(text, font, text_scale, text_thickness)[0]
        banner_h = text_h + 2 * pad

        # The banner normally sits just above the box. When that would put it
        # above the top edge of the image it is drawn inside the box instead,
        # so the label stays readable rather than being clipped away.
        banner_top = y1 - banner_h
        if banner_top < 0:
            banner_top = max(y1, 0)
        banner_bottom = banner_top + banner_h

        cv2.rectangle(img_vis, (x1, banner_top), (x1 + text_w + 2 * pad, banner_bottom), color, -1)
        cv2.putText(img_vis, text, (x1 + pad, banner_bottom - pad), font,
                    text_scale, (255, 255, 255), text_thickness)

    img_rgb = cv2.cvtColor(img_vis, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(15, 10))
    plt.imshow(img_rgb)
    plt.title(f"{title} - {len(filtered_dets)} objects (threshold ≥ {confidence_threshold:.2f})")
    plt.axis('off')
    plt.show()

    return filtered_dets

def get_detection_summary(detections, confidence_threshold=0.1):
    """Summarise detections at or above ``confidence_threshold`` as text.

    Returns a header line with the number of kept detections followed by one
    line per label (most frequent first) giving its count and mean score, or
    a fixed message when nothing passes the threshold.
    """
    # Group the kept scores by label, remembering first-seen label order.
    scores_by_label = {}
    kept_total = 0
    for det in detections:
        if det["score"] >= confidence_threshold:
            scores_by_label.setdefault(det["label"], []).append(det["score"])
            kept_total += 1

    if kept_total == 0:
        return "No detections above threshold"

    # Stable sort: labels with equal counts keep their first-seen order.
    ranked = sorted(scores_by_label.items(), key=lambda item: len(item[1]), reverse=True)

    parts = [f"Found {kept_total} detections:\n"]
    parts.extend(
        f"  • {label}: {len(scores)} ({np.mean(scores):.3f} avg confidence)\n"
        for label, scores in ranked
    )
    return "".join(parts)


In [4]:
# Interactive Detection Interface
class InteractiveDetector:
    """Widget front-end for running one detector on one image at a time.

    The most recent raw detections are kept per model in
    ``current_detections``; ``current_model`` and ``current_image_path``
    record the last successful run.
    """

    # The per-detection table is printed only when at most this many are shown.
    DETAIL_ROW_LIMIT = 20

    def __init__(self, image_paths, search_terms):
        self.image_paths = image_paths
        self.search_terms = search_terms
        self.current_detections = {}
        self.current_model = None
        self.current_image_path = None

    @staticmethod
    def _top_detections(detections, threshold, max_dets):
        """Return up to ``max_dets`` detections scoring >= ``threshold``, best first.

        The input list is left untouched.
        """
        kept = [d for d in detections if d["score"] >= threshold]
        kept.sort(key=lambda d: d["score"], reverse=True)
        return kept[:max_dets]

    def create_interface(self):
        """Create, wire up and display the interactive widgets.

        Returns:
            ``(controls, output)``: the control panel and the output area.

        Raises:
            ValueError: If there are no images or no loaded models to offer.
        """
        # Fail with a clear message instead of an IndexError on the [0] lookups.
        if not self.image_paths:
            raise ValueError("No image paths were provided to InteractiveDetector")
        if not detector_manager.available_models:
            raise ValueError("No detection models are available")

        # Image selection dropdown
        image_options = [(os.path.basename(path), path) for path in self.image_paths]
        image_dropdown = widgets.Dropdown(
            options=image_options,
            value=self.image_paths[0],
            description='Image:',
            style={'description_width': 'initial'}
        )

        # Model selection dropdown
        model_dropdown = widgets.Dropdown(
            options=detector_manager.available_models,
            value=detector_manager.available_models[0],
            description='Model:',
            style={'description_width': 'initial'}
        )

        # Threshold slider
        threshold_slider = widgets.FloatSlider(
            value=0.15,
            min=0.01,
            max=1.0,
            step=0.01,
            description='Confidence Threshold:',
            readout_format='.3f',
            style={'description_width': 'initial'}
        )

        # Max detections slider
        max_detections_slider = widgets.IntSlider(
            value=50,
            min=1,
            max=200,
            step=1,
            description='Max Detections:',
            style={'description_width': 'initial'}
        )

        # Run button
        run_button = widgets.Button(
            description='🔍 Run Detection',
            button_style='success',
            layout=widgets.Layout(width='150px', height='40px')
        )

        # Output area
        output = widgets.Output()

        def run_detection(button):
            """Button callback: detect, plot and summarise inside ``output``."""
            with output:
                clear_output(wait=True)

                image_path = image_dropdown.value
                model_name = model_dropdown.value
                threshold = threshold_slider.value
                max_dets = max_detections_slider.value

                print(f"🔍 Running {model_name} detection...")
                print(f"📁 Image: {os.path.basename(image_path)}")
                print(f"📊 Threshold: {threshold:.3f}, Max detections: {max_dets}")

                try:
                    # Load image
                    image = cv2.imread(image_path)
                    if image is None:
                        print(f"❌ Error: Could not load image {image_path}")
                        return

                    # Run detection; the raw, unfiltered result is what gets stored.
                    detections = detector_manager.detect(model_name, image_path, self.search_terms)
                    self.current_detections[model_name] = detections
                    self.current_model = model_name
                    self.current_image_path = image_path

                    # Apply the threshold AND the max-detections limit up front
                    # so the plot, the summary and the table all agree.
                    above_threshold = sum(1 for d in detections if d["score"] >= threshold)
                    shown = self._top_detections(detections, threshold, max_dets)
                    if above_threshold > len(shown):
                        print(f"ℹ️ Showing the top {len(shown)} of {above_threshold} "
                              f"detections above threshold")

                    # Visualize results
                    visualize_detections(
                        image, shown,
                        title=f"{model_name} Detection Results",
                        confidence_threshold=threshold
                    )

                    # Show summary
                    print("📈 Detection Summary:")
                    print(get_detection_summary(shown, threshold))

                    # Show detailed results if not too many
                    if len(shown) <= self.DETAIL_ROW_LIMIT:
                        print("\n📋 Detailed Results:")
                        for rank, det in enumerate(shown, start=1):
                            box = det["box"]
                            print(f"  {rank:2d}. {det['label']:<20} {det['score']:.3f} "
                                  f"[{box['xmin']:.0f},{box['ymin']:.0f},{box['xmax']:.0f},{box['ymax']:.0f}]")

                except Exception as e:
                    print(f"❌ Error: {e}")

        # Connect button to function
        run_button.on_click(run_detection)

        # Layout
        controls = widgets.VBox([
            widgets.HBox([image_dropdown, model_dropdown]),
            widgets.HBox([threshold_slider, max_detections_slider]),
            run_button
        ])

        display(controls, output)

        return controls, output

# Confirms this cell ran; no widgets are built until create_interface() is called.
print("Interactive detector ready!")


Interactive detector ready!


In [None]:
# 🎯 MAIN INTERACTIVE DETECTION CELL
# Configure your settings below and run this cell

# === CONFIGURATION ===
# Add multiple image paths here
IMAGE_PATHS = [
    "/home/ut-ai/ai-works/adaptibot/yolo_detect/processed/images/Explorer_HD2K_SN36949228_15-51-04_R.png",
]

# File extensions (compared lower-case) treated as images during auto-discovery
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def discover_images(directory, known_paths=()):
    """Return image files in ``directory`` that are not in ``known_paths``.

    File names are matched case-insensitively against ``IMAGE_EXTENSIONS``
    and returned as full paths in sorted file-name order. A missing path, or
    one that is not a directory, yields an empty list instead of an error.
    """
    if not os.path.isdir(directory):
        return []

    # A set keeps the duplicate check constant-time per file.
    seen = set(known_paths)
    found = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue
        full_path = os.path.join(directory, filename)
        if full_path not in seen:
            seen.add(full_path)
            found.append(full_path)
    return found


# Auto-discover more images in the processed/images directory
processed_images_dir = "/home/ut-ai/ai-works/adaptibot/yolo_detect/processed/images/"
IMAGE_PATHS.extend(discover_images(processed_images_dir, IMAGE_PATHS))

# Optimized search terms (focused on what's likely in the image)
SEARCH_TERMS = [
    # Generic terms (work best)
    "toy", "object", "container", "item",
    
    # Specific objects
    "can", "soda can", "aluminum can", "beverage can", "cylinder",
    "duck", "rubber duck", "toy duck", "yellow duck", "bath duck",
    "cup", "mug", "glass", "drinking vessel",
    "sponge", "cleaning sponge", "rectangular object",
    "ball", "round ball", "sphere", "toy ball",
    "vegetable", "fruit", "food item"
]

# Summarise the configuration: image count, a preview of the first five
# file names, the number of search terms and the loaded models.
print("🎯 Interactive Detection Interface")
print(f"📁 Available images: {len(IMAGE_PATHS)} images")
for position, image_path in enumerate(IMAGE_PATHS[:5], start=1):
    print(f"  {position}. {os.path.basename(image_path)}")
hidden_count = len(IMAGE_PATHS) - 5
if hidden_count > 0:
    print(f"  ... and {hidden_count} more images")

print(f"🔍 Search terms: {len(SEARCH_TERMS)} terms")
print(f"🤖 Available models: {detector_manager.available_models}")

# Create and launch interactive detector
interactive_detector = InteractiveDetector(IMAGE_PATHS, SEARCH_TERMS)
controls, output = interactive_detector.create_interface()

# Usage notes, emitted as a single block of text below the widgets.
print("\n".join([
    "\n✨ Instructions:",
    "1. Select an image from the dropdown",
    "2. Select a model from the dropdown",
    "3. Adjust the confidence threshold slider",
    "4. Set maximum detections limit",
    "5. Click 'Run Detection' to see results",
    "6. Try different images and models to compare performance!",
]))


🎯 Interactive Detection Interface
📁 Available images: 2016 images
  1. Explorer_HD2K_SN36949228_15-51-04_R.png
  2. Explorer_HD2K_SN36949228_09-43-28_L.png
  3. Explorer_HD2K_SN36949228_09-43-28_R.png
  4. Explorer_HD2K_SN36949228_09-43-39_L.png
  5. Explorer_HD2K_SN36949228_09-43-39_R.png
  ... and 2011 more images
🔍 Search terms: 28 terms
🤖 Available models: ['google/owlvit-base-patch16', 'google/owlvit-base-patch32', 'google/owlvit-large-patch14', 'GroundingDINO']


VBox(children=(HBox(children=(Dropdown(description='Image:', options=(('Explorer_HD2K_SN36949228_15-51-04_R.pn…

Output()


✨ Instructions:
1. Select an image from the dropdown
2. Select a model from the dropdown
3. Adjust the confidence threshold slider
4. Set maximum detections limit
5. Click 'Run Detection' to see results
6. Try different images and models to compare performance!


: 