# In [None]:
# ----------------------------------------
# Import Libraries
# ----------------------------------------
import cv2  # OpenCV for computer vision tasks
import numpy as np  # NumPy for numerical operations

# ----------------------------------------
# Class labels, in the order of the class IDs reported by the detector
# ----------------------------------------
# 21 entries in total; index 0 is the "background" placeholder (not used)
CLASSES = [
    "background",    # 0
    "aeroplane",     # 1
    "bicycle",       # 2
    "bird",          # 3
    "boat",          # 4
    "bottle",        # 5
    "bus",           # 6
    "car",           # 7
    "cat",           # 8
    "chair",         # 9
    "cow",           # 10
    "diningtable",   # 11
    "dog",           # 12
    "horse",         # 13
    "motorbike",     # 14
    "person",        # 15
    "pottedplant",   # 16
    "sheep",         # 17
    "sofa",          # 18
    "train",         # 19
    "tvmonitor",     # 20
]

# ----------------------------------------
# One random drawing color per class
# Array shape: (len(CLASSES), 3), i.e. (21, 3) for the label list above
# Each row holds three channel values drawn uniformly between 0 and 255;
# the values are random, so distinct classes are not guaranteed distinct colors
COLORS = np.random.uniform(0, 255, (len(CLASSES), 3))

# ----------------------------------------
# Build the detector from its Caffe files
# ----------------------------------------
# First argument  : .prototxt describing the network architecture
# Second argument : .caffemodel holding the trained weights
# Both paths are relative, so they are resolved against the working directory.
net = cv2.dnn.readNetFromCaffe(
    r"MobileNetSSD_deploy.prototxt",
    r"MobileNetSSD_deploy.caffemodel",
)

# ----------------------------------------
# Open webcam stream (device 0 = default webcam)
cap = cv2.VideoCapture(0)  # Or you can pass a video file path here

# ----------------------------------------
# Loop through frames
# The try/finally guarantees that the capture device and the preview window
# are released even if inference/drawing raises or the user presses Ctrl+C.
try:
    while True:
        # Capture frame-by-frame: ret = success flag, frame = current image
        ret, frame = cap.read()

        if not ret:
            break  # Stop when no frame could be read (stream ended / no device)

        # The frame size is the same for every detection in this frame, so
        # build the (w, h, w, h) scaling vector once per frame rather than
        # once per detection.
        frame_h, frame_w = frame.shape[:2]
        box_scale = np.array([frame_w, frame_h, frame_w, frame_h])

        # ----------------------------------------
        # Create a blob from the image for DNN
        blob = cv2.dnn.blobFromImage(
            image=cv2.resize(frame, (300, 300)),  # Resize frame to 300x300
            scalefactor=1.0 / 127.5,              # Scale pixel values
            size=(300, 300),                      # Network input size
            mean=(127.5, 127.5, 127.5),           # Mean subtraction for normalization
            swapRB=False,                         # Do not swap the R and B channels
            crop=False                            # Don't crop image
        )

        # Pass the blob to the network and run a forward pass
        net.setInput(blob)
        detections = net.forward()  # Shape: [1, 1, N, 7]
        # 7 = [batch id, class id, confidence, xmin, ymin, xmax, ymax]

        # ----------------------------------------
        # Loop through each detection (detections[0, 0, i] = one detection)
        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]  # Index 2 = confidence (float)

            if confidence > 0.5:  # Only keep detections above 50% confidence
                class_index = int(detections[0, 0, i, 1])  # Index 1 = class ID
                label = CLASSES[class_index]
                color = COLORS[class_index]

                # Box values are ratios of the frame size -> scale to pixels.
                # .tolist() yields plain Python ints, which the OpenCV drawing
                # functions accept regardless of NumPy/OpenCV version.
                box = detections[0, 0, i, 3:7] * box_scale
                startX, startY, endX, endY = box.astype("int").tolist()

                # ----------------------------------------
                # Draw the bounding box
                cv2.rectangle(
                    img=frame,              # Image to draw on
                    pt1=(startX, startY),   # Top-left corner
                    pt2=(endX, endY),       # Bottom-right corner
                    color=color,            # Per-class color
                    thickness=2             # Line thickness
                )

                # Prepare text label: "class: confidence"
                label_text = f"{label}: {confidence:.2f}"

                # Put the text above the box when there is room, else inside it
                y = startY - 15 if (startY - 15) > 15 else (startY + 15)

                # ----------------------------------------
                # Draw the label text
                cv2.putText(
                    img=frame,                          # Frame to draw text on
                    text=label_text,                    # Label string
                    org=(startX, y),                    # Bottom-left corner of text
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,  # Font
                    fontScale=0.5,                      # Font size
                    color=color,                        # Font color
                    thickness=2                         # Font thickness
                )

        # ----------------------------------------
        # Display the frame
        cv2.imshow("Real-Time Object Detection", frame)

        # Press 'q' to exit the loop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # ----------------------------------------
    # Cleanup: always release the camera and close the window
    cap.release()             # Stop webcam
    cv2.destroyAllWindows()   # Close all OpenCV windows
