In [None]:
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
import cv2

# Compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available

In [1]:
# One checkpoint name feeds both the image processor and the detector
DETR_CHECKPOINT = "facebook/detr-resnet-50"

processor = DetrImageProcessor.from_pretrained(DETR_CHECKPOINT)
model = DetrForObjectDetection.from_pretrained(DETR_CHECKPOINT)
model = model.to(device)

# Put the model in evaluation mode; as the cell's last expression this also
# displays the module structure
model.eval()

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): FeatureListNet(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): DetrFrozenBatchNorm2d()
          (act1): ReLU(inplace=True)
          (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (layer1): Sequential(
            (0): Bottleneck(
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn1): DetrFrozenBatchNorm2d()
              (act1): ReLU(inplace=True)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn2): DetrFrozenBatchNorm2d()
              (drop_block): Identity()
              (act2): ReLU(inplace=True)
              (aa): Identity()
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

# Baseline model

In [2]:
# Function to draw bounding boxes on the image
def draw_boxes(image, boxes, scores, labels, score_threshold=0.5, id2label=None):
    """Draw labelled detection boxes on ``image`` in place and return it.

    Args:
        image: Frame handed to ``cv2.rectangle`` / ``cv2.putText``; it is
            modified in place.
        boxes: Iterable of ``(x1, y1, x2, y2)`` corner coordinates in pixels.
        scores: Iterable of confidence scores, aligned with ``boxes``.
        labels: Iterable of class ids, aligned with ``boxes``. Each entry may
            be a plain int or an object exposing ``.item()``.
        score_threshold: Only detections with ``score > score_threshold`` are
            drawn. Defaults to 0.5.
        id2label: Optional mapping from class id to display name. When omitted,
            the notebook-level ``model.config.id2label`` is used.

    Returns:
        The same ``image`` object that was passed in.
    """
    for box, score, label in zip(boxes, scores, labels):
        if not (score > score_threshold):  # Filter out low-confidence detections
            continue

        # Resolve the label map lazily so the global model is only touched
        # when there is actually something to draw.
        if id2label is None:
            id2label = model.config.id2label

        class_id = label.item() if hasattr(label, "item") else label
        left, top, right, bottom = (int(coord) for coord in box)

        cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)

        # Keep the caption inside the frame for boxes touching the top edge.
        text_y = max(top - 10, 12)
        cv2.putText(image, f"{id2label[class_id]}: {float(score):.2f}",
                    (left, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
    return image

# Open the video
video_path = 'video/solidWhiteRight.mp4'
cap = cv2.VideoCapture(video_path)

# Fail loudly instead of silently skipping the loop when the file is missing
if not cap.isOpened():
    raise IOError(f"Could not open video: {video_path}")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break  # End of the video (or a read failure)

        # Process the image: convert OpenCV's BGR frame to RGB and move the
        # tensors to the same device as the model
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        # Get bounding boxes, scores, and labels, rescaled to the original
        # (height, width) of the frame
        target_sizes = torch.tensor([frame.shape[:2]])
        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

        # Draw the bounding boxes on the original (BGR) frame
        frame = draw_boxes(frame, results["boxes"], results["scores"], results["labels"])

        # Display the frame with detections
        cv2.imshow('DETR Object Detection', frame)

        # Press 'q' to stop early
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture handle and close the preview window,
    # even if inference raised part-way through the video
    cap.release()
    cv2.destroyAllWindows()

# OOD Detection with MSP

In [None]:
# Step 1: Open the video file
video_path = "video/solidWhiteRight.mp4"
cap = cv2.VideoCapture(video_path)

# Check if the video opened successfully.
# Raise instead of calling exit(), which would end the whole interpreter
# session rather than just stopping this cell.
if not cap.isOpened():
    raise IOError("Error: Could not open video.")

# Get video properties for output
fps = cap.get(cv2.CAP_PROP_FPS)  # Frames per second
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Step 2: Create VideoWriter to save the output video
# Forward slash: "\o" is not a valid string escape and a backslash is only a
# path separator on Windows.
output_path = "output/output_msp_video.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
    cap.release()
    raise IOError(f"Could not open video writer for {output_path}; "
                  "check that the 'output' directory exists.")

# Define a threshold for OOD detection: detections whose maximum softmax
# probability falls below this value are labelled OOD
ood_threshold = 0.5

# Factors that turn normalized (cx, cy, w, h) boxes into pixel units
box_scale = [frame_width, frame_height, frame_width, frame_height]

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break  # Exit the loop if the video has ended

        # Step 3: Prepare the input frame
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB
        inputs = processor(images=image, return_tensors="pt").to(device)  # Same device as the model

        # Step 4: Perform object detection using DETR
        with torch.no_grad():
            outputs = model(**inputs)

        # Logits and bounding boxes
        logits = outputs.logits
        pred_boxes = outputs.pred_boxes  # Normalized bounding boxes (batch_size x num_queries x 4)

        # Step 5: Apply Maximum Softmax Probability for OOD detection
        softmax_probs = logits.softmax(dim=-1)
        max_softmax_probs, predicted_classes = softmax_probs.max(dim=-1)

        # Class ID for "no-object": the last logit index (91 for this checkpoint)
        no_object_class_id = logits.shape[-1] - 1

        # Move the per-query results to the CPU once per frame instead of
        # indexing device tensors inside the drawing loop
        boxes_np = pred_boxes[0].cpu().numpy()
        msp_values = max_softmax_probs[0].cpu().tolist()
        class_ids = predicted_classes[0].cpu().tolist()

        # Prepare for visualization
        for box, msp, pred_class in zip(boxes_np, msp_values, class_ids):
            if pred_class == no_object_class_id:
                continue  # Skip no-object predictions

            # Bounding box coordinates (scaled from normalized to image dimensions)
            x_center, y_center, box_width, box_height = box * box_scale
            x_min = int(x_center - 0.5 * box_width)
            y_min = int(y_center - 0.5 * box_height)

            # Determine if the object is OOD or ID
            ood_flag = msp < ood_threshold
            label = "OOD" if ood_flag else "ID"
            color = (0, 0, 255) if ood_flag else (0, 255, 0)  # Red for OOD, Green for ID

            # Draw the bounding box
            cv2.rectangle(frame, (x_min, y_min), (x_min + int(box_width), y_min + int(box_height)), color, 2)
            # Annotate the object
            cv2.putText(frame, f'{label} ({msp:.2f})',
                        (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Step 6: Write the processed frame to the output video
        out.write(frame)
finally:
    # Release the video capture and writer even if a frame failed, so the
    # output file is finalized and the input handle is freed
    cap.release()
    out.release()
    cv2.destroyAllWindows()


# OOD Detection with Energy Score

In [None]:
# Step 1: Open the video file
video_path = "video/solidWhiteRight.mp4"
cap = cv2.VideoCapture(video_path)

# Check if the video opened successfully.
# Raise instead of calling exit(), which would end the whole interpreter
# session rather than just stopping this cell.
if not cap.isOpened():
    raise IOError("Error: Could not open video.")

# Get video properties for output
fps = cap.get(cv2.CAP_PROP_FPS)  # Frames per second
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Step 2: Create VideoWriter to save the output video
# Forward slash: "\o" is not a valid string escape and a backslash is only a
# path separator on Windows.
output_path = "output/output_energy_based_video.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
    cap.release()
    raise IOError(f"Could not open video writer for {output_path}; "
                  "check that the 'output' directory exists.")

# Define a threshold for OOD detection
energy_threshold = 1.0  # This value may need tuning

# Factors that turn normalized (cx, cy, w, h) boxes into pixel units
box_scale = [frame_width, frame_height, frame_width, frame_height]

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break  # Exit the loop if the video has ended

        # Step 3: Prepare the input frame
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB
        inputs = processor(images=image, return_tensors="pt").to(device)  # Move inputs to GPU

        # Step 4: Perform object detection using DETR
        with torch.no_grad():
            outputs = model(**inputs)

        # Logits and bounding boxes
        logits = outputs.logits
        pred_boxes = outputs.pred_boxes  # Normalized bounding boxes (batch_size x num_queries x 4)

        # Step 5: Calculate energy scores
        # NOTE(review): this is -log(max softmax probability), a monotone
        # transform of MSP, not the usual energy score -logsumexp(logits).
        # Kept as-is because energy_threshold is expressed in these units;
        # confirm which quantity is intended before switching.
        softmax_probs = logits.softmax(dim=-1)  # Calculate softmax probabilities
        energy_scores = -torch.log(softmax_probs.max(dim=-1).values)

        # Get the predicted class IDs
        predicted_classes = torch.argmax(softmax_probs, dim=-1)  # Class with the highest probability

        # Class ID for "no-object": the last logit index (91 for this checkpoint)
        no_object_class_id = logits.shape[-1] - 1

        # Move the per-query results to the CPU once per frame instead of
        # one transfer per query inside the drawing loop
        boxes_np = pred_boxes[0].cpu().numpy()
        energy_values = energy_scores[0].cpu().tolist()
        class_ids = predicted_classes[0].cpu().tolist()

        # Prepare for visualization
        for box, energy_score, pred_class in zip(boxes_np, energy_values, class_ids):
            if pred_class == no_object_class_id:
                continue  # Skip no-object predictions

            # Bounding box coordinates (scaled from normalized to image dimensions)
            x_center, y_center, box_width, box_height = box * box_scale
            x_min = int(x_center - 0.5 * box_width)
            y_min = int(y_center - 0.5 * box_height)

            # Determine if the object is OOD or ID
            is_ood = energy_score > energy_threshold  # OOD if the score exceeds the threshold
            label = "OOD" if is_ood else "ID"
            color = (0, 0, 255) if is_ood else (0, 255, 0)  # Red for OOD, Green for ID

            # Draw the bounding box
            cv2.rectangle(frame, (x_min, y_min), (x_min + int(box_width), y_min + int(box_height)), color, 2)
            # Annotate the object
            cv2.putText(frame, f'{label} (Energy: {energy_score:.2f})',
                        (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Step 6: Write the processed frame to the output video
        out.write(frame)
finally:
    # Release the video capture and writer even if a frame failed, so the
    # output file is finalized and the input handle is freed
    cap.release()
    out.release()
    cv2.destroyAllWindows()
