### Object Detection Using DETR (Transformer-Based)

Import dependencies

In [None]:
import os
import torchvision
from transformers import DetrImageProcessor
import os
import supervision as sv
import random
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from transformers import DetrForObjectDetection
import torch

Loading image data with a custom Dataset (COCO format)

In [None]:
# Root folder of the COCO-style dataset. Replace with your dataset path.
dataset = r'custom detr\dataset_path'

# Each split folder is expected to contain its annotations under this file name.
ANNOTATION_FILE_NAME = "annotation_file.json"

# One sub-folder per split below the dataset root.
TRAIN_DIRECTORY, VAL_DIRECTORY, TEST_DIRECTORY = (
    os.path.join(dataset, split_folder) for split_folder in ("train", "valid", "test")
)

# Pretrained DETR preprocessing pipeline used by the datasets and the collate function.
image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")


class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset whose samples are preprocessed for DETR.

    Images and annotations are read by ``torchvision.datasets.CocoDetection``;
    every sample is then run through ``image_processor`` so that
    ``__getitem__`` returns the processor's output instead of the raw image
    and annotation list.
    """

    def __init__(
        self,
        image_directory_path: str,
        image_processor,
        train: bool = True
    ):
        """Open the split stored in ``image_directory_path``.

        Args:
            image_directory_path: Folder holding the images together with the
                annotation file named ``ANNOTATION_FILE_NAME``.
            image_processor: Callable accepting ``images``, ``annotations`` and
                ``return_tensors`` keywords (a ``DetrImageProcessor`` in this
                notebook).
            train: Accepted so existing call sites keep working; it is not
                used by this class.
        """
        annotation_file_path = os.path.join(image_directory_path, ANNOTATION_FILE_NAME)
        super().__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``."""
        image, coco_annotations = super().__getitem__(idx)
        # The processor is given the image id together with the raw COCO annotations.
        wrapped = {'image_id': self.ids[idx], 'annotations': coco_annotations}
        encoding = self.image_processor(images=image, annotations=wrapped, return_tensors="pt")
        # The processor output is treated as a batch of one (see ``labels[0]``
        # below). Remove only that leading axis: a bare ``squeeze()`` would
        # also collapse any other axis that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]

        return pixel_values, target


# Build one dataset per split; all three share the same image processor.
TRAIN_DATASET = CocoDetection(TRAIN_DIRECTORY, image_processor, train=True)
VAL_DATASET = CocoDetection(VAL_DIRECTORY, image_processor, train=False)
TEST_DATASET = CocoDetection(TEST_DIRECTORY, image_processor, train=False)

# Report the size of every split as a quick sanity check.
print(f"Number of training examples: {len(TRAIN_DATASET)}")
print(f"Number of validation examples: {len(VAL_DATASET)}")
print(f"Number of test examples: {len(TEST_DATASET)}")

Build the class-ID-to-name mapping (its length is the number of classes)

In [None]:
# Category table of the training split; its keys are the category ids.
categories = TRAIN_DATASET.coco.cats
# Keep only the display name of each category.
id2label = {
    category_id: category['name']
    for category_id, category in categories.items()
}

Visualize an image

In [None]:
# Get random image ID from COCO dataset
image_ids = TRAIN_DATASET.coco.getImgIds()
image_id = random.choice(image_ids)
print(f'Image #{image_id}')

# Load image info and annotations
image_info = TRAIN_DATASET.coco.loadImgs(image_id)[0]
annotations = TRAIN_DATASET.coco.imgToAnns[image_id]

# Load image using OpenCV
image_path = os.path.join(TRAIN_DATASET.root, image_info['file_name'])
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, which would otherwise surface later as an
    # AttributeError on image.copy().
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Convert COCO bbox ([x, y, width, height]) to xyxy and collect class IDs
xyxy = [
    [x, y, x + w, y + h]
    for x, y, w, h in (ann['bbox'] for ann in annotations)
]
class_ids = [ann['category_id'] for ann in annotations]

# Build class ID to label mapping
categories = TRAIN_DATASET.coco.cats
id2label = {k: v['name'] for k, v in categories.items()}
labels = [id2label[class_id] for class_id in class_ids]

# Create Detections object with labels.
# reshape(-1, 4) keeps the array two-dimensional even when the image has no
# annotations; a plain np.array([]) has shape (0,) and is rejected.
detections = sv.Detections(
    xyxy=np.array(xyxy, dtype=float).reshape(-1, 4),
    class_id=np.array(class_ids, dtype=int),
    data={"labels": labels}
)

# Draw the bounding boxes.
# NOTE(review): the label strings are only attached to `detections.data` and
# are not passed to the annotator; whether any text is rendered depends on the
# installed supervision version - confirm.
box_annotator = sv.BoxAnnotator()
annotated_image = box_annotator.annotate(scene=image.copy(), detections=detections)

# Convert BGR to RGB for display with matplotlib
image_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)

# Show image
plt.figure(figsize=(8, 8))
plt.imshow(image_rgb)
plt.axis('off')
plt.title(f"Image #{image_id}")
plt.show()

Wrap the custom datasets in DataLoaders

In [None]:
def collate_fn(batch):
    """Merge dataset samples into a single batch dictionary.

    Args:
        batch: Sequence of samples where element 0 is the image tensor and
            element 1 is the matching target.

    Returns:
        A dict with ``'pixel_values'`` and ``'pixel_mask'`` taken from
        ``image_processor.pad`` and ``'labels'`` as the list of per-sample
        targets (left as a list, not stacked).
    """
    # Pad the images so they can be returned as one batch.
    padded = image_processor.pad(
        [sample[0] for sample in batch],
        return_tensors="pt",
    )
    return {
        'pixel_values': padded['pixel_values'],
        'pixel_mask': padded['pixel_mask'],
        'labels': [sample[1] for sample in batch],
    }

# Only the training loader shuffles; validation and test keep the dataset order.
TRAIN_DATALOADER = DataLoader(
    TRAIN_DATASET,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)
VAL_DATALOADER = DataLoader(VAL_DATASET, batch_size=4, collate_fn=collate_fn)
TEST_DATALOADER = DataLoader(TEST_DATASET, batch_size=4, collate_fn=collate_fn)

Model: DETR directly predicts (in parallel) the final set of detections by combining a common CNN with a Transformer architecture

In [None]:
class Detr(pl.LightningModule):
    """Lightning wrapper used to fine-tune ``DetrForObjectDetection``.

    The number of labels comes from the module-level ``id2label`` mapping, and
    the train/validation loaders are the module-level ``TRAIN_DATALOADER`` and
    ``VAL_DATALOADER``.
    """

    def __init__(self, lr, lr_backbone, weight_decay):
        """Load the pretrained checkpoint and store the optimizer settings.

        Args:
            lr: Learning rate for parameters whose name does not contain
                ``"backbone"``.
            lr_backbone: Learning rate for parameters whose name contains
                ``"backbone"``.
            weight_decay: Weight decay passed to ``AdamW``.
        """
        super().__init__()
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay

        # ignore_mismatched_sizes lets the checkpoint load when the head size
        # implied by num_labels differs from the one stored in the checkpoint.
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path="facebook/detr-resnet-50",
            num_labels=len(id2label),
            ignore_mismatched_sizes=True,
        )

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped model without labels and return its outputs."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Compute the loss for one batch.

        Returns:
            ``(loss, loss_dict)`` exactly as exposed by the model outputs.
        """
        # Every tensor inside the per-image targets is moved to this module's device.
        targets = [
            {key: tensor.to(self.device) for key, tensor in target.items()}
            for target in batch["labels"]
        ]

        outputs = self.model(
            pixel_values=batch["pixel_values"],
            pixel_mask=batch["pixel_mask"],
            labels=targets,
        )

        return outputs.loss, outputs.loss_dict

    def training_step(self, batch, batch_idx):
        """Log the total loss plus every loss component, then return the loss."""
        loss, loss_components = self.common_step(batch, batch_idx)
        self.log("training_loss", loss)
        for component_name, component_value in loss_components.items():
            self.log("train_" + component_name, component_value.item())

        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss plus every loss component, then return the loss."""
        loss, loss_components = self.common_step(batch, batch_idx)
        self.log("validation/loss", loss)
        for component_name, component_value in loss_components.items():
            self.log("validation_" + component_name, component_value.item())

        return loss

    def configure_optimizers(self):
        """Build an ``AdamW`` optimizer with a separate learning rate for the backbone."""
        head_params = []
        backbone_params = []
        for param_name, param in self.named_parameters():
            # Parameters that do not require gradients are left out of the optimizer.
            if not param.requires_grad:
                continue
            if "backbone" in param_name:
                backbone_params.append(param)
            else:
                head_params.append(param)

        param_groups = [
            {"params": head_params},
            {"params": backbone_params, "lr": self.lr_backbone},
        ]
        return torch.optim.AdamW(param_groups, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the module-level training dataloader."""
        return TRAIN_DATALOADER

    def val_dataloader(self):
        """Return the module-level validation dataloader."""
        return VAL_DATALOADER

Initializes a DETR model and runs a training batch through it to get object detection predictions.

In [None]:
model = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)

# Smoke test: push one training batch through the freshly built model.
batch = next(iter(TRAIN_DATALOADER))
# No gradients are needed for this check. Without no_grad, the module-level
# `outputs` would keep the autograd graph of this batch alive.
with torch.no_grad():
    outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])

In [None]:
# from transformers import DetrForObjectDetection
# import torch

# # Instantiate model correctly (NOT using meta device)
# model = DetrForObjectDetection.from_pretrained(
#     "facebook/detr-resnet-50",  # or your fine-tuned checkpoint
# )

# # Put model on correct device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# model.train()  # if you're training

# # Get batch
# batch = next(iter(TRAIN_DATALOADER))
# pixel_values = batch['pixel_values'].to(device)
# pixel_mask = batch['pixel_mask'].to(device)

# # Forward pass
# outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)


In [None]:
# Trade some float32 matmul precision for speed. Per the PyTorch docs, 'medium'
# lets float32 matmuls use lower-precision internals (e.g. Tensor Cores) when
# fast kernels are available.
torch.set_float32_matmul_precision('medium')  # Enable Tensor Cores for speed

Set up a PyTorch Lightning Trainer to train the model

In [None]:
from pytorch_lightning import Trainer

# Upper bound on the number of training epochs.
MAX_EPOCHS = 100

# Gradients are clipped at 0.1 and accumulated over 8 batches per optimizer
# step; metrics are logged every 5 steps.
trainer = Trainer(
    max_epochs=MAX_EPOCHS,
    gradient_clip_val=0.1,
    accumulate_grad_batches=8,
    log_every_n_steps=5,
)

# The module provides its own train/val dataloaders, so only the model is passed.
trainer.fit(model)

Save trained model

In [None]:
# Directory that receives the fine-tuned model.
MODEL_PATH = 'custom-model'
# Save the wrapped Hugging Face model (`model.model`), not the Lightning
# module, so it can be reloaded later with `from_pretrained(MODEL_PATH)`.
model.model.save_pretrained(MODEL_PATH)

In [None]:
from transformers import DetrImageProcessor

# Load base processor (from pretrained or your original)
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# Store the processor in the same directory as the model weights.
# NOTE: this literal must stay identical to MODEL_PATH, because the inference
# cell loads the processor with `DetrImageProcessor.from_pretrained(MODEL_PATH)`.
processor.save_pretrained("custom-model")

Load trained model

In [None]:
# Reload the fine-tuned model from disk.
import torch
MODEL_PATH = 'custom-model'
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Rebinds `model`: from here on it is the plain DetrForObjectDetection
# instance loaded from MODEL_PATH, no longer the Lightning wrapper.
model = DetrForObjectDetection.from_pretrained(MODEL_PATH)
model.to(DEVICE)

Run the trained model on every image in the test directory, print the detections, and save the annotated images

In [None]:
import os
import torch
from torchvision.ops import nms
from transformers import DetrForObjectDetection, DetrImageProcessor
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# === Configuration ===
MODEL_PATH = 'custom-model'
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
IMAGE_DIR = r'dataset_path\test'  # Replace with your test images directory
OUTPUT_DIR = os.path.join(IMAGE_DIR, "output")
SCORE_THRESHOLD = 0.5  # minimum confidence passed to post-processing
NMS_THRESHOLD = 0.3    # IoU threshold passed to nms()

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Load COCO-style label map from TEST_DATASET ===
categories = TEST_DATASET.coco.cats
id2label = {k: v['name'] for k, v in categories.items()}

# One colour per class. Built once, since it does not depend on the image.
# max(..., 1) keeps the colormap valid even if the label map is empty.
num_labels = len(id2label)
colors = plt.colormaps['tab20'].resampled(max(num_labels, 1))

# === Load model and processor ===
processor = DetrImageProcessor.from_pretrained(MODEL_PATH)
model = DetrForObjectDetection.from_pretrained(MODEL_PATH)
model.to(DEVICE)
model.eval()

# === Get all image files in the directory ===
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')
image_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(image_extensions)]

if not image_files:
    print(f"No images found in {IMAGE_DIR}")

# The loop body simply never runs when no image was found.
for img_name in image_files:
    img_path = os.path.join(IMAGE_DIR, img_name)
    print(f"\nProcessing: {img_name}")

    # === Load and preprocess image ===
    # The context manager closes the file handle; convert() returns an
    # independent in-memory copy that stays usable afterwards.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(DEVICE)

    # === Inference ===
    with torch.no_grad():
        outputs = model(**inputs)

    # === Post-processing ===
    # PIL reports (width, height); target_sizes is given as (height, width).
    results = processor.post_process_object_detection(
        outputs, target_sizes=[image.size[::-1]], threshold=SCORE_THRESHOLD
    )
    result = results[0]

    if len(result["scores"]) == 0:
        print("No objects detected.")
        continue

    # === Extract predictions ===
    boxes = result["boxes"]
    scores = result["scores"]
    labels = result["labels"]

    # === Apply NMS ===
    # NOTE: nms() is class-agnostic, so overlapping boxes of different classes
    # also suppress each other.
    keep_indices = nms(boxes, scores, NMS_THRESHOLD)
    boxes = boxes[keep_indices]
    scores = scores[keep_indices]
    labels = labels[keep_indices]

    # Convert the kept predictions to plain Python values once, so printing
    # and drawing share them (tolist()/item() also copy off the GPU if needed).
    detections = [
        (label.item(), score.item(), box.tolist())
        for score, label, box in zip(scores, labels, boxes)
    ]

    # === Print detections ===
    for class_id, confidence, box_xyxy in detections:
        label_name = id2label.get(class_id, f"Label_{class_id}")
        print(f"Label: {label_name}, Score: {confidence:.3f}, Box: {box_xyxy}")

    # === Visualization ===
    # Size the figure so that one figure pixel corresponds to one image pixel.
    img_width, img_height = image.size
    dpi = 100  # Choose a DPI value
    figsize = (img_width / dpi, img_height / dpi)

    fig, ax = plt.subplots(1, figsize=figsize, dpi=dpi)
    ax.imshow(image)
    ax.set_axis_off()  # Remove axes completely

    # Draw bounding boxes and labels
    for class_id, confidence, (xmin, ymin, xmax, ymax) in detections:
        width, height = xmax - xmin, ymax - ymin
        label_name = id2label.get(class_id, f"Label_{class_id}")
        color = colors(class_id)

        rect = patches.Rectangle((xmin, ymin), width, height, linewidth=2,
                                 edgecolor=color, facecolor='none')
        ax.add_patch(rect)

        ax.text(xmin, ymin - 10, f'{label_name}: {confidence:.2f}',
                fontsize=12, color='white',
                bbox=dict(facecolor=color, alpha=0.7, pad=2))

    # === Save output image (no whitespace) ===
    output_img_path = os.path.join(OUTPUT_DIR, f"{os.path.splitext(img_name)[0]}_pred.jpg")
    fig.savefig(output_img_path, dpi=dpi, bbox_inches='tight', pad_inches=0)
    # Close the figure so figures do not accumulate across images.
    plt.close(fig)
    print(f"Saved to: {output_img_path}")


Raw Visualization

In [None]:
# import os
# import torch
# from torchvision.ops import nms
# from transformers import DetrForObjectDetection, DetrImageProcessor
# from PIL import Image
# import matplotlib.pyplot as plt
# import matplotlib.patches as patches

# # === Configuration ===
# MODEL_PATH = 'custom-model'
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# IMAGE_DIR = r'dataset_path\test'  # Replace with your test images directory
# SCORE_THRESHOLD = 0.5
# NMS_THRESHOLD = 0.3

# # === Load COCO-style label map from TEST_DATASET ===
# categories = TEST_DATASET.coco.cats
# id2label = {k: v['name'] for k, v in categories.items()}

# # === Load model and processor ===
# processor = DetrImageProcessor.from_pretrained(MODEL_PATH)
# model = DetrForObjectDetection.from_pretrained(MODEL_PATH)
# model.to(DEVICE)
# model.eval()

# # === Get all image files in the directory ===
# image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')
# image_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(image_extensions)]

# if not image_files:
#     print(f"No images found in {IMAGE_DIR}")
# else:
#     for img_name in image_files:
#         img_path = os.path.join(IMAGE_DIR, img_name)
#         print(f"\nProcessing: {img_name}")

#         # === Load and preprocess image ===
#         image = Image.open(img_path).convert("RGB")
#         inputs = processor(images=image, return_tensors="pt").to(DEVICE)

#         # === Inference ===
#         with torch.no_grad():
#             outputs = model(**inputs)

#         # === Post-processing ===
#         results = processor.post_process_object_detection(outputs, target_sizes=[image.size[::-1]], threshold=SCORE_THRESHOLD)
#         result = results[0]

#         if len(result["scores"]) == 0:
#             print("No objects detected.")
#             continue

#         # === Extract predictions ===
#         boxes = result["boxes"]
#         scores = result["scores"]
#         labels = result["labels"]

#         # === Apply NMS ===
#         keep_indices = nms(boxes, scores, NMS_THRESHOLD)
#         boxes = boxes[keep_indices]
#         scores = scores[keep_indices]
#         labels = labels[keep_indices]

#         # === Print detections ===
#         num_labels = len(id2label)
#         colors = plt.colormaps['tab20'].resampled(num_labels)

#         for score, label, box in zip(scores, labels, boxes):
#             label_name = id2label.get(label.item(), f"Label_{label.item()}")
#             print(f"Label: {label_name}, Score: {score.item():.3f}, Box: {box.tolist()}")

#         # === Visualization ===
#         fig, ax = plt.subplots(1, figsize=(12, 8))
#         ax.imshow(image)

#         for score, label, box in zip(scores, labels, boxes):
#             xmin = box[0].cpu().item()
#             ymin = box[1].cpu().item()
#             xmax = box[2].cpu().item()
#             ymax = box[3].cpu().item()

#             width, height = xmax - xmin, ymax - ymin
#             label_name = id2label.get(label.item(), f"Label_{label.item()}")
#             color = colors(label.item())

#             rect = patches.Rectangle((xmin, ymin), width, height, linewidth=2,
#                                      edgecolor=color, facecolor='none')
#             ax.add_patch(rect)

#             ax.text(xmin, ymin - 10, f'{label_name}: {score:.2f}',
#                     fontsize=12, color='white', bbox=dict(facecolor=color, alpha=0.7, pad=2))

#         # plt.axis('off')
#         # plt.tight_layout()
#         # plt.show()
