In [1]:
import albumentations as A
import numpy as np
from datasets import load_dataset
from torch.utils.data import Dataset
from transformers import AutoImageProcessor

# Pull the animals detection dataset and keep a handle on each split
ds = load_dataset("Francesco/animals-ij5d2")
train_ds, val_ds, test_ds = ds["train"], ds["validation"], ds["test"]
print(ds)

# Class names are stored in the dataset schema; derive both lookup directions
categories = train_ds.features["objects"].feature["category"].names
id2label = dict(enumerate(categories))
label2id = {name: class_id for class_id, name in id2label.items()}
print(categories)

# Pretrained checkpoint and the square side length (in pixels) used for resizing
checkpoint = "PekingU/rtdetr_r50vd_coco_o365"
image_size = 480

image_processor = AutoImageProcessor.from_pretrained(
    checkpoint,
    do_resize=True,
    size=dict(width=image_size, height=image_size),
)

# Training-time augmentation pipeline. Each transform carries its own probability
# `p`, so a given sample may receive any subset of them.
# Boxes are supplied in COCO layout with their class ids in the "category" field;
# clip=True / min_area=25 ask albumentations to clip boxes to the image and to
# discard boxes whose area falls below 25 pixels.
# NOTE(review): when RandomResizedCrop fires the image becomes 224x224 before the
# image processor resizes it to `image_size` -- confirm the upscaling is intended.
train_augmentation_and_transform = A.Compose(
    [
        A.Perspective(p=0.1),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.5),
        A.HueSaturationValue(p=0.1),
        A.Rotate(limit=20, p=0.5),
        A.RandomScale(scale_limit=0.2, p=0.5),
        A.RandomResizedCrop(height=224, width=224, scale=(0.8, 1.0), ratio=(0.75, 1.33), p=0.5),
        A.Affine(scale=(0.9, 1.1), translate_percent=0.1, rotate=(-10, 10), shear=(-5, 5), p=0.5),
        A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),
        A.MotionBlur(blur_limit=3, p=0.2),
        A.GridDistortion(p=0.2),
    ],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)

# Validation / test transform: the image itself is left untouched (NoOp); the
# pipeline exists only so that boxes are clipped to the image and boxes with an
# area below 1 pixel are removed.
validation_transform = A.Compose(
    [A.NoOp()],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=1),
)

class AnimalsDataset(Dataset):
    """Torch dataset that augments a detection sample and runs it through an image processor.

    Args:
        dataset: indexable, sized collection of samples. Each sample must expose
            ``image_id``, ``image`` (an object with ``.convert("RGB")``) and
            ``objects`` holding parallel ``bbox`` / ``category`` lists.
        image_processor: callable invoked as
            ``image_processor(images=..., annotations=..., return_tensors="pt")``.
        transform: optional callable invoked as
            ``transform(image=..., bboxes=..., category=...)`` that returns a
            mapping with the keys ``image``, ``bboxes`` and ``category``.
    """

    def __init__(self, dataset, image_processor, transform=None):
        self.dataset = dataset
        self.image_processor = image_processor
        self.transform = transform

    @staticmethod
    def format_image_annotations_as_coco(image_id, categories, boxes):
        """Format one set of image annotations to the COCO format

        Args:
            image_id: identifier of the image; copied verbatim into every annotation
            categories (List[int]): class labels, one per bounding box
            boxes (List[Sequence[float]]): bounding boxes in COCO format
                ([x_min, y_min, width, height] in absolute coordinates)

        Returns:
            dict: {
                "image_id": image id,
                "annotations": list of formatted annotations
            }
        """
        annotations = [
            {
                "image_id": image_id,
                "category_id": category,
                "bbox": list(bbox),
                "iscrowd": 0,
                # COCO boxes store width and height at positions 2 and 3
                "area": bbox[2] * bbox[3],
            }
            for category, bbox in zip(categories, boxes)
        ]

        return {
            "image_id": image_id,
            "annotations": annotations,
        }

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processor output for sample ``idx`` with the batch axis removed."""
        sample = self.dataset[idx]

        image_id = sample["image_id"]
        boxes = sample["objects"]["bbox"]
        categories = sample["objects"]["category"]

        # Force 3-channel RGB before handing the pixels to the transform / processor
        image = np.array(sample["image"].convert("RGB"))

        # Explicit None check: a transform object that happens to be falsy
        # (for instance one that defines __len__ and is empty) must still run,
        # otherwise its box clipping / filtering would be skipped silently.
        if self.transform is not None:
            transformed = self.transform(image=image, bboxes=boxes, category=categories)
            image = transformed["image"]
            boxes = transformed["bboxes"]
            categories = transformed["category"]

        # Format annotations in COCO format for image_processor
        formatted_annotations = self.format_image_annotations_as_coco(image_id, categories, boxes)

        # Apply the image processor transformations: resizing, rescaling, normalization
        result = self.image_processor(
            images=image, annotations=formatted_annotations, return_tensors="pt"
        )

        # The processor returns batched values for a single image; keep element 0 of each
        return {key: value[0] for key, value in result.items()}

# Wrap each raw split so that indexing yields image-processor output.
# Only the training split gets the random augmentations; validation and test
# share the transform that merely sanitises the boxes.
train_ds, val_ds, test_ds = (
    AnimalsDataset(train_ds, image_processor, transform=train_augmentation_and_transform),
    AnimalsDataset(val_ds, image_processor, transform=validation_transform),
    AnimalsDataset(test_ds, image_processor, transform=validation_transform),
)

2024-11-04 10:39:45.278515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-04 10:39:45.295206: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-04 10:39:45.300462: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-04 10:39:45.313165: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:albumentations.check_version:A new version of Al

DatasetDict({
    train: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 700
    })
    validation: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 100
    })
    test: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 200
    })
})
['animals', 'cat', 'chicken', 'cow', 'dog', 'fox', 'goat', 'horse', 'person', 'racoon', 'skunk']


import numpy as np
from PIL import Image, ImageDraw

# Show the class names and both id <-> name lookup tables
print(categories)
print(id2label)
print(label2id)

# Read one raw training sample straight from the DatasetDict: by this point
# `train_ds` is the AnimalsDataset wrapper, whose items carry processor output
# rather than the original "image" / "objects" fields.
sample = ds["train"][2]
image = sample["image"]
annotations = sample["objects"]

# Draw every COCO box (x_min, y_min, width, height) together with its class name
draw = ImageDraw.Draw(image)
for box, class_idx in zip(annotations["bbox"], annotations["category"]):
    x, y, w, h = box
    draw.rectangle((x, y, x + w, y + h), outline="red", width=1)
    draw.text((x, y), id2label[class_idx], fill="white")

image




# Preview the training augmentation on a few raw samples
for i in [15, 16, 17]:
    # Use the raw split: the wrapped `train_ds` no longer exposes "image" / "objects"
    sample = ds["train"][i]
    annotations = sample["objects"]

    # Apply the augmentation (RGB conversion mirrors what AnimalsDataset feeds it)
    output = train_augmentation_and_transform(
        image=np.array(sample["image"].convert("RGB")),
        bboxes=annotations["bbox"],
        category=annotations["category"],
    )

    # Unpack into dedicated names so the global `categories` list of class names
    # is not overwritten by the per-image labels
    image = Image.fromarray(output["image"])
    aug_categories, aug_boxes = output["category"], output["bboxes"]

    # Draw the augmented image
    draw = ImageDraw.Draw(image)
    for category, box in zip(aug_categories, aug_boxes):
        x, y, w, h = box
        draw.rectangle((x, y, x + w, y + h), outline="red", width=1)
        draw.text((x, y), id2label[category], fill="white")
    image.show()

# Preview what the model actually receives: items of the wrapped training dataset
for i in [15, 16, 17]:
    sample = train_ds[i]

    # Min-max rescale the pixel tensor to 0..255 and move channels last for PIL
    pixel_values = sample["pixel_values"]
    print("Image tensor shape:", pixel_values.shape)
    pixels = pixel_values.numpy().transpose(1, 2, 0)
    value_range = pixels.max() - pixels.min()
    if value_range == 0:
        value_range = 1.0  # constant image: avoid dividing by zero
    pixels = (pixels - pixels.min()) / value_range * 255.
    image = Image.fromarray(pixels.astype(np.uint8))

    # Convert boxes from relative [center_x, center_y, width, height] to absolute
    # [x, y, width, height] for visualization. Work on a copy so the tensor held
    # by `sample` is not modified through the shared numpy buffer.
    boxes = sample["labels"]["boxes"].numpy().copy()
    print("Boxes shape:", boxes.shape)
    boxes[:, :2] -= boxes[:, 2:] / 2
    img_w, img_h = image.size
    boxes = boxes * np.array([img_w, img_h, img_w, img_h])[None]

    # Keep the per-image labels in their own name; `categories` is the global
    # list of class names and must stay intact for later cells
    class_ids = sample["labels"]["class_labels"].numpy()
    print("Categories shape:", class_ids.shape)

    # Draw boxes and labels on image
    draw = ImageDraw.Draw(image)
    for box, class_id in zip(boxes, class_ids):
        x, y, w, h = box
        draw.rectangle([x, y, x + w, y + h], outline="red", width=1)
        draw.text((x, y), id2label[int(class_id)], fill="white")
    image.show()

In [2]:
from transformers.image_transforms import center_to_corners_format
import torch
import numpy as np
from dataclasses import dataclass
from torchmetrics.detection.mean_ap import MeanAveragePrecision

def collate_fn(batch):
    """Merge per-sample dicts into a single batch dict.

    The ``pixel_values`` tensors are stacked along a new leading dimension, while
    the ``labels`` entries are kept as a plain list with one element per sample.
    """
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    labels = [item["labels"] for item in batch]
    return {"pixel_values": pixel_values, "labels": labels}

def convert_bbox_yolo_to_pascal(boxes, image_size):
    """
    Turn relative YOLO boxes into absolute Pascal VOC boxes.

    Input boxes are (x_center, y_center, width, height) with values in [0, 1];
    the result is (x_min, y_min, x_max, y_max) expressed in pixels.

    Args:
        boxes (torch.Tensor): Bounding boxes in YOLO format
        image_size (Tuple[int, int]): Image size in format (height, width)

    Returns:
        torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
    """
    # center/size representation -> corner representation (still relative)
    corner_boxes = center_to_corners_format(boxes)

    # x coordinates are scaled by the width, y coordinates by the height
    img_height, img_width = image_size
    scale = torch.tensor([[img_width, img_height, img_width, img_height]])

    return corner_boxes * scale

@dataclass
class ModelOutput:
    """Minimal container with the two fields that MAPEvaluator.collect_predictions
    passes on to the image processor's ``post_process_object_detection``."""
    # classification logits -- presumably (batch, num_queries, num_classes); TODO confirm
    logits: torch.Tensor
    # predicted boxes -- presumably relative (cx, cy, w, h) per query; TODO confirm
    pred_boxes: torch.Tensor


class MAPEvaluator:
    """Callable that turns evaluation output into mean-average-precision metrics.

    The instance is meant to be used as a ``compute_metrics`` callback. It expects
    predictions and labels organised batch by batch (not concatenated) and scores
    them with torchmetrics' ``MeanAveragePrecision``.

    Args:
        image_processor: object providing ``post_process_object_detection``.
        threshold (float): score threshold forwarded to the post-processing step.
        id2label (dict, optional): class id -> class name. When given, per-class
            metrics are keyed by name (``map_<name>``), otherwise by numeric id.
    """

    def __init__(self, image_processor, threshold=0.00, id2label=None):
        self.image_processor = image_processor
        self.threshold = threshold
        self.id2label = id2label

    def collect_image_sizes(self, targets):
        """Collect image sizes across the dataset as list of tensors with shape [batch_size, 2]."""
        image_sizes = []
        for batch in targets:
            batch_image_sizes = torch.tensor(np.array([x["size"] for x in batch]))
            image_sizes.append(batch_image_sizes)
        return image_sizes

    def collect_targets(self, targets, image_sizes):
        """Flatten the batched targets into one ``{"boxes", "labels"}`` dict per image.

        Boxes are converted to absolute (x_min, y_min, x_max, y_max) using the
        matching entry of ``image_sizes``.
        """
        post_processed_targets = []
        for target_batch, image_size_batch in zip(targets, image_sizes):
            for target, size in zip(target_batch, image_size_batch):
                boxes = torch.tensor(target["boxes"])
                boxes = convert_bbox_yolo_to_pascal(boxes, size)
                labels = torch.tensor(target["class_labels"])
                post_processed_targets.append({"boxes": boxes, "labels": labels})
        return post_processed_targets

    def collect_predictions(self, predictions, image_sizes):
        """Post-process every prediction batch into per-image detection dicts."""
        post_processed_predictions = []
        for batch, target_sizes in zip(predictions, image_sizes):
            # Elements 1 and 2 of each batch are used as logits and boxes; element 0
            # is skipped (presumably the loss -- TODO confirm against the Trainer output)
            batch_logits, batch_boxes = batch[1], batch[2]
            output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
            post_processed_output = self.image_processor.post_process_object_detection(
                output, threshold=self.threshold, target_sizes=target_sizes
            )
            post_processed_predictions.extend(post_processed_output)
        return post_processed_predictions

    def _flatten_per_class_metrics(self, metrics):
        """Replace the per-class metric lists with one ``map_*`` / ``mar_100_*`` entry per class."""
        metrics = dict(metrics)  # leave the caller's mapping untouched
        classes = metrics.pop("classes")
        map_per_class = metrics.pop("map_per_class")
        mar_100_per_class = metrics.pop("mar_100_per_class")
        for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
            class_key = class_id.item()
            # Use the mapping handed to *this* evaluator rather than a module-level
            # `id2label`, so the constructor argument is honoured.
            class_name = self.id2label[class_key] if self.id2label is not None else class_key
            metrics[f"map_{class_name}"] = class_map
            metrics[f"mar_100_{class_name}"] = class_mar
        return metrics

    @torch.no_grad()
    def __call__(self, evaluation_results):
        """Compute the metrics for one evaluation run.

        Args:
            evaluation_results: object exposing ``predictions`` and ``label_ids``.

        Returns:
            dict: metric name -> Python float rounded to 4 decimals.
        """
        predictions, targets = evaluation_results.predictions, evaluation_results.label_ids

        image_sizes = self.collect_image_sizes(targets)
        post_processed_targets = self.collect_targets(targets, image_sizes)
        post_processed_predictions = self.collect_predictions(predictions, image_sizes)

        evaluator = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
        evaluator.warn_on_many_detections = False
        evaluator.update(post_processed_predictions, post_processed_targets)

        metrics = self._flatten_per_class_metrics(evaluator.compute())

        return {k: round(v.item(), 4) for k, v in metrics.items()}

# Metric callback handed to the Trainer below; threshold=0.01 is forwarded to the
# image processor's post-processing, and per-class metrics are named via id2label.
eval_compute_metrics_fn = MAPEvaluator(image_processor=image_processor, threshold=0.01, id2label=id2label)

In [3]:
from transformers import TrainingArguments, Trainer, AutoModelForObjectDetection

# Start from the pretrained detector and install this dataset's label mappings;
# ignore_mismatched_sizes=True lets loading proceed when weight shapes differ.
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    anchor_image_size=None,
)

training_args = TrainingArguments(
    output_dir="rtdetr_model",
    # --- schedule and optimisation ---
    num_train_epochs=100,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_steps=300,
    optim="adamw_torch_fused",
    weight_decay=0.10,
    max_grad_norm=0.1,
    # --- batching and throughput: 8 samples x 2 accumulation steps per update ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    dataloader_num_workers=6,
    fp16=True,
    # --- evaluate and checkpoint every epoch, keep the best model by eval_map ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_map",
    greater_is_better=True,
    # --- data plumbing: the evaluator walks predictions batch by batch ---
    eval_do_concat_batches=False,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=eval_compute_metrics_fn,
    tokenizer=image_processor,
)

trainer.train()

Could not load the custom kernel for multi-scale deformable attention: Error building extension 'MultiScaleDeformableAttention': [1/2] /usr/local/cuda-11.5/bin/nvcc --generate-dependencies-with-compile --dependency-output ms_deform_attn_cuda.cuda.o.d -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/eagle/anaconda3/lib/python3.11/site-packages/transformers/kernels/deformable_detr -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/TH -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/THC -isystem /usr/local/cuda-11.5/include -isystem /home/eagle/anaconda3/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ 

Epoch,Training Loss,Validation Loss,Map,Map 50,Map 75,Map Small,Map Medium,Map Large,Mar 1,Mar 10,Mar 100,Mar Small,Mar Medium,Mar Large,Map Animals,Mar 100 Animals,Map Cat,Mar 100 Cat,Map Chicken,Mar 100 Chicken,Map Cow,Mar 100 Cow,Map Dog,Mar 100 Dog,Map Fox,Mar 100 Fox,Map Goat,Mar 100 Goat,Map Horse,Mar 100 Horse,Map Person,Mar 100 Person,Map Racoon,Mar 100 Racoon,Map Skunk,Mar 100 Skunk
1,No log,174.083694,0.0243,0.0291,0.0275,0.0,0.0,0.0432,0.0916,0.1365,0.2133,0.0,0.0,0.2146,-1.0,-1.0,0.0261,0.6143,0.0203,0.1867,0.0,0.0524,0.1856,0.4412,0.0094,0.4889,0.0,0.0,0.0002,0.2125,0.0,0.024,0.0,0.0,0.0018,0.1133
2,No log,146.844086,0.0274,0.0318,0.0297,0.0,0.0,0.0537,0.1268,0.2097,0.2392,0.0,0.0,0.2399,-1.0,-1.0,0.0415,0.6571,0.0343,0.2267,0.0,0.0286,0.1861,0.5588,0.0103,0.5667,0.0,0.0,0.0005,0.2625,0.0,0.032,0.0,0.0,0.0009,0.06
3,No log,109.64447,0.0475,0.0555,0.0513,0.0,0.0,0.0637,0.1447,0.2494,0.2836,0.0,0.0,0.2858,-1.0,-1.0,0.1219,0.5571,0.0351,0.2333,0.0001,0.0905,0.2881,0.5941,0.0184,0.7111,0.0,0.0,0.0005,0.1875,0.0003,0.116,0.0,0.0,0.0101,0.3467
4,No log,88.634216,0.0887,0.1018,0.1,0.0,0.0,0.1029,0.2067,0.3405,0.3821,0.0,0.0,0.3843,-1.0,-1.0,0.5081,0.7,0.0073,0.4133,0.0016,0.0952,0.2865,0.6059,0.0501,0.6889,0.0004,0.0586,0.0018,0.4875,0.0006,0.16,0.0003,0.1182,0.0307,0.4933
5,No log,68.97081,0.1084,0.1234,0.1155,0.0,0.0,0.1249,0.3487,0.5595,0.6115,0.0,0.0,0.6231,-1.0,-1.0,0.3405,0.9286,0.0107,0.64,0.0042,0.4905,0.4704,0.8529,0.0528,0.8222,0.0069,0.2621,0.0115,0.5875,0.0837,0.72,0.0008,0.1182,0.1025,0.6933
6,No log,53.694229,0.1697,0.1954,0.1831,0.0,0.0007,0.1792,0.4864,0.7032,0.7692,0.0,0.1667,0.7836,-1.0,-1.0,0.2775,0.9143,0.0277,0.9267,0.0132,0.7381,0.5225,0.9353,0.2497,0.8778,0.0362,0.5931,0.046,0.725,0.3449,0.768,0.0022,0.3273,0.1773,0.8867
7,No log,40.944603,0.2398,0.2829,0.2597,0.0,0.0,0.2492,0.5515,0.7479,0.7957,0.0,0.0,0.8124,-1.0,-1.0,0.3859,0.9,0.0604,0.8733,0.0408,0.7095,0.5882,0.9118,0.2724,0.8667,0.1232,0.7621,0.0832,0.8,0.6404,0.808,0.0045,0.3727,0.1994,0.9533
8,No log,32.074699,0.3166,0.3788,0.3489,0.0,0.0,0.3373,0.5597,0.7865,0.8348,0.0,0.0,0.8519,-1.0,-1.0,0.5539,0.9143,0.0893,0.9,0.1977,0.7286,0.6546,0.9353,0.4147,0.9333,0.1329,0.7379,0.1392,0.7875,0.7203,0.844,0.0101,0.6,0.2534,0.9667
9,No log,26.869062,0.409,0.4839,0.4436,0.0,0.0,0.434,0.6051,0.8357,0.8689,0.0,0.0,0.8864,-1.0,-1.0,0.6102,0.9143,0.3643,0.88,0.3883,0.7429,0.6746,0.9235,0.5027,0.9222,0.1967,0.869,0.1452,0.7875,0.7158,0.86,0.0432,0.8364,0.449,0.9533
10,No log,24.118023,0.4777,0.5829,0.527,0.0,0.0168,0.5001,0.5957,0.8152,0.8546,0.0,0.0667,0.8707,-1.0,-1.0,0.809,0.9286,0.3023,0.8867,0.4231,0.7333,0.6889,0.9294,0.6007,0.8667,0.2063,0.8552,0.174,0.7625,0.7187,0.828,0.1505,0.8091,0.7033,0.9467


There were missing keys in the checkpoint model loaded: ['class_embed.0.weight', 'class_embed.0.bias', 'class_embed.1.weight', 'class_embed.1.bias', 'class_embed.2.weight', 'class_embed.2.bias', 'class_embed.3.weight', 'class_embed.3.bias', 'class_embed.4.weight', 'class_embed.4.bias', 'class_embed.5.weight', 'class_embed.5.bias', 'bbox_embed.0.layers.0.weight', 'bbox_embed.0.layers.0.bias', 'bbox_embed.0.layers.1.weight', 'bbox_embed.0.layers.1.bias', 'bbox_embed.0.layers.2.weight', 'bbox_embed.0.layers.2.bias', 'bbox_embed.1.layers.0.weight', 'bbox_embed.1.layers.0.bias', 'bbox_embed.1.layers.1.weight', 'bbox_embed.1.layers.1.bias', 'bbox_embed.1.layers.2.weight', 'bbox_embed.1.layers.2.bias', 'bbox_embed.2.layers.0.weight', 'bbox_embed.2.layers.0.bias', 'bbox_embed.2.layers.1.weight', 'bbox_embed.2.layers.1.bias', 'bbox_embed.2.layers.2.weight', 'bbox_embed.2.layers.2.bias', 'bbox_embed.3.layers.0.weight', 'bbox_embed.3.layers.0.bias', 'bbox_embed.3.layers.1.weight', 'bbox_embed.3.l

TrainOutput(global_step=4400, training_loss=22.407134898792613, metrics={'train_runtime': 4330.7907, 'train_samples_per_second': 16.163, 'train_steps_per_second': 1.016, 'total_flos': 1.240948138752e+19, 'train_loss': 22.407134898792613, 'epoch': 100.0})

In [None]:
import torch
import requests
from PIL import Image, ImageDraw
from pprint import pprint

# Score the test split with the same metric callback used during training
metrics = trainer.evaluate(eval_dataset=test_ds, metric_key_prefix="eval")
pprint(metrics)

# Run on whatever device the model already lives on instead of assuming CUDA
device = model.device
model.eval()

# Force RGB so grayscale / RGBA files are handled as well
image = Image.open("test.jpg").convert("RGB")

inputs = image_processor(images=[image], return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# PIL reports (width, height); reverse it to (height, width) for post-processing
target_sizes = torch.tensor([image.size[::-1]])

result = image_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]

# Report and draw each detection in a single pass, on a copy of the input image
image_with_boxes = image.copy()
draw = ImageDraw.Draw(image_with_boxes)

for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    class_name = model.config.id2label[label.item()]
    print(
        f"Detected {class_name} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )
    x, y, x2, y2 = box
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    draw.text((x, y), class_name, fill="white")

image_with_boxes

In [12]:
# Persist the trained model and the image-processor settings.
# NOTE(review): they go to two different directories ("decent_model" and
# "processor"), so reloading needs both paths -- confirm this is intended.
model.save_pretrained("decent_model")
image_processor.save_pretrained("processor")

['processor/preprocessor_config.json']

In [2]:
import cv2
import torch
from transformers import AutoModelForObjectDetection, AutoImageProcessor
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(categories)

# Minimum confidence a detection needs in order to be drawn
SCORE_THRESHOLD = 0.3

# Load the trained model and the image processor
checkpoint_path = "checkpoint-8750"
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint_path,
    id2label=id2label,
    label2id=label2id,
    anchor_image_size=None,
    ignore_mismatched_sizes=True
)
processor = AutoImageProcessor.from_pretrained(checkpoint_path)
model.to(device)
model.eval()

# Open the camera (0 is the default camera, change if you have multiple cameras)
cap = cv2.VideoCapture(0)

# Set the camera resolution (optional)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 640)

# try/finally guarantees the camera and the window are released even when the
# cell is interrupted from the notebook (KeyboardInterrupt) or inference fails
try:
    while True:
        ret, frame = cap.read()  # Capture frame-by-frame

        if not ret:
            print("Failed to grab frame")
            break

        # Convert the frame to RGB (OpenCV uses BGR by default)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Preprocess the image
        inputs = processor(images=rgb_frame, return_tensors="pt").to(device)

        # Run inference on the model
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the outputs to get boxes, labels, and scores.
        # The score cut-off has to be passed here: post-processing applies its own
        # threshold (0.5 unless overridden), so filtering afterwards with a lower
        # value would never see detections scoring between 0.3 and 0.5.
        target_sizes = torch.tensor([rgb_frame.shape[:2]]).to(device)  # (height, width) of the frame
        results = processor.post_process_object_detection(
            outputs, threshold=SCORE_THRESHOLD, target_sizes=target_sizes
        )[0]

        # Extract the boxes, labels, and scores
        boxes = results['boxes'].cpu().numpy()  # Bounding boxes
        labels = results['labels'].cpu().numpy()  # Predicted class labels
        scores = results['scores'].cpu().numpy()  # Confidence scores

        # Draw the bounding boxes on the (BGR) frame
        for box, label, score in zip(boxes, labels, scores):
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)  # Draw rectangle

            # Display label and score; keep the text inside the frame for boxes at the top edge
            label_text = f"{model.config.id2label[int(label)]}: {score:.2f}"
            cv2.putText(frame, label_text, (x1, max(y1 - 10, 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        # Display the frame with bounding boxes
        cv2.imshow('Camera Feed - Object Detection', frame)

        # Break the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window
    cap.release()
    cv2.destroyAllWindows()

['animals', 'cat', 'chicken', 'cow', 'dog', 'fox', 'goat', 'horse', 'person', 'racoon', 'skunk']


Could not load the custom kernel for multi-scale deformable attention: Error building extension 'MultiScaleDeformableAttention': [1/2] /usr/local/cuda-11.5/bin/nvcc --generate-dependencies-with-compile --dependency-output ms_deform_attn_cuda.cuda.o.d -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/eagle/anaconda3/lib/python3.11/site-packages/transformers/kernels/deformable_detr -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/TH -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/THC -isystem /usr/local/cuda-11.5/include -isystem /home/eagle/anaconda3/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ 

KeyboardInterrupt: 

In [2]:
import cv2
import torch
import time
from transformers import AutoModelForObjectDetection, AutoImageProcessor
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Minimum confidence a detection needs in order to be drawn
SCORE_THRESHOLD = 0.3

# Load the trained model and the image processor
checkpoint_path = "checkpoint-8750"
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint_path,
    id2label=id2label,
    label2id=label2id,
    anchor_image_size=None,
    ignore_mismatched_sizes=True
)
processor = AutoImageProcessor.from_pretrained(checkpoint_path)
model.to(device)
model.eval()

# Open the camera (0 is the default camera, change if you have multiple cameras)
cap = cv2.VideoCapture(0)

# Set the camera resolution (optional)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 640)

# try/finally guarantees the camera and the window are released even when the
# cell is interrupted from the notebook (KeyboardInterrupt) or inference fails
try:
    while True:
        # perf_counter is a monotonic, high-resolution clock suited to intervals
        start_time = time.perf_counter()

        ret, frame = cap.read()  # Capture frame-by-frame
        if not ret:
            print("Failed to grab frame")
            break

        # Convert the frame to RGB (OpenCV uses BGR by default)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Preprocess the image
        inputs = processor(images=rgb_frame, return_tensors="pt").to(device)

        # Run inference on the model
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the outputs to get boxes, labels, and scores.
        # The score cut-off has to be passed here: post-processing applies its own
        # threshold (0.5 unless overridden), so filtering afterwards with a lower
        # value would never see detections scoring between 0.3 and 0.5.
        target_sizes = torch.tensor([rgb_frame.shape[:2]]).to(device)  # (height, width) of the frame
        results = processor.post_process_object_detection(
            outputs, threshold=SCORE_THRESHOLD, target_sizes=target_sizes
        )[0]

        # Extract the boxes, labels, and scores
        boxes = results['boxes'].cpu().numpy()  # Bounding boxes
        labels = results['labels'].cpu().numpy()  # Predicted class labels
        scores = results['scores'].cpu().numpy()  # Confidence scores

        # Draw the bounding boxes on the (BGR) frame
        for box, label, score in zip(boxes, labels, scores):
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)  # Draw rectangle

            # Display label and score; keep the text inside the frame for boxes at the top edge
            label_text = f"{model.config.id2label[int(label)]}: {score:.2f}"
            cv2.putText(frame, label_text, (x1, max(y1 - 10, 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        # Calculate FPS for capture + inference + drawing (display time is not included)
        elapsed = time.perf_counter() - start_time
        fps = 1 / elapsed if elapsed > 0 else float("inf")
        cv2.putText(frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Display the frame with bounding boxes
        cv2.imshow('Camera Feed - Object Detection', frame)

        # Break the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window
    cap.release()
    cv2.destroyAllWindows()


Could not load the custom kernel for multi-scale deformable attention: Error building extension 'MultiScaleDeformableAttention': [1/2] /usr/local/cuda-11.5/bin/nvcc --generate-dependencies-with-compile --dependency-output ms_deform_attn_cuda.cuda.o.d -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/eagle/anaconda3/lib/python3.11/site-packages/transformers/kernels/deformable_detr -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/TH -isystem /home/eagle/anaconda3/lib/python3.11/site-packages/torch/include/THC -isystem /usr/local/cuda-11.5/include -isystem /home/eagle/anaconda3/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ 

In [5]:
# Dump the full module tree of the loaded detector (produces very long output)
print(model)

RTDetrForObjectDetection(
  (model): RTDetrModel(
    (backbone): RTDetrConvEncoder(
      (model): RTDetrResNetBackbone(
        (embedder): RTDetrResNetEmbeddings(
          (embedder): Sequential(
            (0): RTDetrResNetConvLayer(
              (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
              (normalization): RTDetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (1): RTDetrResNetConvLayer(
              (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (normalization): RTDetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (2): RTDetrResNetConvLayer(
              (convolution): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (normalization): RTDetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
          )
          (pooler): MaxPool2d(