In [1]:
model_ckpt = "hustvl/yolos-base"

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")
device = "cuda" if torch.cuda.is_available() else "cpu"

No GPU available. Training will run on CPU.


In [4]:
from transformers import YolosImageProcessor, YolosForObjectDetection, YolosFeatureExtractor

model = YolosForObjectDetection.from_pretrained(model_ckpt)
image_processor = YolosImageProcessor.from_pretrained(model_ckpt)
feature_extractor = YolosFeatureExtractor.from_pretrained(model_ckpt)



In [5]:
import os
import xml.etree.ElementTree as ET
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as T
import torch


class RobotDataset(Dataset):
    def __init__(self, images_dir, xml_dir, transform=None):
        self.images_dir = images_dir
        self.xml_dir = xml_dir
        self.transform = transform
        self.image_files = [
            f for f in os.listdir(images_dir) if f.endswith((".jpg", ".png", ".jpeg"))
        ]
        self.lab2id = {"Left_tool":0, "Right_tool":1}

    def parse_xml(self, xml_path):
        tree = ET.parse(xml_path)
        root = tree.getroot()
        boxes = []
        labels = []

        for obj in root.findall("object"):
            label = obj.find("pose").text + "_" + obj.find("name").text
            bbox = obj.find("bndbox")
            xmin = int(bbox.find("xmin").text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(label)

        return boxes, labels

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        # Load image
        image_filename = self.image_files[idx]
        image_path = os.path.join(self.images_dir, image_filename)
        image = Image.open(image_path).convert("RGB")

        # Load and parse XML annotation
        xml_filename = os.path.splitext(image_filename)[0] + ".xml"
        xml_path = os.path.join(self.xml_dir, xml_filename)
        boxes, labels = self.parse_xml(xml_path)
        labels = [self.lab2id[lab] for lab in labels]

        if self.transform:
            image = self.transform(image)


        return image, boxes, labels

In [6]:
from pathlib import Path

images_path = Path("ATLAS_Dione_ObjectDetection/JPEGImages")
xml_path = Path("ATLAS_Dione_ObjectDetection/Annotations")
transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor()
])

In [7]:
# This function avoids error when loading a batch with different sized labels lists
def collate_fn(batch):

    images = [item[0]["pixel_values"] for item in batch]
    boxes = [item[1] for item in batch]
    labels = [item[2] for item in batch]

    return {"pixel_values": images, "labels": {"class_labels": labels, "boxes": boxes}}

In [8]:
# Define the DataLoader with the custom collate_fn
dataset = RobotDataset(images_dir = images_path, xml_dir=xml_path, transform=image_processor) 
# data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
train_loader, test_loader = torch.utils.data.random_split(dataset, [0.8, 0.2])

In [9]:
from transformers import TrainingArguments, Trainer

model_name = model_ckpt.split("/")[-1]
new_model_name = f"{model_name}-finetuned"
num_epochs = 4
batch_size = 8

training_args = TrainingArguments(
    model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=4,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",  # Changed from accuracy since this is object detection
)



In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_loader,
    eval_dataset=test_loader,
    processing_class=feature_extractor,
    data_collator=collate_fn,
    )

In [11]:
train_results = trainer.train()

  0%|          | 0/71896 [00:00<?, ?it/s]

KeyError: (Ellipsis, slice(1, None, None))

In [None]:
trainer.evaluate(test_loader)

In [None]:
trainer.save_model()
test_results = trainer.evaluate(test_loader)
trainer.log_metrics("test", test_results)
trainer.save_metrics("test", test_results)
trainer.save_state()