# Finetuning DINO

## Cloning the DINO repository and installing dependencies

In [None]:
# Fetch the DINO sources into the Kaggle working directory.
!git clone https://github.com/IDEA-Research/DINO.git /kaggle/working/DINO
# Switch into the clone so the relative paths used below resolve against it.
%cd /kaggle/working/DINO
# Install the Python dependencies listed by the repository.
!pip install -r requirements.txt
# Editable install of the checkout itself.
# NOTE(review): this requires a setup.py/pyproject.toml at the repository root — confirm one exists.
!pip install -e .


## Imports and Dataset Definition

In [None]:
import os
import glob
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from PIL import Image

class YoloTrafficDataset(Dataset):
    """Object-detection dataset backed by a YOLO-format directory.

    Every ``root/images/<name>.jpg`` is paired with ``root/labels/<name>.txt``.
    Each item is ``(image, target)`` where ``image`` is the output of
    ``transforms`` and ``target`` is a dict with:

    * ``"boxes"``  -- float32 tensor of shape ``(N, 4)`` holding absolute
      ``x1, y1, x2, y2`` corners, expressed in the pixel space of the
      *transformed* image when that image is a tensor;
    * ``"labels"`` -- int64 tensor of shape ``(N,)`` with the class ids.

    NOTE(review): DETR-style criteria usually expect normalized
    ``cx, cy, w, h`` boxes -- confirm what the model being trained consumes.
    """

    def __init__(self, root, img_size=800, transforms=None):
        """
        Expects:
          root/
            images/*.jpg
            labels/*.txt  # YOLO format: class x_center y_center width height (normalized)

        Args:
            root: Dataset directory laid out as shown above.
            img_size: Side length used by the default resize transform.
            transforms: Optional callable applied to the PIL image.  When
                omitted, the image is resized to ``img_size x img_size``,
                converted to a tensor and normalized with ImageNet statistics.
        """
        self.img_paths = sorted(glob.glob(os.path.join(root, "images", "*.jpg")))
        # Build label paths from the file stem instead of str.replace(), which
        # would also rewrite "images"/".jpg" occurring elsewhere in the path.
        label_dir = os.path.join(root, "labels")
        self.label_paths = [
            os.path.join(
                label_dir, os.path.splitext(os.path.basename(p))[0] + ".txt"
            )
            for p in self.img_paths
        ]
        self.img_size = img_size
        if transforms is None:
            transforms = T.Compose([
                T.Resize((img_size, img_size)),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            ])
        self.transforms = transforms

    def __len__(self):
        """Return the number of images found under ``root/images``."""
        return len(self.img_paths)

    @staticmethod
    def _read_yolo_labels(label_path, width, height):
        """Parse one YOLO label file into ``(boxes, labels)`` tensors.

        Normalized ``x_center y_center width height`` values are converted to
        absolute ``x1, y1, x2, y2`` corners for an image of ``width x height``.
        A missing file or blank lines yield no boxes; the box tensor always
        has shape ``(N, 4)``, including ``N == 0``.
        """
        boxes, labels = [], []
        # An image without objects may legitimately have no label file.
        if os.path.isfile(label_path):
            with open(label_path, "r", encoding="utf-8") as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank / trailing lines
                    cls, xc, yc, bw, bh = map(float, fields)
                    boxes.append([
                        (xc - bw / 2) * width,
                        (yc - bh / 2) * height,
                        (xc + bw / 2) * width,
                        (yc + bh / 2) * height,
                    ])
                    labels.append(int(cls))
        return (
            # reshape keeps the (N, 4) contract even when there are no boxes.
            torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            torch.tensor(labels, dtype=torch.int64),
        )

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(transformed_image, target)``."""
        # Close the file handle promptly; convert() returns a loaded copy.
        with Image.open(self.img_paths[idx]) as raw:
            img = raw.convert("RGB")
        w, h = img.size
        img = self.transforms(img)
        # Scale the boxes to the image the model will actually see.  This
        # assumes the transforms only resize (no crop/flip); if the result is
        # not a tensor, fall back to the original image size.
        if isinstance(img, torch.Tensor):
            out_h, out_w = img.shape[-2:]
        else:
            out_w, out_h = w, h
        boxes, labels = self._read_yolo_labels(
            self.label_paths[idx], out_w, out_h
        )
        target = {"boxes": boxes, "labels": labels}
        return img, target


In [None]:
def collate_fn(batch):
    """Collate ``(image, target)`` samples into a batch.

    Images are stacked along a new leading dimension, so they must all share
    the same shape; targets are returned as a plain list, one per image.
    """
    columns = list(zip(*batch))
    images, annotations = columns
    stacked = torch.stack(images)
    return stacked, list(annotations)


## Training

In [None]:
from dino.models import build_model
from dino.config import get_config

def train_dino(
    data_root,
    config_file,
    pretrained_checkpoint,
    output_dir,
    img_size=800,
    batch_size=4,
    lr=2e-5,
    weight_decay=1e-4,
    lr_step=8,
    lr_gamma=0.1,
    epochs=15,
    val_root=None,
):
    """Fine-tune a DINO detector on a YOLO-format dataset.

    Args:
        data_root: Directory containing ``images/`` and ``labels/`` used for
            training.
        config_file: Model config file merged into the default config.
        pretrained_checkpoint: Checkpoint to initialise from; either a dict
            with a ``"model"`` entry or a bare state dict.
        output_dir: Directory that receives ``checkpoint_<epoch>.pth`` files
            (created if missing).
        img_size: Side length passed to ``YoloTrafficDataset``.
        batch_size: Batch size for both loaders.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        lr_step: Number of epochs between ``StepLR`` decays.
        lr_gamma: Multiplicative ``StepLR`` decay factor.
        epochs: Number of training epochs.
        val_root: Optional directory for a held-out validation split.  When
            omitted, the validation pass runs over ``data_root`` again.

    Raises:
        ValueError: If no training images are found under ``data_root``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Datasets and loaders
    train_ds = YoloTrafficDataset(data_root, img_size=img_size)
    if len(train_ds) == 0:
        # Fail with an actionable message rather than the sampler's
        # "num_samples=0" error.
        raise ValueError(
            f"No training images found under {os.path.join(data_root, 'images')!r}; "
            "data_root must be a directory containing images/ and labels/."
        )
    val_ds = YoloTrafficDataset(
        data_root if val_root is None else val_root, img_size=img_size
    )
    train_loader = DataLoader(
        train_ds, batch_size=batch_size, shuffle=True,
        collate_fn=collate_fn, num_workers=4
    )
    val_loader = DataLoader(
        val_ds, batch_size=batch_size, shuffle=False,
        collate_fn=collate_fn, num_workers=4
    )

    # Build and load DINO
    cfg = get_config()
    cfg.merge_from_file(config_file)
    # NOTE(review): criterion and postprocessors are never used below; the
    # loop expects the model itself to return its losses -- confirm this
    # matches the build_model API.
    model, criterion, postprocessors = build_model(cfg)
    model.to(device)
    ckpt = torch.load(pretrained_checkpoint, map_location="cpu")
    # Accept both {"model": state_dict} checkpoints and bare state dicts.
    state_dict = ckpt["model"] if isinstance(ckpt, dict) and "model" in ckpt else ckpt
    # strict=False tolerates heads whose shapes/names differ from the checkpoint.
    model.load_state_dict(state_dict, strict=False)

    # Optimizer and scheduler
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=lr, weight_decay=weight_decay
    )
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=lr_step, gamma=lr_gamma
    )

    os.makedirs(output_dir, exist_ok=True)
    for epoch in range(epochs):
        model.train()
        running_loss, num_batches = 0.0, 0
        for imgs, targets in train_loader:
            imgs = imgs.to(device)
            targ = [{k: v.to(device) for k, v in t.items()} for t in targets]
            outputs = model(imgs, targ)
            # The model is expected to return a dict whose "losses" entry
            # maps loss names to scalar tensors.
            loss = sum(outputs["losses"].values())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1
        scheduler.step()
        print(
            f"Epoch {epoch + 1}/{epochs}: "
            f"mean train loss = {running_loss / max(num_batches, 1):.4f}"
        )

        # Optional validation pass (forward only; outputs are not scored).
        model.eval()
        with torch.no_grad():
            for imgs, _ in val_loader:
                _ = model(imgs.to(device))

        torch.save(
            {"model": model.state_dict()},
            os.path.join(output_dir, f"checkpoint_{epoch}.pth")
        )
    print("Fine-tuning complete.")


In [None]:
# Fine-tune on dataset part 1; parts 2 and 3 were run the same way.
# data_root must be the dataset *directory* (the loader globs
# <data_root>/images/*.jpg), not the path of the data.yaml file inside it.
# NOTE(review): confirm images/ and labels/ sit directly under this directory.
train_dino(
    data_root="/kaggle/input/yolo-dataset/dataset_1/dataset_1",
    config_file="configs/DINO/dino_4scale_12ep_res50.yaml",
    pretrained_checkpoint="checkpoints/dino_res50_12ep.pth",
    output_dir="/kaggle/working/",
    img_size=800,
    batch_size=4,
    lr=2e-5,
    weight_decay=1e-4,
    lr_step=8,
    lr_gamma=0.1,
    epochs=15,
)
